<a href="https://colab.research.google.com/github/SAIROHITH-16/Video-Caption-Generator-using-DL/blob/main/Deep_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
c!pip install transformers accelerate torch torchvision pillow nltk tqdm matplotlib opencv-python -q

In [None]:
import cv2
import torch
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
from torchvision import transforms
from PIL import Image
import nltk
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu, SmoothingFunction
import pandas as pd

# Fetch the NLTK 'punkt' resource; quiet=True is passed to silence the download output.
# NOTE(review): the visible cells tokenize captions with str.split() and never call an
# NLTK tokenizer, so this download may be unnecessary — confirm before removing.
nltk.download('punkt', quiet=True)

In [None]:
def extract_frames(video_path, frame_skip=20):
    """Sample every ``frame_skip``-th frame of a video and return them as RGB.

    Args:
        video_path: Path of the video file handed to ``cv2.VideoCapture``.
        frame_skip: Sampling stride. A frame is kept when its zero-based
            index is a multiple of this value. Must be greater than zero.

    Returns:
        list: The kept frames in playback order, each converted with
        ``cv2.COLOR_BGR2RGB``. The list is empty when the capture cannot be
        opened or yields no frames.

    Raises:
        ValueError: If ``frame_skip`` is zero or negative.
    """
    # Reject a non-positive stride up front instead of failing later with a
    # ZeroDivisionError in the modulo test below.
    if frame_skip <= 0:
        raise ValueError(f"frame_skip must be a positive number, got {frame_skip!r}")

    cap = cv2.VideoCapture(video_path)
    frames = []
    try:
        frame_count = 0
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                # No more frames could be read: stop.
                break
            if frame_count % frame_skip == 0:
                frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            frame_count += 1
    finally:
        # Always free the capture handle, even if reading or conversion raised.
        cap.release()
    print(f"âœ… Extracted {len(frames)} frames")
    return frames

# Colab-only upload helper; the keys of the mapping it returns are used as file names.
from google.colab import files

uploaded = files.upload()
if not uploaded:
    # Nothing was selected: fail with a clear message rather than an IndexError below.
    raise RuntimeError("No file was uploaded; re-run this cell and select a video.")

# Only the first uploaded file is processed.
video_path = next(iter(uploaded))
frames = extract_frames(video_path, frame_skip=15)

In [None]:
# Torchvision pipeline applied to each PIL frame: resize to 224x224, convert to
# a tensor, then normalize each channel with the mean/std given below.
# NOTE(review): these values look like the usual ImageNet channel statistics — confirm.
_pipeline_steps = [
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
]
preprocess = transforms.Compose(_pipeline_steps)

def preprocess_frames(frames):
    """Run every frame through the module-level ``preprocess`` pipeline.

    Args:
        frames: Iterable of image arrays accepted by ``Image.fromarray``.

    Returns:
        list: One ``preprocess`` output per input frame, in input order.
    """
    return [preprocess(Image.fromarray(rgb_array)) for rgb_array in frames]

# Run every sampled frame through preprocess_frames and report how many results came back.
# NOTE(review): the captioning loop in the later visible cell passes the raw `frames` to
# `feature_extractor` rather than these tensors — confirm whether `preprocessed_frames`
# is still needed.
preprocessed_frames = preprocess_frames(frames)
print(f"âœ… Preprocessing complete! {len(preprocessed_frames)} frames ready.")


In [None]:
# One checkpoint id shared by the model, the image processor and the tokenizer.
checkpoint_name = "nlpconnect/vit-gpt2-image-captioning"

# Use the GPU when PyTorch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

model = VisionEncoderDecoderModel.from_pretrained(checkpoint_name)
model = model.to(device)
feature_extractor = ViTImageProcessor.from_pretrained(checkpoint_name)
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
print("âœ… Model loaded successfully.")

In [None]:
# Show at most six frames: the 2x3 subplot grid below has exactly six slots.
num_display = min(6, len(frames))
plt.figure(figsize=(15, 6))
for i in range(num_display):
    plt.subplot(2, 3, i+1)
    plt.imshow(frames[i])
    plt.axis("off")
    plt.title(f"Frame {i+1}")
plt.suptitle("Extracted Frames from the Video", fontsize=14)
# Adjust subplot spacing before rendering, as the captioned-frames figure also does
# (this replaces a stray bare `plt` expression that had no effect).
plt.tight_layout()
plt.show()

In [None]:
# Caption the same frames that were previewed (the first `num_display`), one at a time.
captions = []
for rgb_frame in tqdm(frames[:num_display]):
    # Turn the frame into model inputs and move them to the model's device.
    encoded = feature_extractor(images=rgb_frame, return_tensors="pt")
    model_input = encoded.pixel_values.to(device)
    # Generate with 4 beams and max_length=20.
    generated = model.generate(model_input, max_length=20, num_beams=4)
    # Decode the first returned sequence, dropping special tokens.
    captions.append(tokenizer.decode(generated[0], skip_special_tokens=True))

print("âœ… Caption generation complete!")

In [None]:
# Re-draw the previewed frames in a 2x3 grid, using each generated caption as the panel title.
plt.figure(figsize=(15, 8))
for slot in range(num_display):
    panel = plt.subplot(2, 3, slot + 1)
    panel.imshow(frames[slot])
    panel.axis("off")
    # wrap=True lets long captions wrap instead of running past the panel.
    panel.set_title(captions[slot], fontsize=10, wrap=True)
plt.suptitle("Frames with Generated Captions", fontsize=14)
plt.tight_layout()
plt.show()

In [None]:
# Whitespace-tokenize each generated caption; these token lists are the BLEU hypotheses.
hypotheses = [text.split() for text in captions]
# NOTE(review): the references are built from the generated captions themselves, so
# every caption is scored against its own tokens. The resulting BLEU values do not
# measure agreement with independent ground-truth captions — supply real references
# if a meaningful score is needed.
references = [[list(tokens)] for tokens in hypotheses]

smoothing = SmoothingFunction().method1

# Per-frame sentence-level BLEU, then one corpus-level score over all frames.
bleu_scores = []
for ref_group, hyp_tokens in zip(references, hypotheses):
    bleu_scores.append(
        sentence_bleu(ref_group, hyp_tokens, smoothing_function=smoothing)
    )
avg_bleu = corpus_bleu(references, hypotheses, smoothing_function=smoothing)

# One row per captioned frame, numbered from 1.
performance_matrix = pd.DataFrame(
    {
        "Frame No": list(range(1, len(captions) + 1)),
        "Generated Caption": captions,
        "BLEU Score": bleu_scores,
    }
)

print("=== ðŸ§¾ PERFORMANCE MATRIX ===")
print(performance_matrix)
print("\nAverage BLEU Score:", round(avg_bleu, 3))

In [None]:
# Lower-case all captions and collect their words (surrounding punctuation stripped)
# so that keywords are compared against whole words.
joined = " ".join(captions).lower()
caption_words = {word.strip(".,!?;:\"'()") for word in joined.split()}

keywords = ["people", "car", "nature", "food", "animal", "sports", "technology"]
# Match a keyword or its simple "-s" plural as a whole word. A plain substring test
# would also fire on unrelated words, e.g. "car" inside "carrying" or "card".
topic = [k for k in keywords if k in caption_words or f"{k}s" in caption_words]
# The first matching keyword (in `keywords` order) wins; otherwise use a generic label.
summary = topic[0].capitalize() if topic else "General activity"
print("\nðŸŽ¬ Final Video Topic Detected:", summary)