In [1]:
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import requests
import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
import cv2
import os
import numpy as np

def extract_frames(video_path, output_folder, num_frames):
    """Save up to ``num_frames`` evenly spaced frames of a video as JPEG files.

    Frames are written to ``output_folder`` as ``frame_0000.jpg``,
    ``frame_0001.jpg``, ... in playback order. Problems (unopenable video,
    no frames, unreadable or unwritable frame) are reported with ``print``
    rather than raised, and the function always returns ``None``.

    Args:
        video_path: Path of the video file handed to ``cv2.VideoCapture``.
        output_folder: Directory for the JPEG files; created if missing.
        num_frames: Desired number of frames. When the video has fewer
            frames than this, every distinct frame is saved once instead of
            writing duplicates of the same frame.
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    # Open the video file
    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            print("Error: Could not open video.")
            return

        # Get total frame count; treat a non-positive count as "no frames"
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        if total_frames <= 0:
            print("Error: No frames found in video.")
            return

        # Evenly spaced indices over the whole clip. Truncating to int can
        # repeat an index when num_frames > total_frames, so drop duplicates
        # (np.unique also keeps the indices in ascending order).
        frame_indices = np.unique(
            np.linspace(0, total_frames - 1, num_frames, dtype=int)
        )

        count = 0
        for i in frame_indices:
            # Cast to a plain int: the index is a NumPy integer scalar
            cap.set(cv2.CAP_PROP_POS_FRAMES, int(i))
            ret, frame = cap.read()
            if not ret:
                print(f"Warning: Could not read frame {i}")
                continue

            frame_filename = os.path.join(output_folder, f"frame_{count:04d}.jpg")
            # imwrite signals failure through its return value, not an exception
            if not cv2.imwrite(frame_filename, frame):
                print(f"Warning: Could not write {frame_filename}")
                continue
            count += 1
    finally:
        # Release the capture on every exit path, including early returns
        # and exceptions raised while seeking or writing.
        cap.release()

    print(f"Successfully extracted {count} frames and saved to {output_folder}")

# Example run: sample a fixed number of frames from a local clip
num_frames = 10  # how many frames to pull from the clip
output_folder = r"VideoCaptioningFrames"
video_path = r"C:\Users\vamsh\Desktop\People Walking.mp4"

extract_frames(
    video_path=video_path,
    output_folder=output_folder,
    num_frames=num_frames,
)


Successfully extracted 10 frames and saved to VideoCaptioningFrames


In [5]:
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import os
import warnings

warnings.filterwarnings('ignore')

# Load the processor and model.
# from_pretrained caches the checkpoint locally after the first download, so
# re-running this cell reuses the cached files instead of fetching the whole
# checkpoint again on every run (which also keeps offline re-runs working).
BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-base"
processor = BlipProcessor.from_pretrained(BLIP_CHECKPOINT)
model = BlipForConditionalGeneration.from_pretrained(BLIP_CHECKPOINT)

def generate_captions(image_folder, output_folder):
    """Caption every JPEG in ``image_folder`` and save one text file per image.

    Uses the module-level ``processor`` and ``model`` objects. For an image
    ``name.jpg`` the caption is written to ``output_folder/name.txt`` (UTF-8)
    and also echoed with ``print``. Images are processed in sorted filename
    order so the output is reproducible across runs and platforms.

    Args:
        image_folder: Directory scanned (non-recursively) for files ending in
            ``.jpg`` or ``.jpeg``, in any letter case.
        output_folder: Directory for the caption files; created if missing.
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    # os.listdir returns entries in arbitrary order, so sort for a stable,
    # frame-ordered run; match the extension case-insensitively.
    image_files = sorted(
        f for f in os.listdir(image_folder)
        if f.lower().endswith((".jpg", ".jpeg"))
    )

    for image_file in image_files:
        image_path = os.path.join(image_folder, image_file)

        # Close the file handle promptly; convert() returns an independent
        # in-memory copy, and normalises grayscale/CMYK JPEGs to RGB.
        with Image.open(image_path) as img:
            image = img.convert("RGB")

        # Preprocess the image
        inputs = processor(images=image, return_tensors="pt")

        # Generate a caption for the image
        out = model.generate(**inputs)
        caption = processor.decode(out[0], skip_special_tokens=True)

        # Save the caption with an explicit encoding so the result does not
        # depend on the platform's default code page
        caption_filename = os.path.join(output_folder, f"{os.path.splitext(image_file)[0]}.txt")
        with open(caption_filename, "w", encoding="utf-8") as f:
            f.write(caption)

        print(f"Caption generated for {image_file}: {caption}")

# Example run: caption the extracted frames and store the text files separately
output_folder = "VideoCaptions"
image_folder = r"VideoCaptioningFrames"

generate_captions(image_folder=image_folder, output_folder=output_folder)


Caption generated for frame_0000.jpg: a group of people walking around a large white floor
Caption generated for frame_0001.jpg: a group of people walking around a large white floor
Caption generated for frame_0002.jpg: a group of people walking around a large white floor
Caption generated for frame_0003.jpg: a group of people walking around a large white floor
Caption generated for frame_0004.jpg: a group of people walking around a large white building
Caption generated for frame_0005.jpg: a group of people walking around a large white building
Caption generated for frame_0006.jpg: a group of people walking around a large white building
Caption generated for frame_0007.jpg: people walking around a large white floor
Caption generated for frame_0008.jpg: a group of people walking around a large white building
Caption generated for frame_0009.jpg: a group of people walking around a large white floor
