In [1]:
import cv2
import os
import numpy as np

from transformers import BlipProcessor, BlipForConditionalGeneration
# from fairseq.models.ofa import OFATokenizer, OFAModel
from PIL import Image
import os
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [18]:
# Base path for the project
base_path = '/Users/adwaitmahadar/Disk-1/MSCS/Fall-24/CSCI-544/Project/NLPVideoDescription/Videos/'
# Change the video number and name dynamically
video_number = 'Video-04' 
video_name = 'Football_ USC 42, Rutgers 20 - Highlights (10_25_24).mp4'  # Set the video name

In [19]:
# Function to extract frames from a video
def extract_frames(video_path, output_folder, interval=1):
    # Create output folder if it doesn't exist
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # Open the video file
    cap = cv2.VideoCapture(video_path)
    
    # Check if the video opened successfully
    if not cap.isOpened():
        print("Error opening video file")
        return

    frame_count = 0
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        
        # Save the frame at the specified interval
        if frame_count % interval == 0:
            frame_filename = os.path.join(output_folder, f"frame_{frame_count}.jpg")
            cv2.imwrite(frame_filename, frame)
        
        frame_count += 1

    # Release the video capture object
    cap.release()
    print(f"Extracted {frame_count} frames.")


In [20]:
# video_path = '/Users/adwaitmahadar/Disk-1/MSCS/Fall-24/CSCI-544/Project/Videos/Video-02/First Flying Lesson _ Harry Potter and the Sorcerer s Stone.mp4'  # Replace with your video path
video_path = f'{base_path}/{video_number}/{video_name}'
# output_folder = '/Users/adwaitmahadar/Disk-1/MSCS/Fall-24/CSCI-544/Project/Videos/Video-02/Frames'  # Folder to save frames
output_folder = f'{base_path}/{video_number}/Frames'
interval = 10  # Save every 10th frame
extract_frames(video_path, output_folder, interval)

Extracted 5373 frames.


In [21]:
# Function to extract keyframes from the extracted frames with frame sampling
def extract_keyframes(frames_folder, keyframes_output_folder, threshold=100, sampling_interval=5):
    # Create output folder if it doesn't exist
    if not os.path.exists(keyframes_output_folder):
        os.makedirs(keyframes_output_folder)

    # List all frames in the frames folder
    frame_filenames = sorted(os.listdir(frames_folder), key=lambda x: int(x.split('_')[1].split('.')[0]))

    prev_frame = None
    keyframe_count = 0

    for idx, frame_filename in enumerate(frame_filenames):
        # Process only every nth frame
        if idx % sampling_interval != 0:
            continue

        frame_path = os.path.join(frames_folder, frame_filename)
        curr_frame = cv2.imread(frame_path)

        if prev_frame is None:
            prev_frame = curr_frame
            continue

        # Calculate the absolute difference between the current and previous frame
        frame_diff = cv2.absdiff(prev_frame, curr_frame)
        gray_diff = cv2.cvtColor(frame_diff, cv2.COLOR_BGR2GRAY)

        # Threshold the difference to get a binary image
        _, thresh = cv2.threshold(gray_diff, 30, 255, cv2.THRESH_BINARY)

        # Count the number of non-zero pixels in the thresholded image
        non_zero_count = cv2.countNonZero(thresh)

        # If the number of different pixels exceeds the threshold, save the frame as a keyframe
        if non_zero_count > threshold:
            keyframe_filename = os.path.join(keyframes_output_folder, f"keyframe_{keyframe_count}.jpg")
            cv2.imwrite(keyframe_filename, curr_frame)
            keyframe_count += 1

        prev_frame = curr_frame

    print(f"Extracted {keyframe_count} keyframes.")

In [22]:
# Difference method to extract keyframes

# # frames_folder = '/Users/adwaitmahadar/Disk-1/MSCS/Fall-24/CSCI-544/Project/Videos/Video-02/Frames'  # Folder with extracted frames
# frames_folder = f'{base_path}/{video_number}/Frames'  # Folder with extracted frames
# # keyframes_output_folder = '/Users/adwaitmahadar/Disk-1/MSCS/Fall-24/CSCI-544/Project/Videos/Video-02/Key-frames'  # Folder to save keyframes
# keyframes_output_folder = f'{base_path}/{video_number}/Key-frames'
# extract_keyframes(frames_folder, keyframes_output_folder, threshold=100, sampling_interval=5)

In [23]:
# Function to extract keyframes using optical flow
def extract_keyframes_optical_flow(frames_folder, keyframes_output_folder, flow_threshold=0.5, sampling_interval=5):
    # Create output folder if it doesn't exist
    if not os.path.exists(keyframes_output_folder):
        os.makedirs(keyframes_output_folder)

    # List all frames in the frames folder
    frame_filenames = sorted(os.listdir(frames_folder), key=lambda x: int(x.split('_')[1].split('.')[0]))

    prev_frame_gray = None
    keyframe_count = 0

    for idx, frame_filename in enumerate(frame_filenames):
        # Process only every nth frame
        if idx % sampling_interval != 0:
            continue

        frame_path = os.path.join(frames_folder, frame_filename)
        curr_frame = cv2.imread(frame_path)

        # Convert the current frame to grayscale for optical flow calculation
        curr_frame_gray = cv2.cvtColor(curr_frame, cv2.COLOR_BGR2GRAY)

        # If this is the first frame, skip optical flow calculation
        if prev_frame_gray is None:
            prev_frame_gray = curr_frame_gray
            continue

        # Calculate dense optical flow using the Farneback method
        flow = cv2.calcOpticalFlowFarneback(prev_frame_gray, curr_frame_gray, None, 
                                            0.5, 3, 15, 3, 5, 1.2, 0)

        # Compute the magnitude and angle of 2D vectors
        magnitude, _ = cv2.cartToPolar(flow[..., 0], flow[..., 1])

        # Calculate the mean motion magnitude
        mean_magnitude = np.mean(magnitude)

        # If the mean motion exceeds the threshold, save the frame as a keyframe
        if mean_magnitude > flow_threshold:
            keyframe_filename = os.path.join(keyframes_output_folder, f"keyframe_{keyframe_count}.jpg")
            cv2.imwrite(keyframe_filename, curr_frame)
            keyframe_count += 1

        prev_frame_gray = curr_frame_gray

    print(f"Extracted {keyframe_count} keyframes using optical flow.")



In [24]:
# frames_folder = '/Users/adwaitmahadar/Disk-1/MSCS/Fall-24/CSCI-544/Project/Videos/Video-02/Frames'  # Folder with extracted frames
frames_folder = f'{base_path}/{video_number}/Frames'
# keyframes_output_folder = '/Users/adwaitmahadar/Disk-1/MSCS/Fall-24/CSCI-544/Project/Videos/Video-02/Key-frames'  # Folder to save keyframes
keyframes_output_folder = f'{base_path}/{video_number}/Key-frames'
extract_keyframes_optical_flow(frames_folder, keyframes_output_folder, flow_threshold=0.5, sampling_interval=5)

Extracted 107 keyframes using optical flow.


In [26]:
# Initialize BLIP model and processor
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

# Function to generate descriptions for all keyframes
def generate_keyframe_descriptions(keyframes_folder, descriptions_output_file):
    # List all keyframes in the keyframes folder
    keyframe_filenames = sorted(os.listdir(keyframes_folder), key=lambda x: int(x.split('_')[1].split('.')[0]))

    descriptions = {}  # To store keyframe descriptions

    for keyframe_filename in keyframe_filenames:
        keyframe_path = os.path.join(keyframes_folder, keyframe_filename)
        image = Image.open(keyframe_path)

        # Preprocess and generate caption for the current keyframe
        inputs = processor(image, return_tensors="pt")
        out = model.generate(**inputs)

        # Decode the tokenized output to get the caption
        caption = processor.decode(out[0], skip_special_tokens=True)
        descriptions[keyframe_filename] = caption
        print(f"Generated Caption for {keyframe_filename}: {caption}")

    # Save descriptions to a file for later use
    with open(descriptions_output_file, 'w') as f:
        for keyframe_filename, caption in descriptions.items():
            f.write(f"{keyframe_filename}: {caption}\n")

    print(f"Descriptions for {len(keyframe_filenames)} keyframes saved to {descriptions_output_file}")

In [27]:
# Paths for keyframes and output descriptions file
# keyframes_folder = '/Users/adwaitmahadar/Disk-1/MSCS/Fall-24/CSCI-544/Project/Videos/Video-02/Key-frames'  # Folder where keyframes are saved
keyframes_folder = f'{base_path}/{video_number}/Key-frames'
# descriptions_output_file = '/Users/adwaitmahadar/Disk-1/MSCS/Fall-24/CSCI-544/Project/Videos/Video-02/Descriptions/descriptions_blip.txt'  # File to save the descriptions
descriptions_output_file = f'{base_path}/{video_number}/Descriptions/descriptions_blip.txt'

# Generate descriptions for all keyframes
generate_keyframe_descriptions(keyframes_folder, descriptions_output_file)

Generated Caption for keyframe_0.jpg: a football game with a player running for the ball
Generated Caption for keyframe_1.jpg: a football game is being played on the field
Generated Caption for keyframe_2.jpg: a football player is running with the ball
Generated Caption for keyframe_3.jpg: a football game with a player running for the ball
Generated Caption for keyframe_4.jpg: a football player is running for the ball
Generated Caption for keyframe_5.jpg: a football player is being restraineded by a referee
Generated Caption for keyframe_6.jpg: a football player is being restraineded by a referee
Generated Caption for keyframe_7.jpg: a football game is being played on the field
Generated Caption for keyframe_8.jpg: a football game is being played on the field
Generated Caption for keyframe_9.jpg: a football game is being played on the field
Generated Caption for keyframe_10.jpg: a football game is being played on the field
Generated Caption for keyframe_11.jpg: a football game with the

In [None]:
# Initialize BLIP-2 model and processor
# processor = BlipProcessor.from_pretrained("Salesforce/blip2-image-captioning")
# model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip2-image-captioning")

# # Function to generate descriptions for all keyframes using BLIP-2
# def generate_keyframe_descriptions_blip2(keyframes_folder, descriptions_output_file):
#     # List all keyframes in the keyframes folder
#     keyframe_filenames = sorted(os.listdir(keyframes_folder), key=lambda x: int(x.split('_')[1].split('.')[0]))

#     descriptions = {}  # To store keyframe descriptions

#     for keyframe_filename in keyframe_filenames:
#         keyframe_path = os.path.join(keyframes_folder, keyframe_filename)
#         image = Image.open(keyframe_path)

#         # Preprocess and generate caption for the current keyframe using BLIP-2
#         inputs = processor(images=image, return_tensors="pt")
#         out = model.generate(**inputs)

#         # Decode the tokenized output to get the caption
#         caption = processor.decode(out[0], skip_special_tokens=True)
#         descriptions[keyframe_filename] = caption
#         print(f"Generated Caption for {keyframe_filename}: {caption}")

#     # Save descriptions to a file for later use
#     with open(descriptions_output_file, 'w') as f:
#         for keyframe_filename, caption in descriptions.items():
#             f.write(f"{keyframe_filename}: {caption}\n")

#     print(f"Descriptions for {len(keyframe_filenames)} keyframes saved to {descriptions_output_file}")

In [None]:
# # Paths for keyframes and output descriptions file
# # keyframes_folder = '/Users/adwaitmahadar/Disk-1/MSCS/Fall-24/CSCI-544/Project/Videos/Video-02/Key-frames'  # Folder where keyframes are saved
# keyframes_folder = f'{base_path}/{video_number}/Key-frames'
# # descriptions_output_file = '/Users/adwaitmahadar/Disk-1/MSCS/Fall-24/CSCI-544/Project/Videos/Video-02/Descriptions/descriptions_blip2.txt'  # File to save the descriptions
# descriptions_output_file = f'{base_path}/{video_number}/Descriptions/descriptions_blip2.txt'

# # Generate descriptions for all keyframes using BLIP-2
# generate_keyframe_descriptions_blip2(keyframes_folder, descriptions_output_file)

In [None]:
# # Initialize OFA model and tokenizer
# tokenizer = OFATokenizer.from_pretrained("OFA-Sys/OFA-base")
# model = OFAModel.from_pretrained("OFA-Sys/OFA-base")
# model.eval()  # Set model to evaluation mode

In [None]:
# # Function to preprocess images and tokenize inputs for OFA model
# def preprocess_image(image):
#     image = image.convert("RGB")
#     return image

In [None]:
# # Function to generate descriptions for all keyframes using OFA
# def generate_keyframe_descriptions_ofa(keyframes_folder, descriptions_output_file):
#     # List all keyframes in the keyframes folder
#     keyframe_filenames = sorted(os.listdir(keyframes_folder), key=lambda x: int(x.split('_')[1].split('.')[0]))

#     descriptions = {}  # To store keyframe descriptions

#     for keyframe_filename in keyframe_filenames:
#         keyframe_path = os.path.join(keyframes_folder, keyframe_filename)
#         image = Image.open(keyframe_path)

#         # Preprocess the image for OFA
#         image = preprocess_image(image)

#         # Tokenize the input
#         inputs = tokenizer(image, return_tensors="pt", padding=True)

#         # Generate the caption
#         with torch.no_grad():
#             outputs = model.generate(inputs["input_ids"])

#         # Decode the tokenized output to get the caption
#         caption = tokenizer.decode(outputs[0], skip_special_tokens=True)
#         descriptions[keyframe_filename] = caption
#         print(f"Generated Caption for {keyframe_filename}: {caption}")

#     # Save descriptions to a file for later use
#     with open(descriptions_output_file, 'w') as f:
#         for keyframe_filename, caption in descriptions.items():
#             f.write(f"{keyframe_filename}: {caption}\n")

#     print(f"Descriptions for {len(keyframe_filenames)} keyframes saved to {descriptions_output_file}")