# Qwen2-VL Video Understanding Pipeline (Cleaned)
This notebook demonstrates how to set up and use Qwen2-VL for video understanding on Colab. It includes mounting Google Drive, installing dependencies, extracting video frames, and querying the model.

In [None]:
# Attach Google Drive to the Colab filesystem so the video files are reachable
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
# Install necessary packages
!pip install git+https://github.com/huggingface/transformers@21fac7abba2a37fae86106f87fcf9974fd1e3830 accelerate
!apt-get update && apt-get install -y ffmpeg

In [None]:
# Import required libraries and initialize the Qwen2-VL model
import os
from transformers import AutoTokenizer, AutoProcessor, Qwen2VLForConditionalGeneration

# Initialize tokenizer, processor, and model.
# NOTE(review): the previous id 'Qwen/Qwen2-VL-Chat-Base' does not match any
# published Qwen2-VL checkpoint name (they follow 'Qwen2-VL-<size>[-Instruct]').
# The 2B instruct variant is used here as the smallest chat-tuned option;
# switch to 'Qwen/Qwen2-VL-7B-Instruct' if the GPU has enough memory.
model_name = 'Qwen/Qwen2-VL-2B-Instruct'
tokenizer = AutoTokenizer.from_pretrained(model_name)
processor = AutoProcessor.from_pretrained(model_name)
# Cast to fp16 and move to the GPU; every later cell assumes the model lives on 'cuda'.
model = Qwen2VLForConditionalGeneration.from_pretrained(model_name).half().cuda()


In [None]:
# Define video and frame output paths
import subprocess

video_path = '/content/drive/MyDrive/CS231_project/pumunk_videos/594_1748124142.mp4'
frames_dir = '/content/drive/MyDrive/CS231_project/pumunk_videos/frames'

# Create output directory if it doesn't exist
os.makedirs(frames_dir, exist_ok=True)

# Extract frames using ffmpeg.
# Arguments are passed as a list (no shell), so paths containing spaces or
# shell metacharacters are handled correctly, and check=True raises
# CalledProcessError instead of silently continuing when extraction fails.
subprocess.run(
    ['ffmpeg', '-i', video_path, '-q:v', '2', f"{frames_dir}/%05d.jpg"],
    check=True,
)

In [None]:
# Utility function to sample frames for model input
from math import ceil

def get_frame_list(output_path, fraction=0.0125):
    """
    Return a uniformly spaced sample of the frame files in a directory.

    Args:
        output_path: Directory containing the extracted ``.jpg`` frames.
        fraction: Proportion of the total frames to return. The resulting
            frame count is ``ceil(total * fraction)``, clamped so that at
            least one frame and at most every frame is returned.

    Returns:
        A list of frame file paths (``output_path`` joined with the file
        name), in sorted file-name order. Empty if the directory contains
        no ``.jpg`` files.
    """
    all_frames = sorted(f for f in os.listdir(output_path) if f.endswith('.jpg'))
    total = len(all_frames)
    if total == 0:
        return []

    # `fraction` is the share of frames to KEEP, so it determines the sample
    # size. (Using ceil(total * fraction) as the slice step instead would
    # return roughly 1/fraction frames regardless of the video length.)
    count = min(total, max(1, ceil(total * fraction)))

    # Spread `count` indices evenly across [0, total), always starting at 0.
    indices = [i * total // count for i in range(count)]
    return [os.path.join(output_path, all_frames[i]) for i in indices]

In [None]:
# Function to query the video model
def query_video(prompt, use_frames=True, frames_path=None, video_path=None, max_new_tokens=256):
    """
    Send a prompt to the Qwen2-VL model, using either sampled frames or full video.

    Args:
        prompt: The user question / instruction as plain text.
        use_frames: If True, sample frames from ``frames_path`` via
            ``get_frame_list`` and pass them as images; otherwise pass
            ``video_path`` as the video input.
        frames_path: Directory of extracted ``.jpg`` frames (required when
            ``use_frames`` is True).
        video_path: Path to the video (required when ``use_frames`` is False).
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        A list of decoded strings (one per batch element) containing only
        the newly generated text, not the prompt.

    Raises:
        ValueError: If the path required by the selected mode is missing, or
            if no frames are found in ``frames_path``.
    """
    # Local import: helper from the already-installed transformers package
    # that opens an image file as a PIL image.
    from transformers.image_utils import load_image

    if use_frames:
        if frames_path is None:
            raise ValueError("frames_path is required when use_frames=True")
        frame_files = get_frame_list(frames_path)
        if not frame_files:
            raise ValueError(f"No .jpg frames found in {frames_path}")
        # The processor needs decoded images, not file paths, and one
        # placeholder entry per image in the chat message.
        visual_content = [{'type': 'image'} for _ in frame_files]
        visual_kwargs = {'images': [load_image(path) for path in frame_files]}
    else:
        if video_path is None:
            raise ValueError("video_path is required when use_frames=False")
        visual_content = [{'type': 'video'}]
        # NOTE(review): whether the processor accepts a raw file path for
        # `videos` depends on the installed transformers version; older
        # versions expect already-decoded frames. Confirm against the pinned
        # commit, or prefer use_frames=True.
        visual_kwargs = {'videos': video_path}

    # The chat template inserts the vision placeholder tokens; without them
    # the visual inputs are not tied to any position in the text sequence.
    messages = [{
        'role': 'user',
        'content': visual_content + [{'type': 'text', 'text': prompt}],
    }]
    chat_text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = processor(
        text=[chat_text], padding=True, return_tensors='pt', **visual_kwargs
    ).to('cuda')

    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    # generate() returns prompt + continuation; keep only the continuation.
    new_tokens = outputs[:, inputs['input_ids'].shape[1]:]
    return tokenizer.batch_decode(new_tokens, skip_special_tokens=True)


In [None]:
# Example 1: ask for a detailed description, feeding the model sampled frames
description = query_video(
    'Describe the video in detail.', use_frames=True, frames_path=frames_dir
)
print(description)

# Example 2: ask for the red dot's velocity, feeding the model the video file itself
velocity = query_video(
    'Calculate the velocity of the red dot.', use_frames=False, video_path=video_path
)
print(velocity)