In [5]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for folder, _, names_in_folder in os.walk('/kaggle/input'):
    for entry in names_in_folder:
        full_path = os.path.join(folder, entry)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/qwen2-vl/transformers/2b-instruct/1/model.safetensors.index.json
/kaggle/input/qwen2-vl/transformers/2b-instruct/1/config.json
/kaggle/input/qwen2-vl/transformers/2b-instruct/1/merges.txt
/kaggle/input/qwen2-vl/transformers/2b-instruct/1/model-00001-of-00002.safetensors
/kaggle/input/qwen2-vl/transformers/2b-instruct/1/LICENSE
/kaggle/input/qwen2-vl/transformers/2b-instruct/1/model-00002-of-00002.safetensors
/kaggle/input/qwen2-vl/transformers/2b-instruct/1/preprocessor_config.json
/kaggle/input/qwen2-vl/transformers/2b-instruct/1/README.md
/kaggle/input/qwen2-vl/transformers/2b-instruct/1/tokenizer.json
/kaggle/input/qwen2-vl/transformers/2b-instruct/1/vocab.json
/kaggle/input/qwen2-vl/transformers/2b-instruct/1/tokenizer_config.json
/kaggle/input/qwen2-vl/transformers/2b-instruct/1/chat_template.json
/kaggle/input/qwen2-vl/transformers/2b-instruct/1/.gitattributes
/kaggle/input/qwen2-vl/transformers/2b-instruct/1/generation_config.json
/kaggle/input/qwen2-vl/transformer

In [6]:
# Use the %pip magic rather than !pip so the packages are installed into the
# environment of the running kernel. The transformers build is pinned to a
# specific commit; the extras spec is quoted so the shell does not glob "[decord]".
%pip install -q git+https://github.com/huggingface/transformers@21fac7abba2a37fae86106f87fcf9974fd1e3830 accelerate
%pip install -q optimum auto-gptq "qwen-vl-utils[decord]"

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m48.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m433.6/433.6 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.5/23.5 MB[0m [31m86.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.7/38.7 MB[0m [31m47.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.6/13.6 MB[0m [31m97.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.2/13.2 MB[0m [3

In [8]:
# Explicit %pip magic: does not rely on IPython automagic being enabled.
%pip install gradio

Note: you may need to restart the kernel to use updated packages.


In [9]:
import gradio as gr
import torch
import cv2
from PIL import Image
from qwen_vl_utils import process_vision_info
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor

class VideoAnalyzer:
    """Frame-by-frame video analysis backed by a local Qwen2-VL checkpoint.

    The analyzer samples frames from a video with OpenCV, sends each sampled
    frame together with the user's prompt to the model, and either returns the
    per-frame answers or asks the model for one summary of all of them.
    """

    # Local checkpoint directory for every supported model size.
    MODEL_PATHS = {
        "2b": "/kaggle/input/qwen2-vl/transformers/2b-instruct/1",
        "7b": "/kaggle/input/qwen2-vl/transformers/7b-instruct/1",
        "72b": "/kaggle/input/qwen2-vl/transformers/72b-instruct/1"
    }

    # If any of these substrings occurs in the (lower-cased) prompt, the
    # per-frame results are returned instead of a single summary.
    TIMESTAMP_KEYWORDS = ('timestamp', 'when', 'time', 'moment', 'detect')

    def __init__(self, model_size="2b"):
        """Load the model and processor for the given size (2b, 7b, or 72b).

        Args:
            model_size: Key into ``MODEL_PATHS``.

        Raises:
            KeyError: If ``model_size`` is not a supported size.
        """
        # Validate before doing any expensive work so the error names the valid choices.
        if model_size not in self.MODEL_PATHS:
            raise KeyError(
                f"Unknown model_size {model_size!r}; "
                f"expected one of {sorted(self.MODEL_PATHS)}"
            )

        model_path = self.MODEL_PATHS[model_size]
        print(f"Loading model from: {model_path}")

        self.model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_path,
            torch_dtype="auto",
            device_map="auto"
        )
        self.processor = AutoProcessor.from_pretrained(model_path)

    def analyze_frame(self, frame, prompt, max_new_tokens=128):
        """Run the model on one frame plus a text prompt and return its answer.

        Args:
            frame: A ``PIL.Image.Image``, or any other object that is then
                converted with ``cv2.cvtColor(..., COLOR_BGR2RGB)`` — i.e. it
                is assumed to be a BGR array such as ``cv2.VideoCapture.read``
                returns.
            prompt: Text sent to the model alongside the image.
            max_new_tokens: Upper bound on generated tokens (default 128).

        Returns:
            The decoded text of the model's reply (first batch element).
        """
        if not isinstance(frame, Image.Image):
            frame = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "image": frame,
                    },
                    {"type": "text", "text": prompt},
                ],
            }
        ]

        text = self.processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = self.processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )
        # Follow the device the model was actually placed on (device_map="auto")
        # instead of guessing "cuda"/"cpu" independently of the model.
        inputs = inputs.to(self.model.device)

        generated_ids = self.model.generate(**inputs, max_new_tokens=max_new_tokens)
        # generate() returns prompt + completion; strip the prompt tokens so
        # only the newly generated part is decoded.
        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        output_text = self.processor.batch_decode(
            generated_ids_trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False
        )

        return output_text[0]

    def process_video(self, video_path, prompt, frame_interval=30):
        """Sample every ``frame_interval``-th frame of a video and analyze it.

        Frames are analyzed as they are read, so only the current frame and
        the first sampled frame (needed for the summary) are kept in memory.

        Args:
            video_path: Path handed to ``cv2.VideoCapture``.
            prompt: Question/instruction applied to each sampled frame.
            frame_interval: Analyze one frame out of this many (must be >= 1).

        Returns:
            A list of ``{'timestamp': ..., 'analysis': ...}`` dicts. If the
            prompt contains one of ``TIMESTAMP_KEYWORDS`` there is one entry
            per sampled frame with the timestamp in seconds; otherwise a
            single entry with timestamp ``'Full Video'`` holding a summary.
            An empty list is returned when no frame could be read.

        Raises:
            ValueError: If ``frame_interval`` is smaller than 1 or the video
                cannot be opened.
        """
        # Reject a zero/negative stride up front instead of failing with a
        # ZeroDivisionError in the middle of decoding.
        if frame_interval < 1:
            raise ValueError("frame_interval must be a positive integer")

        # Detect prompt type
        prompt_lower = prompt.lower()
        needs_timestamps = any(keyword in prompt_lower for keyword in self.TIMESTAMP_KEYWORDS)

        frame_analyses = []
        first_frame = None
        frame_count = 0

        cap = cv2.VideoCapture(video_path)
        try:
            if not cap.isOpened():
                raise ValueError("Error opening video file")

            print("Processing video frames...")
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    break

                if frame_count % frame_interval == 0:
                    timestamp = cap.get(cv2.CAP_PROP_POS_MSEC) / 1000.0
                    print(f"Captured frame at {timestamp:.2f}s")

                    if first_frame is None:
                        first_frame = frame

                    analysis = self.analyze_frame(frame, prompt)
                    frame_analyses.append({
                        'timestamp': timestamp,
                        'analysis': analysis
                    })
                    print(f"Processed frame at {timestamp:.2f}s")

                frame_count += 1
        finally:
            # Always free the capture handle, even if opening or inference fails.
            cap.release()

        # Generate final output based on prompt type
        if needs_timestamps:
            # Return individual frame analyses for timestamp-specific queries
            return frame_analyses

        if first_frame is None:
            # Nothing was read from the video, so there is nothing to summarize.
            return []

        # Aggregate analysis for general summaries
        all_observations = [entry['analysis'] for entry in frame_analyses]
        summary_prompt = f"Based on these observations from different timestamps of the video, provide a comprehensive summary: {all_observations}"

        # Use the first frame as a reference frame for the summary
        final_summary = self.analyze_frame(first_frame, summary_prompt)
        return [{
            'timestamp': 'Full Video',
            'analysis': final_summary
        }]

def create_video_analysis_interface():
    """Create the Gradio interface for video analysis.

    Returns:
        A ``gr.Interface`` with a video upload, a prompt textbox and a
        model-size dropdown as inputs, and one textbox as output.
    """
    # Holds the most recently loaded analyzer so repeated requests with the
    # same model size do not reload the checkpoint from disk each time.
    cache = {"size": None, "analyzer": None}

    # Function to manage model initialization
    def initialize_analyzer(model_size):
        """Return an analyzer for ``model_size``, reusing the cached one if it matches."""
        if cache["analyzer"] is None or cache["size"] != model_size:
            import gc

            # Drop the previous model *before* loading another one so two
            # checkpoints never have to fit in memory at the same time.
            cache["analyzer"] = None
            cache["size"] = None
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

            cache["analyzer"] = VideoAnalyzer(model_size=model_size)
            cache["size"] = model_size
        return cache["analyzer"]

    # Function to process video or webcam input
    def analyze_video(video, prompt, model_size):
        """Run the analysis and format the results as display text.

        Any exception is reported as text in the output box rather than
        raised, so the UI stays usable.
        """
        # Without an upload there is nothing to open; say so directly instead
        # of surfacing an opaque video-decoding error.
        if not video:
            return "Please upload a video before running the analysis."

        try:
            # Initialize (or reuse) the analyzer for the selected model
            analyzer = initialize_analyzer(model_size)

            # Process the video
            results = analyzer.process_video(video, prompt)

            if not results:
                return "No frames could be read from the video."

            # Format results for display
            parts = []
            for result in results:
                if result['timestamp'] == 'Full Video':
                    parts.append("Complete Video Analysis:\n")
                else:
                    parts.append(f"Timestamp: {result['timestamp']:.2f}s\n")
                parts.append(f"Analysis: {result['analysis']}\n")
                parts.append("-" * 50 + "\n")

            return "".join(parts)

        except Exception as e:
            return f"An error occurred: {str(e)}"

    # Create Gradio interface with multiple inputs
    interface = gr.Interface(
        fn=analyze_video,
        inputs=[
            gr.Video(label="Upload Video"),
            gr.Textbox(label="Analysis Prompt"),
            gr.Dropdown(
                choices=["2b", "7b", "72b"],
                value="2b",
                label="Select Model Size"
            )
        ],
        outputs=gr.Textbox(label="Analysis Results"),
        title="Video Analysis with Qwen2-VL",
        description="Upload a video, choose a model, and provide a specific prompt for analysis!",
        examples=[
            [None, "Describe what's happening in this video", "2b"],
            [None, "Count the number of people in the scene", "7b"],
            [None, "Detect when a person enters or leaves the frame", "72b"]
        ]
    )

    return interface

# Launch the interface
def main():
    """Report GPU status, then build the Gradio app and start serving it."""
    # Report whether CUDA is usable and, if so, which device is at index 0.
    has_gpu = torch.cuda.is_available()
    print("GPU available:", has_gpu)
    if has_gpu:
        print("GPU name:", torch.cuda.get_device_name(0))

    # Build the UI and launch it with a public share link and PWA support.
    create_video_analysis_interface().launch(share=True, pwa=True)


if __name__ == "__main__":
    main()

GPU available: True
GPU name: Tesla P100-PCIE-16GB
* Running on local URL:  http://127.0.0.1:7860
* Running on public URL: https://774abf0c5bb2592588.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
