In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/mydata/Retail Store Outlet.mp4
/kaggle/input/mydata/Two Wheeler Safety.mp4
/kaggle/input/mydata/Fire Outbreak.mp4
/kaggle/input/mydata/Garbage.mp4
/kaggle/input/qwen2-vl/transformers/7b-instruct/1/model.safetensors.index.json
/kaggle/input/qwen2-vl/transformers/7b-instruct/1/config.json
/kaggle/input/qwen2-vl/transformers/7b-instruct/1/merges.txt
/kaggle/input/qwen2-vl/transformers/7b-instruct/1/LICENSE
/kaggle/input/qwen2-vl/transformers/7b-instruct/1/preprocessor_config.json
/kaggle/input/qwen2-vl/transformers/7b-instruct/1/model-00005-of-00005.safetensors
/kaggle/input/qwen2-vl/transformers/7b-instruct/1/model-00001-of-00005.safetensors
/kaggle/input/qwen2-vl/transformers/7b-instruct/1/README.md
/kaggle/input/qwen2-vl/transformers/7b-instruct/1/model-00002-of-00005.safetensors
/kaggle/input/qwen2-vl/transformers/7b-instruct/1/tokenizer.json
/kaggle/input/qwen2-vl/transformers/7b-instruct/1/vocab.json
/kaggle/input/qwen2-vl/transformers/7b-instruct/1/tokenizer_config.js

In [1]:
!pip install -q git+https://github.com/huggingface/transformers@21fac7abba2a37fae86106f87fcf9974fd1e3830 accelerate
!pip install -q optimum auto-gptq qwen-vl-utils[decord]

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m50.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m433.6/433.6 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.5/23.5 MB[0m [31m77.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.7/38.7 MB[0m [31m43.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.6/13.6 MB[0m [31m80.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.2/13.2 

In [3]:
import cv2
from PIL import Image
import matplotlib.pyplot as plt
from qwen_vl_utils import process_vision_info
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
import torch
from collections import defaultdict

class VideoAnalyzer:
    def __init__(self, model_size="2b"):
        """Initialize with specified model size (2b, 7b, or 72b)"""
        model_paths = {
            "2b": "/kaggle/input/qwen2-vl/transformers/2b-instruct/1",
            "7b": "/kaggle/input/qwen2-vl/transformers/7b-instruct/1",
            "72b": "/kaggle/input/qwen2-vl/transformers/72b-instruct/1"
        }
        
        model_path = model_paths[model_size]
        print(f"Loading model from: {model_path}")
        
        self.model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_path,
            torch_dtype="auto",
            device_map="auto"
        )
        self.processor = AutoProcessor.from_pretrained(model_path)

    def analyze_frame(self, frame, prompt):
        """Analyze a single frame with given prompt"""
        if not isinstance(frame, Image.Image):
            frame = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        
        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "image": frame,
                    },
                    {"type": "text", "text": prompt},
                ],
            }
        ]
        
        text = self.processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = self.processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )
        inputs = inputs.to("cuda" if torch.cuda.is_available() else "cpu")
        
        generated_ids = self.model.generate(**inputs, max_new_tokens=128)
        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        output_text = self.processor.batch_decode(
            generated_ids_trimmed, 
            skip_special_tokens=True, 
            clean_up_tokenization_spaces=False
        )
        
        return output_text[0]

    def process_video(self, video_path, prompt, frame_interval=30):
        """Process video with adaptive analysis based on prompt type"""
        # Detect prompt type
        needs_timestamps = any(keyword in prompt.lower() for keyword in 
                             ['timestamp', 'when', 'time', 'moment', 'detect'])
        
        cap = cv2.VideoCapture(video_path)
        if not cap.isOpened():
            raise ValueError("Error opening video file")
        
        frames = []
        timestamps = []
        frame_analyses = []
        frame_count = 0
        
        print("Processing video frames...")
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            
            if frame_count % frame_interval == 0:
                timestamp = cap.get(cv2.CAP_PROP_POS_MSEC) / 1000.0
                frames.append(frame)
                timestamps.append(timestamp)
                print(f"Captured frame at {timestamp:.2f}s")
            
            frame_count += 1
        
        cap.release()
        
        # Process frames
        for frame, timestamp in zip(frames, timestamps):
            analysis = self.analyze_frame(frame, prompt)
            frame_analyses.append({
                'timestamp': timestamp,
                'analysis': analysis
            })
            print(f"Processed frame at {timestamp:.2f}s")
        
        # Generate final output based on prompt type
        if needs_timestamps:
            # Return individual frame analyses for timestamp-specific queries
            return frame_analyses
        else:
            # Aggregate analysis for general summaries
            all_observations = [analysis['analysis'] for analysis in frame_analyses]
            summary_prompt = f"Based on these observations from different timestamps of the video, provide a comprehensive summary: {all_observations}"
            
            # Use the first frame as a reference frame for the summary
            if frames:
                final_summary = self.analyze_frame(frames[0], summary_prompt)
                return [{
                    'timestamp': 'Full Video',
                    'analysis': final_summary
                }]
        
        return []



In [5]:
def main():
    print("GPU available:", torch.cuda.is_available())
    if torch.cuda.is_available():
        print("GPU name:", torch.cuda.get_device_name(0))
    
    video_path = "/kaggle/input/mydata/Two Wheeler Safety.mp4"
    prompt = input("Enter your prompt for video analysis: ")
    
    analyzer = VideoAnalyzer(model_size="7b")
    results = analyzer.process_video(video_path, prompt)
    
    # Print results
    for result in results:
        if result['timestamp'] == 'Full Video':
            print("\nComplete Video Analysis:")
        else:
            print(f"\nTimestamp: {result['timestamp']:.2f}s")
        print(f"Analysis: {result['analysis']}")
        print("-" * 50)

if __name__ == "__main__":
    main()

GPU available: True
GPU name: Tesla P100-PCIE-16GB


Enter your prompt for video analysis:  How many bikers are riding their bike without helmet? If possible can you give me the timestamp


Loading model from: /kaggle/input/qwen2-vl/transformers/7b-instruct/1




Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Processing video frames...
Captured frame at 0.00s
Captured frame at 1.20s
Captured frame at 2.40s
Captured frame at 3.60s
Captured frame at 4.80s
Captured frame at 6.00s
Captured frame at 7.20s
Captured frame at 8.40s
Captured frame at 9.60s
Captured frame at 10.80s
Captured frame at 12.00s
Captured frame at 13.20s
Captured frame at 14.40s
Captured frame at 15.60s
Captured frame at 16.80s
Captured frame at 18.00s
Captured frame at 19.20s
Captured frame at 20.40s
Captured frame at 21.60s
Captured frame at 22.80s
Captured frame at 24.00s
Processed frame at 0.00s
Processed frame at 1.20s
Processed frame at 2.40s
Processed frame at 3.60s
Processed frame at 4.80s
Processed frame at 6.00s
Processed frame at 7.20s
Processed frame at 8.40s
Processed frame at 9.60s
Processed frame at 10.80s
Processed frame at 12.00s
Processed frame at 13.20s
Processed frame at 14.40s
Processed frame at 15.60s
Processed frame at 16.80s
Processed frame at 18.00s
Processed frame at 19.20s
Processed frame at 20.40