In [1]:
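# Optional one-time setup: install compatible torch / torchvision / torchaudio builds for CUDA 12.x (12.4 detected).
# PyTorch 2.5.1 is pinned because it is more stable with torchvision (matching builds: torchvision 0.20.1, torchaudio 2.5.1).
# Uncomment to run; %pip installs into the environment of the running kernel.
# %pip install --upgrade --force-reinstall torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cu124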

In [3]:
import torch
import os
import json
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info

# Load the AWQ-quantized Qwen2.5-VL checkpoint together with its processor.
# The checkpoint id is named once so the model and the processor always refer
# to the same repository.
MODEL_ID = "Qwen/Qwen2.5-VL-7B-Instruct-AWQ"

# float16 is requested explicitly: loading with torch_dtype="auto" made
# transformers warn "We suggest you to set `dtype=torch.float16` for better
# efficiency on CUDA/XPU with AWQ."
# NOTE(review): that warning names the keyword `dtype`; newer transformers
# releases may prefer it over `torch_dtype` - confirm against the installed version.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,
    device_map="auto",
)

# Optional: Enable flash_attention_2 for better performance
# Uncomment the following if you have flash_attention_2 installed:
# model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
#     MODEL_ID,
#     torch_dtype=torch.float16,
#     attn_implementation="flash_attention_2",
#     device_map="auto",
# )

processor = AutoProcessor.from_pretrained(MODEL_ID)

# Optional: Set custom min_pixels and max_pixels for performance tuning
# Uncomment and adjust as needed:
# min_pixels = 256 * 28 * 28
# max_pixels = 1280 * 28 * 28
# processor = AutoProcessor.from_pretrained(
#     MODEL_ID,
#     min_pixels=min_pixels,
#     max_pixels=max_pixels
# )

print("AWQ Model loaded!")

We suggest you to set `dtype=torch.float16` for better efficiency on CUDA/XPU with AWQ.


Downloading (incomplete total...): 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

Loading weights:   0%|          | 0/1121 [00:00<?, ?it/s]

AWQ Model loaded!


In [None]:
# Settings: input/output locations and the analysis prompt.
video_folder = "raw_videos"  # UPDATE THIS: folder that holds the input videos
transcript_folder = "transcript_from_audio"  # folder that holds the .txt transcripts
output_folder = "results"  # folder that receives the analysis files

# A single prompt asks every question at once. It is assembled line by line;
# the "{transcript_info}" placeholder is filled in later with str.format().
_prompt_lines = (
    "Analyze this video and transcript to answer each question specifically and directly.",
    "",
    "**ANALYSIS TASKS:**",
    "",
    "1. CELEBRITIES",
    "List any celebrities or public figures visible. Format: [Name] - [context/role in video]",
    'If none, respond: "None identified"',
    "",
    "2. PEOPLE COUNT",
    "Exact number of distinct individuals visible in video: [NUMBER]",
    'If crowd/unclear, provide: [approximate range] (e.g., "~15-20 people")',
    "",
    "3. GENDER",
    "List each person's apparent gender presentation:",
    "Person 1: [Male/Female/Not clearly visible]",
    "Person 2: [Male/Female/Not clearly visible]",
    "Format: [Person number/identifier]: [Gender]",
    "",
    "4. ETHNICITY",
    "Describe observed ethnic/racial characteristics per person:",
    'Person 1: [Specific observable characteristics - e.g., "Light skin tone, blonde hair" OR "Dark skin tone" OR "East Asian appearance"]',
    "Person 2: [Observable characteristics]",
    "Use descriptive physical traits, not demographic labels.",
    "",
    "5. ACTIVITIES",
    "List all visible activities/sports:",
    "- [Activity 1]: [brief description]",
    "- [Activity 2]: [brief description]",
    "Include location/setting context.",
    "",
    "6. TRANSCRIPT ANALYSIS",
    "Main topics discussed (if transcript available):",
    "- Topic 1: [specific detail]",
    "- Topic 2: [specific detail]",
    "- Topic 3: [specific detail]",
    "Key speakers (if identifiable): [names or descriptions]",
    'If no transcript: "No transcript available"',
    "",
    "**TRANSCRIPT PROVIDED:**",
    "{transcript_info}",
    "",
    "**RESPONSE INSTRUCTIONS:**",
    "- Answer only what you observe",
    "- Be specific, not vague",
    "- Use exact numbers where possible",
    "- Describe visible characteristics objectively",
    "- Do not speculate",
    "- Label each answer with its number",
)
comprehensive_prompt = "\n".join(_prompt_lines)

# Echo the active configuration so the cell output documents the run.
print("Video folder:", video_folder)
print("Transcript folder:", transcript_folder)
print("Using single comprehensive prompt for all analyses")

In [None]:
# Debug: report which transcript and video files the later cells will see.
# Side effect: the transcript folder is created when it does not exist yet.
print("=== TRANSCRIPT FOLDER DEBUG ===")
print(f"Expected transcript folder path: {transcript_folder}")
if os.path.exists(transcript_folder):
    # Sorted so the listing is identical on every run (os.listdir order is arbitrary).
    transcript_files = sorted(f for f in os.listdir(transcript_folder) if f.endswith('.txt'))
    print(f"✓ Transcript folder exists: {transcript_folder}")
    print(f"Found {len(transcript_files)} .txt files:")
    for i, file in enumerate(transcript_files, 1):
        file_path = os.path.join(transcript_folder, file)
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read().strip()
            print(f"  {i}. {file} ({len(content)} characters)")
        except (OSError, UnicodeError) as e:
            # Keep listing the remaining files even if one cannot be read or decoded.
            print(f"  {i}. {file} (Error reading: {e})")
else:
    print(f"Transcript folder does not exist: {transcript_folder}")
    print("Creating the folder...")
    os.makedirs(transcript_folder, exist_ok=True)
    print(f"✓ Created folder: {transcript_folder}")
    print("Please add your transcript .txt files to this folder!")

print("\n=== VIDEO FOLDER DEBUG ===")
# Start from an empty list on every run: when the video folder is missing, a
# `video_files` value left in the kernel by an earlier run must not be reused.
video_files = []
if os.path.exists(video_folder):
    video_files = sorted(
        f for f in os.listdir(video_folder)
        if f.endswith(('.mp4', '.avi', '.mov', '.mkv'))
    )
    print(f"✓ Video folder exists: {video_folder}")
    print(f"Found {len(video_files)} video files:")
    for i, file in enumerate(video_files, 1):
        print(f"  {i}. {file}")
else:
    print(f"Video folder does not exist: {video_folder}")
    print("Please update the video_folder path in the settings above!")

print("\n=== EXPECTED TRANSCRIPT NAMING ===")
print("Based on your example: 24717902_TV_Excuse_for_Running_audio_transcript.txt")
if video_files:
    for video_file in video_files[:3]:  # Show first 3 as examples
        video_base = os.path.splitext(video_file)[0]
        # Same names, in the same order, that load_transcript() tries.
        print(f"For video '{video_file}', looking for transcripts:")
        print(f"  - {video_base}_audio_transcript.txt  <- Most likely match")
        print(f"  - {video_file}_transcript.txt")
        print(f"  - {video_base}_transcript.txt")
        print(f"  - {video_file}.txt")
        print(f"  - {video_base}.txt")
        print()
else:
    print("Example patterns that will be searched:")
    print("  - {video_name_without_ext}_audio_transcript.txt")
    print("  - {video_name}_transcript.txt")
    print("  - {video_name_without_ext}_transcript.txt")
    print("  - {video_name}.txt")
    print("  - {video_name_without_ext}.txt")
    print("  - Plus fuzzy matching for partial name matches")

In [None]:
# Function to load transcript for a video
def load_transcript(video_name, transcript_dir=None):
    """Return the transcript text for ``video_name``, or ``None`` if none can be read.

    Candidate file names are tried in this order:

    1. ``<base>_audio_transcript.txt``
    2. ``<video_name>_transcript.txt``
    3. ``<base>_transcript.txt``
    4. ``<video_name>.txt``
    5. ``<base>.txt``
    6. any other ``.txt`` file whose name contains ``<base>`` (fuzzy matches,
       in sorted order so the choice is the same on every run)

    where ``<base>`` is ``video_name`` without its extension.

    Args:
        video_name: Video file name, with or without extension.
        transcript_dir: Folder to search. When ``None`` (the default) the
            module-level ``transcript_folder`` setting is used.

    Returns:
        The stripped contents of the first candidate that exists and can be
        read as UTF-8, or ``None`` when the folder is missing or no candidate
        is readable.
    """
    folder = transcript_folder if transcript_dir is None else transcript_dir
    print(f"    Looking for transcript for video: {video_name}")

    # Extract base name without extension for matching
    video_base = os.path.splitext(video_name)[0]

    # Based on the example: 24717902_TV_Excuse_for_Running_audio_transcript.txt
    possible_names = [
        f"{video_base}_audio_transcript.txt",  # Most likely pattern
        f"{video_name}_transcript.txt",
        f"{video_base}_transcript.txt",
        f"{video_name}.txt",
        f"{video_base}.txt"
    ]

    print(f"    Checking transcript folder: {folder}")
    print(f"    Video base name: {video_base}")
    print(f"    Possible transcript names: {possible_names}")

    # isdir (not exists): a regular file at this path cannot be listed.
    if not os.path.isdir(folder):
        print(f"    Transcript folder does not exist: {folder}")
        return None

    # Sorted because os.listdir returns entries in arbitrary order; without it
    # the fuzzy match that wins could differ between runs.
    all_transcript_files = sorted(os.listdir(folder))
    print(f"    Files in transcript folder: {all_transcript_files}")

    # Fuzzy matching is a fallback after the exact names. An empty base name is
    # skipped because "" is a substring of every file name.
    fuzzy_matches = [
        f for f in all_transcript_files
        if video_base and video_base in f and f.endswith('.txt')
    ]
    if fuzzy_matches:
        print(f"    Fuzzy matches found: {fuzzy_matches}")
        possible_names.extend(fuzzy_matches)

    # Drop repeated names (keeping first-seen order) so no file is tried twice.
    possible_names = list(dict.fromkeys(possible_names))

    for transcript_name in possible_names:
        transcript_path = os.path.join(folder, transcript_name)
        print(f"    Trying: {transcript_path}")
        if not os.path.isfile(transcript_path):
            print(f"    ✗ File not found: {transcript_name}")
            continue
        try:
            with open(transcript_path, 'r', encoding='utf-8') as f:
                content = f.read().strip()
        except (OSError, UnicodeError) as e:
            # Best effort: an unreadable candidate must not hide a later readable one.
            print(f"    Warning: Could not read transcript {transcript_name}: {e}")
            continue
        print(f"    ✓ Successfully loaded transcript: {transcript_name}")
        return content

    return None

# Enhanced analysis function with transcript integration
def analyze_video_with_transcript(video_path, video_name, max_new_tokens=512):
    """Run the comprehensive prompt on one video, including its transcript if found.

    Args:
        video_path: Path of the video file handed to the model.
        video_name: Video file name used to look up the transcript via
            ``load_transcript``.
        max_new_tokens: Upper bound on generated tokens. Defaults to 512, the
            value that was previously hard-coded.

    Returns:
        A ``(response, transcript_text)`` tuple: the decoded model answer and
        whatever ``load_transcript`` returned (``None`` when no transcript was
        found).
    """
    transcript_text = load_transcript(video_name)

    # An empty or missing transcript is reported to the model as unavailable.
    if transcript_text:
        transcript_info = f"TRANSCRIPT:\n{transcript_text}\n\n"
        print(f"    ✓ Using transcript ({len(transcript_text)} characters)")
    else:
        transcript_info = "TRANSCRIPT: No transcript available for this video.\n\n"
        print("    ⚠ No transcript found - proceeding without transcript")

    full_prompt = comprehensive_prompt.format(transcript_info=transcript_info)

    # Debug: show the size of the prompt being used
    print(f"    Prompt length: {len(full_prompt)} characters")

    messages = [{
        "role": "user",
        "content": [
            {"type": "video", "video": video_path},
            {"type": "text", "text": full_prompt}
        ]
    }]

    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    # NOTE(review): qwen_vl_utils can presumably also return video kwargs (e.g. fps)
    # via return_video_kwargs=True - confirm whether the processor should receive them.
    image_inputs, video_inputs = process_vision_info(messages)

    # Send the inputs to the device the model actually lives on. The model is
    # loaded with device_map="auto", so a hard-coded "cuda" would fail on
    # machines where it was placed elsewhere.
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt"
    ).to(model.device)

    generated_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
    # generate() returns prompt + continuation; keep only the newly generated part.
    generated_ids_trimmed = [out[len(inp):] for inp, out in zip(inputs.input_ids, generated_ids)]
    response = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True)[0].strip()

    return response, transcript_text

print("Enhanced analysis function with correct transcript path structure ready!")

In [None]:
# Process all videos with transcript integration.
# For every video in `video_folder` this cell runs the analysis and writes
# <name>_analysis.json and <name>_analysis.txt to `output_folder`. When the
# analysis raises, <name>_error.json is written instead and the loop moves on.
if not os.path.exists(video_folder):
    print(f"Folder not found: {video_folder}")
else:
    # Transcripts are optional: report how many exist, but never abort.
    if not os.path.exists(transcript_folder):
        print(f"Warning: Transcript folder not found: {transcript_folder}")
        print("Will proceed without transcripts")
    else:
        transcript_files = [f for f in os.listdir(transcript_folder) if f.endswith('.txt')]
        print(f"Found {len(transcript_files)} transcript files")

    # Sorted so the processing order is reproducible (os.listdir order is arbitrary).
    videos = sorted(
        f for f in os.listdir(video_folder)
        if f.endswith(('.mp4', '.avi', '.mov', '.mkv'))
    )
    print(f"Found {len(videos)} videos")

    os.makedirs(output_folder, exist_ok=True)

    # Names of videos whose analysis raised, for the closing summary.
    failed_videos = []

    for i, video_file in enumerate(videos, 1):
        print(f"\n[{i}/{len(videos)}] Processing: {video_file}")
        video_path = os.path.join(video_folder, video_file)
        video_name = os.path.splitext(video_file)[0]

        try:
            # Analyze with comprehensive prompt and transcript
            response, transcript_used = analyze_video_with_transcript(video_path, video_file)

            results = {
                "video": video_file,
                # Truthiness, not `is not None`: an empty transcript is treated as
                # unavailable, in agreement with the text report written below.
                "transcript_available": bool(transcript_used),
                "transcript_text": transcript_used,
                "comprehensive_analysis": {
                    "prompt": comprehensive_prompt,
                    "response": response
                }
            }

            # Save results as JSON
            json_file = os.path.join(output_folder, f"{video_name}_analysis.json")
            with open(json_file, 'w', encoding='utf-8') as f:
                json.dump(results, f, indent=2, ensure_ascii=False)

            # Save results as formatted text
            txt_file = os.path.join(output_folder, f"{video_name}_analysis.txt")
            with open(txt_file, 'w', encoding='utf-8') as f:
                f.write(f"VIDEO ANALYSIS: {video_file}\n")
                f.write("=" * 60 + "\n\n")

                if transcript_used:
                    f.write("TRANSCRIPT USED:\n")
                    f.write("-" * 20 + "\n")
                    f.write(f"{transcript_used}\n\n")
                else:
                    f.write("TRANSCRIPT: Not available\n\n")

                f.write("COMPREHENSIVE ANALYSIS:\n")
                f.write("-" * 25 + "\n")
                f.write(f"{response}\n\n")

                f.write("ANALYSIS PROMPT USED:\n")
                f.write("-" * 22 + "\n")
                f.write(comprehensive_prompt.format(transcript_info="[Transcript was inserted here if available]"))

            print("  ✓ Analysis completed successfully")
            print(f"  ✓ Saved: {video_name}_analysis.json and {video_name}_analysis.txt")

        except Exception as e:
            # Deliberate best effort: one failing video must not stop the batch.
            print(f"  Error processing {video_file}: {str(e)}")
            failed_videos.append(video_file)

            error_results = {
                "video": video_file,
                "error": str(e),
                "transcript_available": False
            }

            # Written like the success files (UTF-8, non-ASCII kept readable) so
            # error messages containing file names are stored consistently.
            error_file = os.path.join(output_folder, f"{video_name}_error.json")
            with open(error_file, 'w', encoding='utf-8') as f:
                json.dump(error_results, f, indent=2, ensure_ascii=False)

    print(f"\nDone! Results saved to: {output_folder}")
    if failed_videos:
        # Do not claim full coverage when some videos only produced an error file.
        print(f"{len(failed_videos)} of {len(videos)} videos failed (see *_error.json): {failed_videos}")
    else:
        print("Each video now has comprehensive analysis with integrated transcript data")