In [None]:
# Dependencies for running Qwen3-VL-30B-A3B-Instruct-FP8 through vLLM.
# The install commands are kept commented out so that re-running the notebook
# does not reinstall anything; uncomment them once in a fresh environment.
# NOTE: the version specifier has to be quoted. Unquoted, the shell parses
# `>=0.0.14` as an output redirection and installs an unconstrained version.
# !pip install vllm
# !pip install "qwen-vl-utils>=0.0.14"
# !pip install transformers
# !pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124

# Reminder banner only: nothing below checks that the packages are actually installed.
print("⚠️ Make sure you have installed: vllm, qwen-vl-utils>=0.0.14, transformers")
print("⚠️ This model requires significant GPU memory (30B FP8 quantized)")
print("⚠️ Multiple GPUs recommended - will use tensor parallelism across all available GPUs")

In [None]:
import torch
import os
import json
from transformers import AutoProcessor
from qwen_vl_utils import process_vision_info
from vllm import LLM, SamplingParams

# Set multiprocessing method for vLLM (must be in place before the engine is created)
os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'

# Model checkpoint path
MODEL_PATH = "Qwen/Qwen3-VL-30B-A3B-Instruct-FP8"

# Pixel budget requested from the processor, named once instead of repeating
# the arithmetic at every use in this cell.
# NOTE(review): in this notebook `processor` is only used for chat templating and
# for reading `image_processor.patch_size`; confirm these bounds actually reach
# the frames vLLM sees, and that the 28-pixel unit matches this model.
MIN_PIXELS = 512 * 28 * 28
MAX_PIXELS = 2048 * 28 * 28

# Query the GPU count once so the engine and the log line below cannot disagree.
NUM_GPUS = torch.cuda.device_count()
if NUM_GPUS < 1:
    # Without this check a tensor_parallel_size of 0 would be handed to vLLM.
    raise RuntimeError(
        "No CUDA device is visible to PyTorch; "
        f"{MODEL_PATH} needs at least one GPU to be loaded with vLLM."
    )

# Load processor with high resolution settings for better face recognition
processor = AutoProcessor.from_pretrained(
    MODEL_PATH,
    min_pixels=MIN_PIXELS,
    max_pixels=MAX_PIXELS
)

# Initialize vLLM engine, sharding the model across every visible GPU.
# NOTE(review): tensor parallelism usually requires the GPU count to divide the
# model's attention-head count evenly -- confirm for unusual GPU counts.
llm = LLM(
    model=MODEL_PATH,
    trust_remote_code=True,
    gpu_memory_utilization=0.70,
    enforce_eager=False,
    tensor_parallel_size=NUM_GPUS,
    seed=0
)

print("✓ Qwen3-VL-30B-A3B-Instruct-FP8 Model loaded with vLLM!")
print("✓ FP8 quantization: Near-BF16 performance with improved efficiency")
print("✓ Enhanced features: Superior visual recognition, 256K context, celebrity recognition")
print(f"✓ Resolution settings: min_pixels={MIN_PIXELS}, max_pixels={MAX_PIXELS}")
print(f"✓ Using {NUM_GPUS} GPU(s) with tensor parallelism")

In [None]:
# --- Folder locations (relative to the notebook's working directory) ---
video_folder = "raw_videos"  # UPDATE THIS
transcript_folder = "transcript_from_audio"  # searched for .txt transcripts
output_folder = "results_qwen3vl_30b"  # JSON and text reports are written here

# --- Frame sampling rate requested for each video, in frames per second ---
VIDEO_FPS = 2.0

# --- Decoding options shared by every generation request ---
_sampling_options = dict(
    temperature=0.7,
    max_tokens=1024,
    top_k=20,
    top_p=0.8,
    stop_token_ids=[],
)
SAMPLING_PARAMS = SamplingParams(**_sampling_options)

# Single prompt template that covers all six analysis tasks in one request.
# It is filled in later with str.format(transcript_info=...), so `{transcript_info}`
# must stay the only brace-delimited field: any other literal `{` or `}` added to
# this text has to be doubled (`{{` / `}}`) or the format call will fail.
comprehensive_prompt = """Analyze this video and transcript to answer each question specifically and directly.

**ANALYSIS TASKS:**

1. CELEBRITIES
List any celebrities or public figures visible. Format: [Name] - [context/role in video]
If none, respond: "None identified"

2. PEOPLE COUNT
Exact number of distinct individuals visible in video: [NUMBER]
If crowd/unclear, provide: [approximate range] (e.g., "~15-20 people")

3. GENDER
List each person's apparent gender presentation:
Person 1: [Male/Female/Not clearly visible]
Person 2: [Male/Female/Not clearly visible]
Format: [Person number/identifier]: [Gender]

4. ETHNICITY
Describe observed ethnic/racial characteristics per person:
Person 1: [Specific observable characteristics - e.g., "Light skin tone, blonde hair" OR "Dark skin tone" OR "East Asian appearance"]
Person 2: [Observable characteristics]
Use descriptive physical traits, not demographic labels.

5. ACTIVITIES
List all visible activities/sports:
- [Activity 1]: [brief description]
- [Activity 2]: [brief description]
Include location/setting context.

6. TRANSCRIPT ANALYSIS
Main topics discussed (if transcript available):
- Topic 1: [specific detail]
- Topic 2: [specific detail]
- Topic 3: [specific detail]
Key speakers (if identifiable): [names or descriptions]
If no transcript: "No transcript available"

**TRANSCRIPT PROVIDED:**
{transcript_info}

**RESPONSE INSTRUCTIONS:**
- Answer only what you observe
- Be specific, not vague
- Use exact numbers where possible
- Describe visible characteristics objectively
- Do not speculate
- Label each answer with its number"""

# Echo the active configuration in one go so the run documents its own settings
_config_summary = [
    f"Video folder: {video_folder}",
    f"Transcript folder: {transcript_folder}",
    f"Output folder: {output_folder}",
    f"Video FPS: {VIDEO_FPS}",
    f"Sampling params: temperature={SAMPLING_PARAMS.temperature}, max_tokens={SAMPLING_PARAMS.max_tokens}",
    "Using Qwen3-VL-30B-A3B-Instruct-FP8 with vLLM for efficient inference",
]
print("\n".join(_config_summary))

In [None]:
# Debug: report which transcript and video files are visible on disk.
# Besides printing, the only side effect is creating the transcript folder when absent.
_video_exts = ('.mp4', '.avi', '.mov', '.mkv')

print("=== TRANSCRIPT FOLDER DEBUG ===")
print(f"Expected transcript folder path: {transcript_folder}")
if os.path.exists(transcript_folder):
    # Sorted so the listing is stable between runs (os.listdir order is arbitrary)
    transcript_files = sorted(f for f in os.listdir(transcript_folder) if f.endswith('.txt'))
    print(f"✓ Transcript folder exists: {transcript_folder}")
    print(f"Found {len(transcript_files)} .txt files:")
    for i, file in enumerate(transcript_files, 1):
        file_path = os.path.join(transcript_folder, file)
        try:
            with open(file_path, 'r', encoding='utf-8') as fh:
                content = fh.read().strip()
            print(f"  {i}. {file} ({len(content)} characters)")
        except Exception as e:
            # Best effort: an unreadable transcript is reported, not fatal
            print(f"  {i}. {file} (Error reading: {e})")
else:
    print(f"Transcript folder does not exist: {transcript_folder}")
    print("Creating the folder...")
    os.makedirs(transcript_folder, exist_ok=True)
    print(f"✓ Created folder: {transcript_folder}")
    print("Please add your transcript .txt files to this folder!")

print("\n=== VIDEO FOLDER DEBUG ===")
if os.path.exists(video_folder):
    # Compare extensions case-insensitively so files such as CLIP.MP4 are listed too
    video_files = sorted(f for f in os.listdir(video_folder) if f.lower().endswith(_video_exts))
    print(f"✓ Video folder exists: {video_folder}")
    print(f"Found {len(video_files)} video files:")
    for i, file in enumerate(video_files, 1):
        print(f"  {i}. {file}")
else:
    print(f"Video folder does not exist: {video_folder}")
    print("Please update the video_folder path in the settings above!")

In [None]:
# Function to load transcript for a video
def load_transcript(video_name):
    """Return the transcript text for ``video_name``, or ``None`` if none can be read.

    The lookup happens inside the module-level ``transcript_folder``. Exact
    filename patterns derived from the video name are tried first; after that,
    any ``.txt`` file whose name contains the video's base name is tried.

    Args:
        video_name: Video filename, with or without its extension.

    Returns:
        The stripped content of the first readable candidate (an empty string
        if that file is empty), or ``None`` when the folder is missing or no
        candidate could be read.
    """
    print(f"    Looking for transcript for video: {video_name}")

    # Nothing to search: bail out before building any candidate list
    if not os.path.isdir(transcript_folder):
        print(f"    Transcript folder does not exist: {transcript_folder}")
        return None

    # Extract base name without extension for matching
    video_base = os.path.splitext(video_name)[0]

    # Exact patterns, in priority order
    candidates = [
        f"{video_base}_audio_transcript.txt",
        f"{video_name}_transcript.txt",
        f"{video_base}_transcript.txt",
        f"{video_name}.txt",
        f"{video_base}.txt"
    ]

    # Fuzzy fallback: files that merely contain the base name. os.listdir returns
    # entries in arbitrary order, so sort them to make the choice reproducible;
    # shortest names go first because they are the closest match to the base
    # name (e.g. "vid1_x.txt" is preferred over "vid10_x.txt" for "vid1").
    fuzzy_matches = sorted(
        (f for f in os.listdir(transcript_folder) if video_base in f and f.endswith('.txt')),
        key=lambda name: (len(name), name),
    )
    if fuzzy_matches:
        print(f"    Fuzzy matches found: {fuzzy_matches}")
        candidates.extend(fuzzy_matches)

    # dict.fromkeys drops duplicates while keeping priority order, so a file
    # that is both an exact and a fuzzy hit is only attempted once
    for transcript_name in dict.fromkeys(candidates):
        transcript_path = os.path.join(transcript_folder, transcript_name)
        if not os.path.isfile(transcript_path):
            continue
        try:
            with open(transcript_path, 'r', encoding='utf-8') as f:
                content = f.read().strip()
        except (OSError, UnicodeError) as e:
            # Best effort: report the problem and fall through to the next candidate
            print(f"    Warning: Could not read transcript {transcript_name}: {e}")
            continue
        print(f"    ✓ Successfully loaded transcript: {transcript_name}")
        return content

    return None

# Helper function to prepare inputs for vLLM
def prepare_inputs_for_vllm(messages, processor):
    """Turn chat ``messages`` into one request dict for ``llm.generate``.

    Args:
        messages: Chat messages, possibly containing image/video entries.
        processor: Object providing ``apply_chat_template`` and
            ``image_processor.patch_size``.

    Returns:
        A dict with the rendered ``prompt``, the ``multi_modal_data`` that is
        present (``image`` and/or ``video``; absent modalities are omitted),
        and the ``mm_processor_kwargs`` reported by ``process_vision_info``.
    """
    prompt_text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # Returning video kwargs/metadata requires qwen_vl_utils 0.0.14+
    images, videos, processor_kwargs = process_vision_info(
        messages,
        image_patch_size=processor.image_processor.patch_size,
        return_video_kwargs=True,
        return_video_metadata=True,
    )

    # Keep only the modalities that were actually extracted from the messages
    modalities = (('image', images), ('video', videos))
    multi_modal = {kind: payload for kind, payload in modalities if payload is not None}

    return {
        'prompt': prompt_text,
        'multi_modal_data': multi_modal,
        'mm_processor_kwargs': processor_kwargs,
    }

# Enhanced analysis function using Qwen3-VL-30B with vLLM
def analyze_video_with_qwen3vl(video_path, video_name, fps=VIDEO_FPS):
    """Run the comprehensive prompt on one video and return the model's answer.

    Args:
        video_path: Path to the video file.
        video_name: Name of the video file, used to look up its transcript.
        fps: Frame sampling rate attached to the video message. The default is
            the value ``VIDEO_FPS`` had when this cell was executed, so re-run
            this cell after changing ``VIDEO_FPS``.

    Returns:
        A ``(response, transcript_text)`` tuple: the stripped generated text and
        whatever ``load_transcript`` returned (``None`` when nothing was found).
    """
    transcript_text = load_transcript(video_name)

    # A missing or empty transcript is replaced by an explicit placeholder
    if not transcript_text:
        transcript_info = "TRANSCRIPT: No transcript available for this video.\n\n"
        print(f"    ⚠ No transcript found - proceeding without transcript")
    else:
        transcript_info = f"TRANSCRIPT:\n{transcript_text}\n\n"
        print(f"    ✓ Using transcript ({len(transcript_text)} characters)")

    # Fill the template's single placeholder with the transcript section
    full_prompt = comprehensive_prompt.format(transcript_info=transcript_info)

    print(f"    Prompt length: {len(full_prompt)} characters")
    print(f"    Video FPS setting: {fps}")

    # One user turn: the video (with its requested sampling rate), then the text prompt
    video_part = {"type": "video", "video": video_path, "fps": fps}
    text_part = {"type": "text", "text": full_prompt}
    messages = [{"role": "user", "content": [video_part, text_part]}]

    request = prepare_inputs_for_vllm(messages, processor)

    # A single request goes in, so only the first completion of the first result is read
    generations = llm.generate([request], SAMPLING_PARAMS)
    first_completion = generations[0].outputs[0]

    return first_completion.text.strip(), transcript_text

# Readiness banner. The FPS shown is the configured VIDEO_FPS rather than a
# hard-coded 2.0, so the message stays truthful when the setting is changed.
print(f"✓ Qwen3-VL-30B analysis function ready with vLLM, FPS={VIDEO_FPS}, and high resolution!")
print("✓ FP8 quantized for efficient inference with near-BF16 performance")
print("✓ Enhanced celebrity recognition with 30B parameter model")

In [None]:
# Process all videos with Qwen3-VL-30B model.
# For each video this writes <name>_qwen3vl30b_analysis.json and .txt into
# output_folder; a failed video gets <name>_error.json and the loop continues.
if not os.path.exists(video_folder):
    print(f"Folder not found: {video_folder}")
else:
    # Check if transcript folder exists
    if not os.path.exists(transcript_folder):
        print(f"Warning: Transcript folder not found: {transcript_folder}")
        print("Will proceed without transcripts")
    else:
        transcript_files = [f for f in os.listdir(transcript_folder) if f.endswith('.txt')]
        print(f"Found {len(transcript_files)} transcript files")

    # Find video files. Extensions are compared case-insensitively so files such
    # as CLIP.MP4 are not skipped, and the list is sorted because os.listdir
    # order is arbitrary (keeps the processing order reproducible).
    videos = sorted(
        f for f in os.listdir(video_folder)
        if f.lower().endswith(('.mp4', '.avi', '.mov', '.mkv'))
    )
    print(f"Found {len(videos)} videos")

    os.makedirs(output_folder, exist_ok=True)

    # Recorded in the reports for provenance only; these repeat the values
    # given to the processor when the model was loaded.
    _min_pixels = 512 * 28 * 28
    _max_pixels = 2048 * 28 * 28

    failed_videos = []

    # Process each video
    for i, video_file in enumerate(videos, 1):
        print(f"\n[{i}/{len(videos)}] Processing with Qwen3-VL-30B-FP8: {video_file}")
        video_path = os.path.join(video_folder, video_file)
        video_name = os.path.splitext(video_file)[0]

        try:
            # Analyze with Qwen3-VL-30B model
            response, transcript_used = analyze_video_with_qwen3vl(video_path, video_file)

            # An empty transcript is treated as "not available" when the prompt
            # is built and in the text report below; use the same truthiness
            # test here so the JSON flag cannot disagree with them.
            has_transcript = bool(transcript_used)

            # Prepare results
            results = {
                "video": video_file,
                "model": "Qwen3-VL-30B-A3B-Instruct-FP8",
                "inference_engine": "vLLM",
                "fps": VIDEO_FPS,
                "resolution_settings": {
                    "min_pixels": _min_pixels,
                    "max_pixels": _max_pixels
                },
                "sampling_params": {
                    "temperature": SAMPLING_PARAMS.temperature,
                    "max_tokens": SAMPLING_PARAMS.max_tokens,
                    "top_k": SAMPLING_PARAMS.top_k,
                    "top_p": SAMPLING_PARAMS.top_p
                },
                "transcript_available": has_transcript,
                "transcript_text": transcript_used,
                "comprehensive_analysis": {
                    "prompt": comprehensive_prompt,
                    "response": response
                }
            }

            # Save results as JSON
            json_file = os.path.join(output_folder, f"{video_name}_qwen3vl30b_analysis.json")
            with open(json_file, 'w', encoding='utf-8') as f:
                json.dump(results, f, indent=2, ensure_ascii=False)

            # Save results as formatted text
            txt_file = os.path.join(output_folder, f"{video_name}_qwen3vl30b_analysis.txt")
            with open(txt_file, 'w', encoding='utf-8') as f:
                f.write(f"VIDEO ANALYSIS: {video_file}\n")
                f.write("MODEL: Qwen3-VL-30B-A3B-Instruct-FP8\n")
                f.write("INFERENCE ENGINE: vLLM\n")
                f.write(f"FPS: {VIDEO_FPS}\n")
                f.write(f"RESOLUTION: min={_min_pixels}, max={_max_pixels}\n")
                f.write(f"TEMPERATURE: {SAMPLING_PARAMS.temperature}, TOP_K: {SAMPLING_PARAMS.top_k}, TOP_P: {SAMPLING_PARAMS.top_p}\n")
                f.write("=" * 70 + "\n\n")

                if has_transcript:
                    f.write("TRANSCRIPT USED:\n")
                    f.write("-" * 20 + "\n")
                    f.write(f"{transcript_used}\n\n")
                else:
                    f.write("TRANSCRIPT: Not available\n\n")

                f.write("COMPREHENSIVE ANALYSIS:\n")
                f.write("-" * 25 + "\n")
                f.write(f"{response}\n\n")

                # The template is written with a placeholder instead of the
                # transcript, which already appears earlier in this report
                f.write("ANALYSIS PROMPT USED:\n")
                f.write("-" * 22 + "\n")
                f.write(comprehensive_prompt.format(transcript_info="[Transcript was inserted here if available]"))

            print("  ✓ Analysis completed successfully")
            print(f"  ✓ Saved: {video_name}_qwen3vl30b_analysis.json and {video_name}_qwen3vl30b_analysis.txt")

        except Exception as e:
            # Deliberately broad: one bad video must not abort the whole batch
            print(f"  Error processing {video_file}: {str(e)}")
            import traceback
            traceback.print_exc()
            failed_videos.append(video_file)

            # Save error results. "transcript_available" is a fixed placeholder
            # here: the lookup result is not available once analysis has raised.
            error_results = {
                "video": video_file,
                "model": "Qwen3-VL-30B-A3B-Instruct-FP8",
                "error": str(e),
                "transcript_available": False
            }

            error_file = os.path.join(output_folder, f"{video_name}_error.json")
            with open(error_file, 'w', encoding='utf-8') as f:
                json.dump(error_results, f, indent=2)

    print(f"\n✓ Done! Results saved to: {output_folder}")
    if failed_videos:
        # Surface failures in the summary so they are not lost in the per-video log
        print(f"⚠ {len(failed_videos)} of {len(videos)} video(s) failed: {failed_videos}")
    print("✓ Qwen3-VL-30B-A3B-Instruct-FP8 analysis complete")
    print("✓ Features: FP8 quantization, vLLM acceleration, superior celebrity recognition")
    print("✓ 30B parameters provide enhanced visual understanding and reasoning")