In [1]:
!pip install opencv-python
!pip install pillow
!pip install transformers -U
!apt-get install ffmpeg
!pip install torchvision
!pip install sentencepiece


Collecting transformers
  Downloading transformers-4.48.3-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.48.3-py3-none-any.whl (9.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m34.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.48.2
    Uninstalling transformers-4.48.2:
      Successfully uninstalled transformers-4.48.2
Successfully installed transformers-4.48.3
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 19 not upgraded.
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch==2.5.1->torchvision)
  Downloading nvidia_cuda_nvrtc_cu12-1

In [2]:
import cv2
import torch
from PIL import Image
from google.colab import files
from tqdm import tqdm
import gc
import os
from PIL import Image
import torch
from transformers import LlavaProcessor, LlavaForConditionalGeneration
import time
import traceback

In [None]:
##does not summarize the text but gives frame by frame.

def load_model():
    """Load LLaVA model with correct configuration"""
    model_id = "llava-hf/llava-1.5-7b-hf"
    processor = LlavaProcessor.from_pretrained(model_id)
    model = LlavaForConditionalGeneration.from_pretrained(
        model_id,
        torch_dtype=torch.float16,
        device_map="auto"
    )
    return processor, model

def extract_frames(video_path, sample_rate=2):
    """Extract frames with LLaVA-optimized size"""
    # Change target size to 336x336 (LLaVA 1.5's expected input size)
    target_size = (336, 336)
    if not os.path.exists(video_path):
        raise FileNotFoundError(f"Video file not found: {video_path}")

    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        raise ValueError("Error opening video file")

    frames = []
    timestamps = []

    fps = cap.get(cv2.CAP_PROP_FPS)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    frame_interval = int(fps * sample_rate)

    with tqdm(total=total_frames // frame_interval, desc="Extracting frames") as pbar:
        frame_count = 0
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            if frame_count % frame_interval == 0:
                img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                img = cv2.resize(img, target_size)  # Resize to 224x224
                img = Image.fromarray(img)
                frames.append(img)
                timestamps.append(frame_count / fps)
                pbar.update(1)

            frame_count += 1

    cap.release()
    return frames, timestamps


def analyze_frames(frames, processor, model, question):
    """Process frames one-by-one with proper formatting"""
    results = []

    # Use LLaVA's required chat template
    formatted_prompt = f"USER: <image>\n{question}\nASSISTANT:"

    for idx, image in enumerate(tqdm(frames, desc="Analyzing frames")):
        try:
            inputs = processor(
                text=formatted_prompt,
                images=image,
                return_tensors="pt"
            ).to(model.device)

            # Generate with adjusted parameters
            output = model.generate(
                **inputs,
                max_new_tokens=150,
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
                pad_token_id=processor.tokenizer.eos_token_id
            )

            result = processor.decode(output[0][2:], skip_special_tokens=True)
            results.append(result)

            # Clear memory after each frame
            del inputs
            torch.cuda.empty_cache()

        except Exception as e:
            print(f"Error processing frame {idx}: {str(e)}")
            results.append("Analysis failed")

    return results



def main():
    try:
        # Load model first to reserve memory
        print("Loading LLaVA model...")
        processor, model = load_model()

        # Upload video
        print("\nUpload your video file (max 30 seconds for free Colab):")
        uploaded = files.upload()
        video_path = next(iter(uploaded))

        # Auto-calculate sample rate based on video length
        cap = cv2.VideoCapture(video_path)
        duration = cap.get(cv2.CAP_PROP_FRAME_COUNT) / cap.get(cv2.CAP_PROP_FPS)
        cap.release()

        sample_rate = max(1, int(duration / 15))  # Get 15 samples max
        print(f"Auto-selected sample rate: {sample_rate} seconds")

        # Extract frames
        frames, timestamps = extract_frames(video_path, sample_rate)

        # Analyze frames
        question = "Describe the scene in detail including people, actions, objects, and context."
        results = analyze_frames(frames, processor, model, question)

        # Print results
        print("\n=== Analysis Results ===")
        for ts, res in zip(timestamps, results):
            print(f"[{ts:.1f}s] {res}")

    except Exception as e:
        print(f"Error: {str(e)}")
    finally:
        # Cleanup
        del model
        torch.cuda.empty_cache()
        gc.collect()

if __name__ == "__main__":
    main()

Loading LLaVA model...


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]




Upload your video file (max 30 seconds for free Colab):


Saving Garbage.mp4 to Garbage.mp4
Auto-selected sample rate: 1 seconds


Extracting frames: 100%|██████████| 28/28 [00:01<00:00, 23.35it/s]
Analyzing frames: 100%|██████████| 28/28 [15:14<00:00, 32.66s/it]



=== Analysis Results ===
[0.0s] ER:  
Describe the scene in detail including people, actions, objects, and context.
ASSISTANT: The image depicts a busy street scene with a mix of people, vehicles, and various objects scattered throughout the area. There are multiple motorcycles parked on the side of the road, with some cars and bicycles nearby. A couple of people can be seen walking around, while others are engaged in conversation or simply standing around.

In the center of the scene, a group of people is standing next to a motorcycle, possibly discussing or admiring it. Several bicycles and cars are also present, with one car positioned towards the left side of the scene and another car on the right side. The overall atmosphere of the street is lively, with people interacting and navigating the crowded space
[1.0s] ER:  
Describe the scene in detail including people, actions, objects, and context.
ASSISTANT: The image depicts a bustling city street with numerous people walking aroun

In [4]:
##summarizes text and gives final output.

def load_model():
    """Load LLaVA model with correct configuration"""
    model_id = "llava-hf/llava-1.5-7b-hf"
    processor = LlavaProcessor.from_pretrained(model_id)
    model = LlavaForConditionalGeneration.from_pretrained(
        model_id,
        torch_dtype=torch.float16,
        device_map="auto"
    )
    return processor, model

def summarize_descriptions(descriptions):
    """Summarize frame descriptions efficiently using batching"""
    from transformers import pipeline

    summarizer = pipeline("summarization", model="t5-small", device=0 if torch.cuda.is_available() else -1)

    combined_text = " ".join(descriptions)
    max_chunk_length = 1024
    chunks = [combined_text[i:i + max_chunk_length] for i in range(0, len(combined_text), max_chunk_length)]

    # Batch process instead of looping
    summaries = summarizer(chunks, max_length=150, min_length=50, do_sample=False)

    return " ".join(summary["summary_text"] for summary in summaries)


def extract_frames(video_path, sample_rate=2):
    """Extract frames with memory optimization for larger videos"""
    target_size = (336, 336)
    if not os.path.exists(video_path):
        raise FileNotFoundError(f"Video file not found: {video_path}")

    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        raise ValueError("Error opening video file")

    frames = []
    timestamps = []

    fps = cap.get(cv2.CAP_PROP_FPS)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    duration = total_frames / fps

    # Calculate sample rate based on video duration
    if duration > 15:
        sample_rate = max(sample_rate, int(duration / 8))  # Max 8 frames for longer videos

    frame_interval = int(fps * sample_rate)
    max_frames = 8  # Hard limit on number of frames

    print(f"Video duration: {duration:.1f}s, Sampling every {sample_rate} seconds")

    with tqdm(total=min(total_frames // frame_interval, max_frames), desc="Extracting frames") as pbar:
        frame_count = 0
        frames_extracted = 0

        while cap.isOpened() and frames_extracted < max_frames:
            ret, frame = cap.read()
            if not ret:
                break

            if frame_count % frame_interval == 0:
                img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                img = cv2.resize(img, target_size)
                img = Image.fromarray(img)
                frames.append(img)
                timestamps.append(frame_count / fps)
                frames_extracted += 1
                pbar.update(1)

                # Clear memory
                del frame
                gc.collect()

            frame_count += 1

    cap.release()
    print(f"Extracted {len(frames)} frames for analysis")
    return frames, timestamps

def analyze_frames(frames, processor, model, question):
    """Process frames with improved memory management"""
    results = []
    formatted_prompt = f"USER: <image>\n{question}\nASSISTANT:"

    for idx, image in enumerate(tqdm(frames, desc="Analyzing frames")):
        try:
            # Clear cache before processing each frame
            torch.cuda.empty_cache()
            gc.collect()

            inputs = processor(
                text=formatted_prompt,
                images=image,
                return_tensors="pt"
            )

            inputs = {k: v.to(model.device) for k, v in inputs.items()}

            with torch.inference_mode():
                output = model.generate(
                    **inputs,
                    max_new_tokens=100,
                    do_sample=False,
                    temperature=0.7,
                    top_p=0.9,
                    pad_token_id=processor.tokenizer.eos_token_id,
                    use_cache=False
                )

            result = processor.decode(output[0], skip_special_tokens=True)
            if "USER:" in result:
                result = result.split("ASSISTANT:")[-1].strip()
            results.append(result)

            # Clear memory
            del inputs, output, image
            torch.cuda.empty_cache()
            gc.collect()

            # Add small delay for memory cleanup
            time.sleep(0.5)

        except Exception as e:
            print(f"Error processing frame {idx}: {str(e)}")
            traceback.print_exc()
            results.append("Analysis failed")

    return results

def main():
    try:
        print("Loading LLaVA model...")
        processor, model = load_model()

        print(f"Model device: {model.device}")
        if torch.cuda.is_available():
            print(f"Initial GPU Memory: {torch.cuda.memory_allocated(0) / 1024**2:.2f}MB")

        user_prompt = input("Enter your question about the video (e.g., 'What is happening in this scene?'): ")

        print("\nUpload your video file (max 30 seconds for free Colab):")
        uploaded = files.upload()
        video_path = next(iter(uploaded))

        # Extract frames with memory optimization
        frames, timestamps = extract_frames(video_path)

        # Clear memory before analysis
        torch.cuda.empty_cache()
        gc.collect()

        if torch.cuda.is_available():
            print(f"GPU Memory before analysis: {torch.cuda.memory_allocated(0) / 1024**2:.2f}MB")

        results = analyze_frames(frames, processor, model, user_prompt)

        print("\n=== Frame-by-Frame Analysis ===")
        for ts, res in zip(timestamps, results):
            print(f"[{ts:.1f}s] {res}")

        # Generate summary for valid results
        if any(res != "Analysis failed" for res in results):
            print("\n=== Overall Summary ===")
            valid_results = [res for res in results if res != "Analysis failed"]
            torch.cuda.empty_cache()
            gc.collect()
            summary = summarize_descriptions(valid_results)
            print(summary)
        else:
            print("\nNo valid analysis results to summarize.")

    except Exception as e:
        print(f"Error: {str(e)}")
        traceback.print_exc()
    finally:
        del model
        torch.cuda.empty_cache()
        gc.collect()

if __name__ == "__main__":
    import time
    import traceback
    main()

Loading LLaVA model...


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]



Model device: cuda:0
Initial GPU Memory: 12845.12MB
Enter your question about the video (e.g., 'What is happening in this scene?'): what is happening in this video ?

Upload your video file (max 30 seconds for free Colab):


Saving WhatsApp Video 2025-02-09 at 04.16.40_c7897be9.mp4 to WhatsApp Video 2025-02-09 at 04.16.40_c7897be9 (1).mp4
Video duration: 2.3s, Sampling every 2 seconds


Extracting frames: 2it [00:00,  2.55it/s]


Extracted 2 frames for analysis
GPU Memory before analysis: 12845.12MB


Analyzing frames: 100%|██████████| 2/2 [01:51<00:00, 55.82s/it]



=== Frame-by-Frame Analysis ===
[0.0s] The video captures a snowy street scene with a car driving down the road. The street is lined with houses, and there are multiple cars parked or driving along the road. The snow-covered street and the presence of cars suggest that the image was taken during the winter season. The overall atmosphere of the scene is calm and quiet, with no visible signs of activity or people.
[2.0s] In the video, a snowy street is shown with a few cars parked along the side. There is a street sign on the sidewalk, and a truck is parked near the sign. The scene also features a few people walking on the sidewalk, possibly going about their daily activities despite the snowy weather. The overall atmosphere of the video is that of a typical winter day in a residential area.

=== Overall Summary ===


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Device set to use cuda:0


the video captures a snowy street scene with a car driving down the road . the street is lined with houses, and there are multiple cars parked or driving along the road. the overall atmosphere of the scene is calm and quiet, with no visible signs of activity .


In [None]:
##worked for that fire video, not working for more vids but lemme test.

def load_model():
    """Load LLaVA model with correct configuration"""
    model_id = "llava-hf/llava-1.5-7b-hf"
    processor = LlavaProcessor.from_pretrained(model_id)
    model = LlavaForConditionalGeneration.from_pretrained(
        model_id,
        torch_dtype=torch.float16,
        device_map="auto"
    )
    return processor, model

def summarize_descriptions(descriptions):
    """Summarize frame descriptions efficiently using batching"""
    from transformers import pipeline

    summarizer = pipeline("summarization", model="t5-small", device=0 if torch.cuda.is_available() else -1)

    combined_text = " ".join(descriptions)
    max_chunk_length = 1024
    chunks = [combined_text[i:i + max_chunk_length] for i in range(0, len(combined_text), max_chunk_length)]

    # Batch process instead of looping
    summaries = summarizer(chunks, max_length=150, min_length=50, do_sample=False)

    return " ".join(summary["summary_text"] for summary in summaries)


def extract_frames(video_path, sample_rate=2):
    """Extract frames with memory optimization for larger videos"""
    target_size = (336, 336)
    if not os.path.exists(video_path):
        raise FileNotFoundError(f"Video file not found: {video_path}")

    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        raise ValueError("Error opening video file")

    frames = []
    timestamps = []

    fps = cap.get(cv2.CAP_PROP_FPS)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    duration = total_frames / fps

    # Calculate sample rate based on video duration
    if duration > 15:
        sample_rate = max(sample_rate, int(duration / 8))  # Max 8 frames for longer videos

    frame_interval = int(fps * sample_rate)
    max_frames = 8  # Hard limit on number of frames

    print(f"Video duration: {duration:.1f}s, Sampling every {sample_rate} seconds")

    with tqdm(total=min(total_frames // frame_interval, max_frames), desc="Extracting frames") as pbar:
        frame_count = 0
        frames_extracted = 0

        while cap.isOpened() and frames_extracted < max_frames:
            ret, frame = cap.read()
            if not ret:
                break

            if frame_count % frame_interval == 0:
                img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                img = cv2.resize(img, target_size)
                img = Image.fromarray(img)
                frames.append(img)
                timestamps.append(frame_count / fps)
                frames_extracted += 1
                pbar.update(1)

                # Clear memory
                del frame
                gc.collect()

            frame_count += 1

    cap.release()
    print(f"Extracted {len(frames)} frames for analysis")
    return frames, timestamps

def analyze_frames(frames, processor, model, question):
    """Process frames with improved memory management"""
    results = []
    formatted_prompt = f"USER: <image>\n{question}\nASSISTANT:"

    for idx, image in enumerate(tqdm(frames, desc="Analyzing frames")):
        try:
            # Clear cache before processing each frame
            torch.cuda.empty_cache()
            gc.collect()

            inputs = processor(
                text=formatted_prompt,
                images=image,
                return_tensors="pt"
            )

            inputs = {k: v.to(model.device) for k, v in inputs.items()}

            with torch.inference_mode():
                output = model.generate(
                    **inputs,
                    max_new_tokens=100,
                    do_sample=False,
                    temperature=0.7,
                    top_p=0.9,
                    pad_token_id=processor.tokenizer.eos_token_id,
                    use_cache=False
                )

            result = processor.decode(output[0], skip_special_tokens=True)
            if "USER:" in result:
                result = result.split("ASSISTANT:")[-1].strip()
            results.append(result)

            # Clear memory
            del inputs, output, image
            torch.cuda.empty_cache()
            gc.collect()

            # Add small delay for memory cleanup
            time.sleep(0.5)

        except Exception as e:
            print(f"Error processing frame {idx}: {str(e)}")
            traceback.print_exc()
            results.append("Analysis failed")

    return results

def main():
    try:
        print("Loading LLaVA model...")
        processor, model = load_model()

        print(f"Model device: {model.device}")
        if torch.cuda.is_available():
            print(f"Initial GPU Memory: {torch.cuda.memory_allocated(0) / 1024**2:.2f}MB")

        user_prompt = input("Enter your question about the video (e.g., 'What is happening in this scene?'): ")

        print("\nUpload your video file (max 30 seconds for free Colab):")
        uploaded = files.upload()
        video_path = next(iter(uploaded))

        # Extract frames with memory optimization
        frames, timestamps = extract_frames(video_path)

        # Clear memory before analysis
        torch.cuda.empty_cache()
        gc.collect()

        if torch.cuda.is_available():
            print(f"GPU Memory before analysis: {torch.cuda.memory_allocated(0) / 1024**2:.2f}MB")

        results = analyze_frames(frames, processor, model, user_prompt)

        print("\n=== Frame-by-Frame Analysis ===")
        for ts, res in zip(timestamps, results):
            print(f"[{ts:.1f}s] {res}")

        # Generate summary for valid results
        if any(res != "Analysis failed" for res in results):
            print("\n=== Overall Summary ===")
            valid_results = [res for res in results if res != "Analysis failed"]
            torch.cuda.empty_cache()
            gc.collect()
            summary = summarize_descriptions(valid_results)
            print(summary)
        else:
            print("\nNo valid analysis results to summarize.")

    except Exception as e:
        print(f"Error: {str(e)}")
        traceback.print_exc()
    finally:
        del model
        torch.cuda.empty_cache()
        gc.collect()

if __name__ == "__main__":
    import time
    import traceback
    main()

Loading LLaVA model...


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]



Model device: cuda:0
Initial GPU Memory: 1264.66MB
Enter your question about the video (e.g., 'What is happening in this scene?'): what is happening in this video ? explain

Upload your video file (max 30 seconds for free Colab):


Saving geravid.mp4 to geravid.mp4
Video duration: 14.6s, Sampling every 2 seconds


Extracting frames: 8it [00:04,  1.93it/s]


Extracted 8 frames for analysis
GPU Memory before analysis: 1264.66MB


Analyzing frames:   0%|          | 0/8 [00:00<?, ?it/s]

In [None]:
##to be tested.

def load_model():
    """Load LLaVA model with correct configuration and set it to eval mode."""
    model_id = "llava-hf/llava-1.5-7b-hf"
    processor = LlavaProcessor.from_pretrained(model_id)
    model = LlavaForConditionalGeneration.from_pretrained(
        model_id,
        torch_dtype=torch.float16,
        device_map="auto"
    )
    model.eval()  # Set to evaluation mode
    return processor, model

def summarize_descriptions(descriptions):
    """Summarize frame descriptions using a lightweight model."""
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn",
                          device=0 if torch.cuda.is_available() else -1)

    combined_text = " ".join(descriptions)
    max_chunk_length = 1024
    chunks = [combined_text[i:i + max_chunk_length] for i in range(0, len(combined_text), max_chunk_length)]

    summaries = []
    for chunk in chunks:
        summary = summarizer(chunk, max_length=150, min_length=50, do_sample=False)[0]['summary_text']
        summaries.append(summary)

    return " ".join(summaries)

def extract_frames(video_path, sample_rate=2):
    """Extract frames with memory optimization and FPS safety checks."""
    target_size = (336, 336)
    if not os.path.exists(video_path):
        raise FileNotFoundError(f"Video file not found: {video_path}")

    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        raise ValueError("Error opening video file")

    fps = cap.get(cv2.CAP_PROP_FPS)
    if fps <= 0:
        print("FPS not detected or zero; defaulting to 30")
        fps = 30.0

    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    duration = total_frames / fps

    # Adjust sample_rate for longer videos
    if duration > 15:
        sample_rate = max(sample_rate, int(duration / 8))
    frame_interval = int(fps * sample_rate)
    max_frames = 8

    print(f"Video duration: {duration:.1f}s, Sampling every {sample_rate} seconds (frame interval: {frame_interval})")

    frames = []
    timestamps = []
    frame_count = 0
    frames_extracted = 0

    with tqdm(total=min(total_frames // frame_interval, max_frames), desc="Extracting frames") as pbar:
        while cap.isOpened() and frames_extracted < max_frames:
            ret, frame = cap.read()
            if not ret:
                break
            if frame_count % frame_interval == 0:
                img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                img = cv2.resize(img, target_size)
                img = Image.fromarray(img)
                frames.append(img)
                timestamps.append(frame_count / fps)
                frames_extracted += 1
                pbar.update(1)
                # Optional: clear frame reference
                del frame
            frame_count += 1

    cap.release()
    print(f"Extracted {len(frames)} frames for analysis")
    return frames, timestamps

def analyze_frames(frames, processor, model, question):
    """Process frames with improved memory management and logging."""
    results = []
    formatted_prompt = f"USER: <image>\n{question}\nASSISTANT:"

    for idx, image in enumerate(tqdm(frames, desc="Analyzing frames")):
        try:
            print(f"Processing frame {idx}...")
            # Clear caches less aggressively; you can adjust frequency if needed
            torch.cuda.empty_cache()

            inputs = processor(
                text=formatted_prompt,
                images=image,
                return_tensors="pt"
            )
            inputs = {k: v.to(model.device) for k, v in inputs.items()}

            with torch.inference_mode():
                output = model.generate(
                    **inputs,
                    max_new_tokens=150,
                    do_sample=True,         # Consider setting to False for deterministic results on short texts
                    temperature=0.7,
                    top_p=0.9,
                    pad_token_id=processor.tokenizer.eos_token_id,
                    use_cache=True
                )

            result = processor.decode(output[0], skip_special_tokens=True)
            # Remove the prompt portion if present
            if "USER:" in result:
                result = result.split("ASSISTANT:")[-1].strip()
            results.append(result)

            # Cleanup per iteration; reduce sleep delay
            del inputs, output
            torch.cuda.empty_cache()
            gc.collect()
            time.sleep(0.1)  # Reduced delay

        except Exception as e:
            print(f"Error processing frame {idx}: {str(e)}")
            traceback.print_exc()
            results.append("Analysis failed")

    return results

def main():
    try:
        print("Loading LLaVA model...")
        processor, model = load_model()
        print(f"Model device: {model.device}")
        if torch.cuda.is_available():
            print(f"Initial GPU Memory: {torch.cuda.memory_allocated(0) / 1024**2:.2f}MB")

        user_prompt = input("Enter your question about the video (e.g., 'What is happening in this scene?'): ")
        print("\nUpload your video file (max 30 seconds for free Colab):")
        uploaded = files.upload()
        video_path = next(iter(uploaded))

        frames, timestamps = extract_frames(video_path)
        torch.cuda.empty_cache()
        gc.collect()

        if torch.cuda.is_available():
            print(f"GPU Memory before analysis: {torch.cuda.memory_allocated(0) / 1024**2:.2f}MB")

        results = analyze_frames(frames, processor, model, user_prompt)

        print("\n=== Frame-by-Frame Analysis ===")
        for ts, res in zip(timestamps, results):
            print(f"[{ts:.1f}s] {res}")

        valid_results = [res for res in results if res != "Analysis failed"]
        if valid_results:
            print("\n=== Overall Summary ===")
            summary = summarize_descriptions(valid_results)
            print(summary)
        else:
            print("\nNo valid analysis results to summarize.")

    except Exception as e:
        print(f"Error: {str(e)}")
        traceback.print_exc()
    finally:
        del model
        torch.cuda.empty_cache()
        gc.collect()

if __name__ == "__main__":
    import time
    import traceback
    main()
