# Phase2

In [6]:
import json
import pandas as pd
import re

# Load the full JSON file
with open('captions/activity_net.v1-3.min.json', 'r') as f:
    data = json.load(f)

# Step 1: Flatten the data into a list of {video_id, caption}
records = []
for video_id, video_info in data['database'].items():
    for ann in video_info['annotations']:
        records.append({
            'video_id': video_id,
            'caption': ann['label']
        })

# Step 2: Convert to DataFrame
df = pd.DataFrame(records)

# Step 3: Filter for captions containing the word "car"
contains_car = df[df['caption'].str.contains(r'\bcar\b', case=False, na=False)]
car_video_ids = set(contains_car['video_id'])

# Step 4: Filter for videos with more than 8 captions
caption_counts = df['video_id'].value_counts()
videos_with_8plus = set(caption_counts[caption_counts > 3].index)

# Step 5: Intersection of both criteria
valid_video_ids = list(car_video_ids & videos_with_8plus)

# Step 6: Pick 10 video IDs (or fewer if not enough matches)
selected_video_ids = valid_video_ids[:10]

# Step 7: Get all captions for the selected videos
selected_df = df[df['video_id'].isin(selected_video_ids)]

# Print summary
print(f"Selected {len(selected_df)} captions from {len(selected_video_ids)} videos:")
print(selected_df.groupby("video_id").size())

# Optional: Preview some of the selected captions
print("\nSample captions:")
print(selected_df.head(10).to_string(index=False))


Selected 63 captions from 10 videos:
video_id
AT_pPlJTiyE     9
ELiXlJUBzzw     7
LGS_yzsScfw     4
VOyKKN3NdXM     6
XwW5p80hYCg     8
Z-6dR4H2dns     6
dL--vW-AJJo     5
iSH43hQoxio    10
orwTrxIwCpo     4
z8VqGGu5vPc     4
dtype: int64

Sample captions:
   video_id       caption
orwTrxIwCpo Hand car wash
orwTrxIwCpo Hand car wash
orwTrxIwCpo Hand car wash
orwTrxIwCpo Hand car wash
VOyKKN3NdXM Hand car wash
VOyKKN3NdXM Hand car wash
VOyKKN3NdXM Hand car wash
VOyKKN3NdXM Hand car wash
VOyKKN3NdXM Hand car wash
VOyKKN3NdXM Hand car wash


In [11]:
import json
import pandas as pd
import re
from pytube import YouTube
import av
import os

# --- CONFIGURATION ---
JSON_FILE = "captions/activity_net.v1-3.min.json"
OUTPUT_DIR = "extracted_frames"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# --- STEP 1: Load video caption metadata ---
with open(JSON_FILE, 'r') as f:
    full_data = json.load(f)

records = []
for vid, info in full_data["database"].items():
    for ann in info["annotations"]:
        records.append({
            "video_id": vid,
            "caption": ann["label"]
        })

df = pd.DataFrame(records)

# --- STEP 2: Filter videos by caption and count ---
contains_car = df[df["caption"].str.contains(r"\bcar\b", case=False, na=False)]
car_video_ids = set(contains_car["video_id"])

caption_counts = df["video_id"].value_counts()
videos_with_8plus = set(caption_counts[caption_counts > 8].index)

valid_video_ids = list(car_video_ids & videos_with_8plus)
selected_video_ids = valid_video_ids[:10]

print(f"Selected {len(selected_video_ids)} video IDs.")

# --- STEP 3: Download videos and extract frames ---
for video_id in selected_video_ids:
    info = full_data["database"][video_id]
    url = info.get("url")

    print(f"\nProcessing {video_id} from {url}")
    try:
        # Download video
        yt = YouTube(url)
        stream = yt.streams.filter(file_extension='mp4', progressive=True).order_by('resolution').desc().first()
        local_path = stream.download(filename=f"{video_id}.mp4")

        # Extract frames
        with av.open(local_path) as container:
            stream = container.streams.video[0]
            stream.codec_context.skip_frame = "NONKEY"  # Only keyframes

            for i, frame in enumerate(container.decode(stream)):
                out_path = os.path.join(OUTPUT_DIR, f"{video_id}_frame_{i:04d}.jpg")
                frame.to_image().save(out_path, quality=80)

        print(f"Frames saved for {video_id}")

        # Optional: clean up video file
        os.remove(local_path)

    except Exception as e:
        print(f"Failed to process {video_id}: {e}")


Selected 2 video IDs.

Processing iSH43hQoxio from https://www.youtube.com/watch?v=iSH43hQoxio
Failed to process iSH43hQoxio: iSH43hQoxio is unavailable

Processing AT_pPlJTiyE from https://www.youtube.com/watch?v=AT_pPlJTiyE
Failed to process AT_pPlJTiyE: HTTP Error 400: Bad Request


In [None]:
import pandas as pd
from pathlib import Path
import json # Added for load_captions if it were inlined, but it's an import

# --- Necessary imports from Script 1 functionality ---
from pytube import YouTube
import av
import os

def load_captions(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f) # data is a dict where keys are video_ids like "v_---"

    result = []
    # Assuming data is structured as: {"v_VIDEOID": {"timestamps": [[s,e],...], "sentences": ["...",...], "duration": X}}
    for vid_key, meta_info in data.items(): # vid_key is like "v_xxxx"
        # Ensure meta_info is the dictionary containing 'timestamps', 'sentences', 'duration'
        if isinstance(meta_info, dict) and 'timestamps' in meta_info and 'sentences' in meta_info:
            for (start, end), sentence in zip(meta_info['timestamps'], meta_info['sentences']):
                result.append({
                    "video_id": vid_key, # Store the original video key
                    "duration": meta_info.get("duration", 0), # Use .get for safety
                    "start": start,
                    "end": end,
                    "caption": sentence,
                    "video_url": f"https://www.youtube.com/watch?v={vid_key[2:]}" if vid_key.startswith("v_") else f"https://www.youtube.com/watch?v={vid_key}"
                })
        else:
            print(f"Warning: Skipping entry {vid_key} due to unexpected structure or missing keys.")


    return result
# --- End of load_captions definition ---


# Get current working directory
repo_root = Path().resolve()

# Build the relative path to the JSON file
# Make sure this path is correct for your environment
json_path = repo_root / "captions" / "train.json" # Example path

# --- CONFIGURATION for frame extraction (from Script 1) ---
OUTPUT_DIR = "extracted_frames_from_script2" # Changed name to avoid conflict
os.makedirs(OUTPUT_DIR, exist_ok=True)
# --- End of CONFIGURATION ---

# Load all captions
# Ensure train.json exists at the specified path and is in the expected format for load_captions
if not json_path.exists():
    print(f"Error: JSON file not found at {json_path}")
    print("Please ensure 'captions/train.json' exists relative to the script or provide the correct path.")
    exit()

print(f"Loading captions from: {json_path}")
data = load_captions(str(json_path))
if not data:
    print("No data loaded from captions file. Exiting.")
    exit()

df = pd.DataFrame(data)
if df.empty:
    print("DataFrame is empty after loading captions. Exiting.")
    exit()

# Step 1: Find videos that mention "car"
contains_car = df[df['caption'].str.contains(r'\bcar\b', case=False, na=False)]
car_video_ids = set(contains_car['video_id'])

# Step 2: Get videos with more than 8 captions
caption_counts = df['video_id'].value_counts()
videos_with_8plus = set(caption_counts[caption_counts > 8].index)

# Step 3: Intersection → videos that meet both criteria
valid_video_ids = list(car_video_ids & videos_with_8plus)

# Step 4: Pick just 10 videos from the intersection
# Renaming five_video_ids to selected_video_ids for clarity with Script 1's goal
selected_video_ids = valid_video_ids[:10]

print(f"\nIdentified {len(selected_video_ids)} video IDs for processing.")

# --- Get URLs for the selected video IDs ---
# The 'data' from load_captions already has video_id and video_url.
# We can create a mapping from video_id to its URL.
video_id_to_url_map = {}
if not df.empty and 'video_id' in df.columns and 'video_url' in df.columns:
    temp_df_for_urls = df[df['video_id'].isin(selected_video_ids)][['video_id', 'video_url']].drop_duplicates()
    video_id_to_url_map = pd.Series(temp_df_for_urls.video_url.values, index=temp_df_for_urls.video_id).to_dict()
else:
    print("DataFrame is missing 'video_id' or 'video_url' columns. Cannot proceed with URL mapping.")
    exit()

# --- STEP 5: Download videos and extract frames (adapted from Script 1) ---
print(f"\nStarting download and frame extraction for up to {len(selected_video_ids)} videos...")
processed_video_count = 0
for video_id in selected_video_ids:
    url = video_id_to_url_map.get(video_id)

    if not url:
        print(f"Could not find URL for video_id: {video_id}. Skipping.")
        continue

    print(f"\nProcessing {video_id} from {url}")
    local_path = "" # Initialize to ensure it's defined for finally block if needed
    try:
        # Download video
        yt = YouTube(url)
        # Filter for progressive MP4 streams and get the highest resolution available
        stream = yt.streams.filter(file_extension='mp4', progressive=True).order_by('resolution').desc().first()

        if not stream:
            print(f"No suitable MP4 stream found for {video_id} ({url}). Skipping.")
            continue

        print(f"Downloading video: {video_id}...")
        # Download to a temporary path or the main script directory
        # Using OUTPUT_DIR to keep downloads and frames somewhat together, then deleting.
        download_filename = f"{video_id}.mp4"
        local_path = stream.download(output_path=OUTPUT_DIR, filename=download_filename)
        print(f"Downloaded to {local_path}")

        # Extract frames
        print(f"Extracting frames for {video_id}...")
        with av.open(local_path) as container:
            video_stream = container.streams.video[0]
            video_stream.codec_context.skip_frame = "NONKEY"  # Decode only keyframes

            frames_extracted_count = 0
            for i, frame in enumerate(container.decode(video_stream)):
                out_frame_path = os.path.join(OUTPUT_DIR, f"{video_id}_frame_{i:04d}.jpg")
                frame.to_image().save(out_frame_path, quality=80)
                frames_extracted_count += 1
            print(f"{frames_extracted_count} frames saved for {video_id} in {OUTPUT_DIR}")
        
        processed_video_count +=1

    except Exception as e:
        print(f"Failed to process {video_id}: {e}")
    finally:
        # Optional: clean up video file
        if local_path and os.path.exists(local_path):
            try:
                print(f"Cleaning up video file: {local_path}")
                os.remove(local_path)
            except Exception as e_remove:
                print(f"Failed to remove video file {local_path}: {e_remove}")

print(f"\nFinished processing. {processed_video_count} videos had frames extracted.")

# --- Original Script 2's confirmation (can be kept or removed as needed) ---
# This part is about the selected *captions*, not the video processing.
if not df.empty and 'video_id' in df.columns:
    selected_captions_df = df[df['video_id'].isin(selected_video_ids)]
    if not selected_captions_df.empty:
        print(f"\nSelected {len(selected_captions_df)} captions from the processed video IDs:")
        print(selected_captions_df.groupby("video_id").size())
    else:
        print("\nNo captions found for the selected video IDs in the dataframe (this is unexpected if videos were processed).")
else:
    print("\nSkipping caption confirmation as DataFrame was not properly populated.")
