# 1. Environment Setup and Data Unpacking

This initial section handles the crucial setup steps for the Colab environment. The process is organized as follows:

1.  **Configuration**: Key paths and filenames are defined as variables at the top for easy modification and clarity.
2.  **Mount Google Drive**: The Colab environment is connected to the user's Google Drive to access the project's data archive.
3.  **Unpack Dataset**: The data (`.zip` archive) is located on Drive and then unpacked into the local Colab filesystem.

This approach is chosen for performance reasons, as reading data from the local Colab storage is significantly faster than from Google Drive.


In [None]:
import os
from google.colab import drive

# --- 1. Configuration ---
# Define all necessary paths and filenames here.
GDRIVE_DATA_PATH = "/content/drive/MyDrive/EgoVisionProject/Data"
ZIP_FILE_NAME    = "ego4d_data.zip"
LOCAL_DATA_PATH  = "/content/data" # Local, temporary workspace

print("Configuration set.")

# --- 2. Mount Google Drive ---
print("Connecting to Google Drive...")
drive.mount('/content/drive', force_remount=True)
print("Drive connected.")

# --- 3. Unpack the Dataset ---
gdrive_zip_file = os.path.join(GDRIVE_DATA_PATH, ZIP_FILE_NAME)
print(f"\nLooking for dataset archive at: {gdrive_zip_file}")

if os.path.exists(gdrive_zip_file):
    print("Dataset archive found. Unpacking to local storage...")
    os.makedirs(LOCAL_DATA_PATH, exist_ok=True)

    # Unzip directly from Drive to the local path.
    !unzip -o -q "{gdrive_zip_file}" -d "{LOCAL_DATA_PATH}"
    print("Unpacking complete.")

    # Verification Step
    print("\n--- Verifying contents of local data storage... ---")
    !ls -lH "{LOCAL_DATA_PATH}/ego4d_data/v1/annotations"
    print("---------------------------------------------")
else:
    print(f"ERROR: Dataset archive not found at the specified path.")

# 2. Load Annotations into a Pandas DataFrame

With the data unpacked into the local environment, the next step is to load the `nlq_train.json` annotations file. The raw JSON has a nested structure, which we will flatten to create a structured `pandas` DataFrame. Each row in the DataFrame will represent a single language query, making it the ideal format for subsequent analysis and exploration.

In [None]:
import json
import pandas as pd

# Path of the NLQ training annotations inside the unpacked dataset.
annotations_file = os.path.join(LOCAL_DATA_PATH, 'ego4d_data/v1/annotations/nlq_train.json')
all_queries = []

# Only the file read lives inside the `try`, so the "not found" message below
# can only be triggered by the annotation file itself being absent, never by
# an unrelated error raised while flattening or displaying the data.
try:
    # JSON is UTF-8 by specification; do not rely on the platform default.
    with open(annotations_file, 'r', encoding='utf-8') as f:
        nlq_data = json.load(f)
except FileNotFoundError:
    print("ERROR: Annotation file not found. Please ensure Cell 1 was executed successfully.")
else:
    print(f"Successfully loaded '{annotations_file}'")

    # Flatten the nested videos -> clips -> annotations -> language_queries
    # structure into one record per language query.
    for video in nlq_data.get('videos', []):
        for clip in video.get('clips', []):
            # Full duration of the parent clip; a missing boundary counts as 0.
            clip_duration = clip.get('video_end_sec', 0) - clip.get('video_start_sec', 0)

            for annotation in clip.get('annotations', []):
                for query in annotation.get('language_queries', []):
                    # The query dict is tagged in place (so `nlq_data` also
                    # carries these keys) with the context of its parents.
                    query['video_uid'] = video['video_uid']
                    query['clip_uid'] = clip['clip_uid']
                    query['clip_duration_sec'] = clip_duration  # Add parent clip's duration
                    all_queries.append(query)

    # Create the DataFrame: one row per language query.
    df_queries = pd.DataFrame(all_queries)
    print(f"\nSuccessfully created DataFrame with {len(df_queries)} queries.")
    display(df_queries.head())

# 3. Exploratory Data Analysis (EDA) - Template Distribution

As a first step in our exploratory data analysis, we investigate the composition of the training set by analyzing the distribution of query templates. The Ego4D-NLQ benchmark defines 13 query templates across three main categories (Objects, Place, People). By visualizing these counts, we can gain insight into which types of questions are most prevalent in the dataset, which can inform our modeling and data augmentation strategies.

In [None]:

import matplotlib.pyplot as plt
import seaborn as sns

# This cell needs the DataFrame built by the annotation-loading cell; bail out
# with a hint if it is not there yet.
if 'df_queries' not in locals():
    print("ERROR: DataFrame 'df_queries' not found. Please run Cell 2 first.")
else:
    sns.set_theme(style="whitegrid")

    # How many queries use each template.
    template_counts = df_queries['template'].value_counts()

    # Templates go on the y-axis (horizontal bars) so the long labels stay legible.
    plt.figure(figsize=(10, 8))
    # NOTE(review): recent seaborn releases may warn about `palette` being
    # passed without `hue` - confirm against the installed version.
    sns.barplot(
        x=template_counts.values,
        y=template_counts.index,
        palette="viridis",
    )

    plt.title('Distribution of Query Templates in the Training Set', fontsize=16)
    plt.xlabel('Number of Queries', fontsize=12)
    plt.ylabel('Query Template', fontsize=12)
    plt.tight_layout()  # Leave room for the template labels
    plt.show()

    # Exact numbers, for quoting in the report.
    print("\nQuery counts per template:")
    print(template_counts)

# 4. EDA - Analysis of Answer Segment Durations

Another crucial aspect of the dataset is the temporal duration of the ground-truth "answer" segments. This analysis helps us understand whether the model needs to localize short, precise moments or longer, drawn-out activities. We calculate the duration for each query and visualize its distribution using a histogram. This can inform decisions about the model's temporal attention mechanisms and loss function design.

In [None]:
import numpy as np

if 'df_queries' not in locals():
    print("ERROR: DataFrame 'df_queries' not found. Please run Cell 2 first.")
else:
    # Length of every answer segment: end timestamp minus start timestamp.
    df_queries['answer_duration_sec'] = df_queries['video_end_sec'].sub(df_queries['video_start_sec'])

    # --- Histogram of the durations ---
    plt.figure(figsize=(12, 6))
    sns.histplot(df_queries['answer_duration_sec'], bins=50, kde=True)

    # Mark the median (less sensitive to outliers than the mean).
    median_duration = df_queries['answer_duration_sec'].median()
    plt.axvline(
        median_duration,
        color='red',
        linestyle='--',
        linewidth=2,
        label=f'Median: {median_duration:.2f}s',
    )

    plt.title('Distribution of Answer Segment Durations', fontsize=16)
    plt.xlabel('Duration (seconds)', fontsize=12)
    plt.ylabel('Frequency', fontsize=12)
    plt.legend()
    # Tip: for a long-tailed distribution, a log x-axis
    # (plt.xscale('log')) can be switched on here.
    plt.show()

    # Summary statistics of the durations.
    print("\nStatistics for Answer Segment Durations (in seconds):")
    display(df_queries['answer_duration_sec'].describe())

# 5. EDA - Analysis of Input Clip Durations

To fully understand the task's complexity, we also analyze the duration of the input video clips that contain the queries. A large discrepancy between the clip duration and the answer duration signifies a more challenging localization task (i.e., finding a "needle in a haystack"). We examine the distribution of these clip durations to understand the typical length of context the model must process.

In [None]:
if 'df_queries' not in locals():
    print("ERROR: DataFrame 'df_queries' not found. Please run Cell 2 first.")
else:
    # df_queries has one row per query, so a clip appears once for every query
    # it contains; keep a single row per clip_uid before looking at durations.
    unique_clips = df_queries.drop_duplicates(subset=['clip_uid'])

    plt.figure(figsize=(12, 6))
    sns.histplot(unique_clips['clip_duration_sec'], bins=50, kde=True, color='purple')

    # Dashed red line at the median clip duration.
    median_clip_duration = unique_clips['clip_duration_sec'].median()
    plt.axvline(
        median_clip_duration,
        color='red',
        linestyle='--',
        linewidth=2,
        label=f'Median: {median_clip_duration:.2f}s',
    )

    plt.title('Distribution of Input Clip Durations', fontsize=16)
    plt.xlabel('Duration (seconds)', fontsize=12)
    plt.ylabel('Frequency (of unique clips)', fontsize=12)
    plt.legend()
    plt.show()

    # Summary statistics over the de-duplicated clips.
    print("\nStatistics for Input Clip Durations (in seconds):")
    display(unique_clips['clip_duration_sec'].describe())

# 6. EDA - Analysis of Answer Segment Position (Ground Truth Distribution)

To further understand the dataset's characteristics, we analyze the temporal position of the ground-truth answer segments within their parent video clips. For each answer we compute a normalized start time, i.e. its clip-relative start (`clip_start_sec`) divided by the duration of the parent clip, which yields a value between 0.0 and 1.0, where 0.0 is the beginning of the clip and 1.0 is the end.

Visualizing the distribution of these normalized start times reveals any potential temporal bias. For instance, a peak near 0.0 would indicate that answers are frequently located at the beginning of the clips, which is valuable information for the model.

In [None]:
if 'df_queries' not in locals():
    print("ERROR: DataFrame 'df_queries' not found. Please run Cell 2 first.")
else:
    # Keep only rows with a strictly positive clip duration so the division
    # below never has a zero denominator; work on a copy so df_queries itself
    # does not receive the new column.
    valid_clips = df_queries.loc[df_queries['clip_duration_sec'] > 0].copy()

    # Answer start expressed as a fraction of the parent clip's duration.
    valid_clips['normalized_start'] = valid_clips['clip_start_sec'].div(valid_clips['clip_duration_sec'])

    # --- Histogram of the normalized start positions ---
    plt.figure(figsize=(12, 6))
    sns.histplot(valid_clips['normalized_start'], bins=50, kde=True, color='teal')

    plt.title('Distribution of Normalized Answer Start Times', fontsize=16)
    plt.xlabel('Normalized Start Position (0.0 = Clip Start, 1.0 = Clip End)', fontsize=12)
    plt.ylabel('Frequency', fontsize=12)
    plt.xlim(0, 1)  # Restrict the visible x-range to [0, 1]
    plt.show()

    # Summary statistics of the normalized start positions.
    print("\nStatistics for Normalized Answer Start Times:")
    display(valid_clips['normalized_start'].describe())