## 1. Environment Setup

In [None]:
# Clone the repository
!git clone https://github.com/T-Larm/aml-2025-mistake-detection-gp.git

In [None]:
%cd aml-2025-mistake-detection-gp
!git pull origin main
!git submodule update --init --recursive

In [None]:
# Install requirements
!pip install -r requirements.txt

In [None]:
# Mount Google Drive at the location the Drive-based paths in this notebook point to
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

## 2. Path Configuration

**⚠️ Modify these paths according to your Google Drive structure!**

In [None]:
import os

# ================= PATH CONFIGURATION =================
# Modify these paths according to your Google Drive!

# Project root (in Colab)
PROJECT_ROOT = "/content/aml-2025-mistake-detection-gp"

# Annotations (from the cloned repo)
ANNOTATIONS_PATH = os.path.join(PROJECT_ROOT, "annotations/complete_step_annotations.json")

# Split file
SPLIT_FILE = os.path.join(PROJECT_ROOT, "er_annotations/recordings_combined_splits.json")

# EgoVLP features on Google Drive
# ⚠️ MODIFY THIS PATH according to your Drive structure!
EGOVLP_FEATURES_DIR = "/content/drive/MyDrive/AMLproject/Captain_Cook_dataset/features/segments/egovlp"

# ActionFormer predictions (if using Route B)
# Set to None if not using predicted boundaries
ACTIONFORMER_PREDICTIONS_PATH = None  # e.g., "/content/drive/MyDrive/.../actionformer_predictions.json"

# Output directory (save results to Drive for persistence)
OUTPUT_DIR = "/content/drive/MyDrive/AMLproject/extension_outputs"

# Refuse to create the output directory under the Drive mount point while Drive is
# not mounted: os.makedirs would silently build a local folder instead, so nothing
# would be persisted and the mount point would no longer be empty for a later mount.
# A missing or empty mount point is taken to mean "not mounted".
_drive_mount_point = "/content/drive"
if OUTPUT_DIR.startswith(_drive_mount_point + "/") and not (
    os.path.isdir(_drive_mount_point) and os.listdir(_drive_mount_point)
):
    raise RuntimeError(
        f"Google Drive does not appear to be mounted at {_drive_mount_point}; "
        "run the 'Mount Google Drive' cell before this one."
    )
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Echo every configured path so a wrong setting is visible immediately.
print("=== Path Configuration ===")
print(f"Project root: {PROJECT_ROOT}")
print(f"Annotations: {ANNOTATIONS_PATH}")
print(f"Split file: {SPLIT_FILE}")
print(f"EgoVLP features: {EGOVLP_FEATURES_DIR}")
print(f"ActionFormer predictions: {ACTIONFORMER_PREDICTIONS_PATH}")
print(f"Output dir: {OUTPUT_DIR}")

In [None]:
# Verify paths exist
print("=== Verifying Paths ===")

# Check annotations (must be a regular file)
if os.path.isfile(ANNOTATIONS_PATH):
    print("✅ Annotations file found")
else:
    print(f"❌ Annotations file NOT found: {ANNOTATIONS_PATH}")

# Check EgoVLP features (must be a directory, because it is listed right below)
if os.path.isdir(EGOVLP_FEATURES_DIR):
    files = os.listdir(EGOVLP_FEATURES_DIR)
    # Sorted so the sample shown is the same on every run.
    npz_files = sorted(f for f in files if f.endswith('.npz'))
    print(f"✅ EgoVLP features found: {len(npz_files)} .npz files")
    print(f"   Sample files: {npz_files[:5]}")
else:
    print(f"❌ EgoVLP features NOT found: {EGOVLP_FEATURES_DIR}")

# Check split file (must be a regular file)
if os.path.isfile(SPLIT_FILE):
    print("✅ Split file found")
else:
    print(f"❌ Split file NOT found: {SPLIT_FILE}")

In [None]:
# Check EgoVLP feature file structure
import numpy as np

# Find a sample file. Sorted so the same file is inspected on every run; a missing
# directory yields an empty list instead of an exception.
if os.path.isdir(EGOVLP_FEATURES_DIR):
    sample_files = sorted(f for f in os.listdir(EGOVLP_FEATURES_DIR) if f.endswith('.npz'))[:1]
else:
    sample_files = []

if sample_files:
    sample_path = os.path.join(EGOVLP_FEATURES_DIR, sample_files[0])
    # np.load on a .npz returns an NpzFile that keeps the archive open;
    # the context manager closes it once the inspection is done.
    with np.load(sample_path) as data:
        print(f"Sample file: {sample_files[0]}")
        print(f"Keys: {list(data.keys())}")
        for key in data.keys():
            # Arrays are read from the archive on each lookup, so fetch each one once.
            array = data[key]
            print(f"  {key}: shape = {array.shape}, dtype = {array.dtype}")
else:
    print(f"No .npz feature files found in: {EGOVLP_FEATURES_DIR}")

## 3. Load Step Localization Module

In [None]:
import sys

# Make the cloned repository importable. The membership test keeps sys.path free
# of duplicate entries when this cell is run more than once.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

from extension.step_localization import (
    StepLocalizer,
    PredictedBoundaryLocalizer,
    prepare_dataset_for_task_verification,
    compare_gt_vs_predicted
)
import json
import numpy as np

print("✅ Step localization module loaded successfully!")

## 4. Route A: Ground Truth Boundaries

This is the **upper-bound** baseline: it uses the perfect step boundaries taken from the annotations.

In [None]:
# Build the ground-truth localizer from the annotation file and the EgoVLP feature directory
gt_localizer_kwargs = dict(
    annotations_path=ANNOTATIONS_PATH,
    features_dir=EGOVLP_FEATURES_DIR,
    fps=1.0,  # EgoVLP features are extracted at 1 FPS
    feature_key='arr_0',
)
gt_localizer = StepLocalizer(**gt_localizer_kwargs)

In [None]:
# Test with a single video
# Find a video that has both annotations and features
# JSON is read as UTF-8 explicitly rather than with the platform default encoding.
with open(ANNOTATIONS_PATH, 'r', encoding='utf-8') as annotations_file:
    annotations = json.load(annotations_file)

# Recording IDs that have a feature file.
# The ID is the first two underscore-separated fields of the filename:
# "9_8_360p_224.mp4_1s_1s.npz" -> "9_8"
available_features = {
    '_'.join(feature_name.split('_')[:2])
    for feature_name in os.listdir(EGOVLP_FEATURES_DIR)
    if feature_name.endswith('.npz')
}

# Find videos with both annotations and features
annotated_ids = set(annotations.keys())
common_ids = annotated_ids & available_features
print(f"Videos with both annotations and features: {len(common_ids)}")
# A set has no guaranteed iteration order, so sort to show a reproducible sample.
print(f"Sample IDs: {sorted(common_ids)[:10]}")

In [None]:
# Process a single video
# Fail with an explicit message when there is nothing to test, instead of an IndexError.
if not common_ids:
    raise RuntimeError(
        "No video has both annotations and features - check the path configuration."
    )

# Take the first ID in sorted order: a set has no guaranteed iteration order, so
# list(common_ids)[0] could pick a different video in every kernel session.
test_id = sorted(common_ids)[0]
print(f"\n=== Processing video: {test_id} ===")

video_data = gt_localizer.process_video(test_id)

if video_data:
    print(f"\nVideo: {video_data.recording_id}")
    print(f"Activity: {video_data.activity_name}")
    print(f"Number of steps: {len(video_data.steps)}")
    print(f"Video label (0=correct, 1=has errors): {video_data.video_label}")

    print("\nSteps:")
    for i, step in enumerate(video_data.steps, start=1):
        error_str = "❌ ERROR" if step.has_errors else "✓"
        print(f"  [{i}] Step {step.step_id}: {step.start_time:.1f}s - {step.end_time:.1f}s {error_str}")
        print(f"       {step.description[:60]}...")
        print(f"       Embedding shape: {step.embedding.shape}")
else:
    # process_video returned a falsy value; say so rather than printing nothing.
    print(f"No data returned for video: {test_id}")

### 4.1 Process All Available Videos (Route A)

In [None]:
# Process all videos that have features
print(f"Processing {len(common_ids)} videos with GT boundaries...")

# The IDs are passed in sorted order so they are submitted identically on every run;
# iterating the set directly gives an order that can differ between kernel sessions.
gt_results = gt_localizer.process_all_videos(sorted(common_ids))

In [None]:
# Statistics
# Per-video step counts and video-level labels (the sums below treat 1 as "has errors").
num_steps_list = [len(v.steps) for v in gt_results.values()]
labels = [v.video_label for v in gt_results.values()]

print("\n=== Route A Statistics (GT Boundaries) ===")
print(f"Total videos processed: {len(gt_results)}")
if labels:
    num_videos = len(labels)
    num_with_errors = sum(labels)
    num_without_errors = num_videos - num_with_errors
    print(f"Videos with errors: {num_with_errors} ({num_with_errors/num_videos*100:.1f}%)")
    print(f"Videos without errors: {num_without_errors} ({num_without_errors/num_videos*100:.1f}%)")
    print(f"Avg steps per video: {np.mean(num_steps_list):.1f}")
    print(f"Min/Max steps: {min(num_steps_list)} / {max(num_steps_list)}")
else:
    # Percentages and min/max are undefined for an empty result set.
    print("No videos were processed - no statistics to report.")

### 4.2 Prepare Dataset for Substep 2

In [None]:
# Every video is padded to the step count of the longest one
step_counts = [len(vd.steps) for vd in gt_results.values()]
max_steps = max(step_counts)
print(f"Max steps in dataset: {max_steps}")

# Collect one (id, embeddings, mask, label) record per video
records = []
for recording_id, video_data in gt_results.items():
    embeddings, mask, _ = gt_localizer.get_step_embeddings_matrix(
        video_data, pad_to_length=max_steps
    )
    records.append((recording_id, embeddings, mask, video_data.video_label))

# Split the records back into the parallel lists used below
all_ids = [record[0] for record in records]
all_embeddings = [record[1] for record in records]
all_masks = [record[2] for record in records]
all_labels = [record[3] for record in records]

# Stack the per-video arrays along a new leading axis (one entry per video).
# Presumably embeddings are (N, max_steps, feature_dim) and masks (N, max_steps) - TODO confirm.
gt_dataset = dict(
    embeddings=np.stack(all_embeddings, axis=0),
    labels=np.array(all_labels),
    masks=np.stack(all_masks, axis=0),
    recording_ids=all_ids,
    max_steps=max_steps,
)

print(f"\n=== Dataset Ready for Substep 2 ===")
for field_name, field_label in (("embeddings", "Embeddings"), ("labels", "Labels"), ("masks", "Masks")):
    print(f"{field_label} shape: {gt_dataset[field_name].shape}")

In [None]:
# Save dataset to Google Drive
output_path = os.path.join(OUTPUT_DIR, "gt_step_embeddings.npz")
np.savez(
    output_path,
    embeddings=gt_dataset['embeddings'],
    labels=gt_dataset['labels'],
    masks=gt_dataset['masks'],
    # Stored as a unicode string array: an object array would be pickled inside the
    # archive and could only be read back with np.load(..., allow_pickle=True).
    recording_ids=np.array(gt_dataset['recording_ids'], dtype=str),
    max_steps=gt_dataset['max_steps']
)
print(f"✅ Dataset saved to: {output_path}")

## 5. Route B: Predicted Boundaries (ActionFormer)

This evaluates the **end-to-end system** using step boundaries predicted by ActionFormer.

**⚠️ Skip this section if you don't have ActionFormer predictions yet.**

In [None]:
# Only run if you have ActionFormer predictions
USE_PREDICTED_BOUNDARIES = False  # Set to True when you have predictions

# Route B needs both the switch above and a predictions file path.
if USE_PREDICTED_BOUNDARIES and ACTIONFORMER_PREDICTIONS_PATH:
    pred_localizer = PredictedBoundaryLocalizer(
        features_dir=EGOVLP_FEATURES_DIR,
        predictions_path=ACTIONFORMER_PREDICTIONS_PATH,
        fps=1.0,
        confidence_threshold=0.3,  # Filter low confidence predictions
        nms_threshold=0.5,          # Remove overlapping predictions
        max_predictions=30          # Limit max steps per video
    )
    print("✅ Predicted boundary localizer initialized")
elif USE_PREDICTED_BOUNDARIES:
    # The switch is on but no file was configured: name the actual cause.
    print("⚠️ Skipping Route B - USE_PREDICTED_BOUNDARIES is True but ACTIONFORMER_PREDICTIONS_PATH is not set")
else:
    print("⚠️ Skipping Route B - No ActionFormer predictions available")

In [None]:
# Example: If you have predictions in a different format, you can set them manually
# The path is checked as well as the switch: with the switch on but no path configured,
# no predicted-boundary localizer was created and there is no file to open.
if USE_PREDICTED_BOUNDARIES and ACTIONFORMER_PREDICTIONS_PATH:
    # Example format for ActionFormer predictions:
    # {
    #     "9_8": [
    #         {"start": 0, "end": 70, "confidence": 0.8, "label": 1},
    #         {"start": 72, "end": 96, "confidence": 0.75, "label": 2},
    #         ...
    #     ]
    # }

    # Load your predictions
    with open(ACTIONFORMER_PREDICTIONS_PATH, 'r') as f:
        predictions = json.load(f)

    pred_localizer.set_predictions(predictions)

    # Process videos (IDs sorted so they are submitted in the same order on every run)
    pred_results = pred_localizer.process_all_videos(sorted(common_ids))

    print("\n=== Route B Statistics (Predicted Boundaries) ===")
    print(f"Total videos processed: {len(pred_results)}")

### 5.1 Compare GT vs Predicted (when available)

In [None]:
# Compare the two routes only when Route B was actually set up: the predicted-boundary
# localizer exists only if both the switch and the predictions path are set.
if USE_PREDICTED_BOUNDARIES and ACTIONFORMER_PREDICTIONS_PATH:
    comparison = compare_gt_vs_predicted(
        gt_localizer,
        pred_localizer,
        sorted(common_ids)  # sorted so the IDs are passed in the same order on every run
    )

    # Save comparison
    comparison_path = os.path.join(OUTPUT_DIR, "gt_vs_predicted_comparison.json")
    with open(comparison_path, 'w') as f:
        json.dump(comparison, f, indent=2)
    print(f"\n✅ Comparison saved to: {comparison_path}")

## 6. Summary & Next Steps

### What we have now:
1. **GT Step Embeddings** (`gt_step_embeddings.npz`)
   - Shape: `(N, max_steps, 256)`
   - Ready for Substep 2 (Task Verification)

### Next Steps:
1. **Substep 2**: Train a Transformer classifier on step embeddings to predict video-level correctness
2. **Substep 3**: Encode task graph nodes with EgoVLP text encoder, match with visual features
3. **Substep 4**: Train GNN classifier on the matched task graph

In [None]:
# Summary
print("="*60)
print("Extension Substep 1 Complete!")
print("="*60)
print(f"\n📊 Results saved to: {OUTPUT_DIR}")
print("\n📁 Files created:")
# Sorted for a stable listing.
for f in sorted(os.listdir(OUTPUT_DIR)):
    fpath = os.path.join(OUTPUT_DIR, f)
    if not os.path.isfile(fpath):
        # Only regular files are listed: getsize on a directory does not report the
        # size of its contents.
        continue
    size = os.path.getsize(fpath) / (1024*1024)  # MB
    print(f"   - {f} ({size:.2f} MB)")