## 1. Environment Setup

## Install Python 3.10

### Subtask:
Install Python 3.10 and its necessary dependencies using apt-get.


In [None]:
# Refresh the apt package index, then install the Python 3.10 interpreter
# together with its development headers (python3.10-dev) and venv module.
print("Installing Python 3.10...")
!sudo apt-get update -y
# NOTE(review): unlike the update above, this call has no `sudo`; presumably
# the notebook already runs as root (as on Colab) — confirm.
!apt-get install python3.10 python3.10-dev python3.10-venv -y
print("Python 3.10 installation complete.")

In [None]:
# Make the `python3` command resolve to Python 3.10 via update-alternatives.
# Step 1: register /usr/bin/python3.10 as an alternative for python3 (priority 1).
!update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1

# Step 2: explicitly select python3.10 so the choice does not rely on priorities.
# NOTE(review): this changes what `python3` means in shell commands; the
# already-running notebook kernel presumably keeps its own interpreter — confirm.
!update-alternatives --set python3 /usr/bin/python3.10

In [None]:
# Install the distribution's pip package for Python 3 through apt.
# Using apt-get is the recommended way for system-wide pip in Debian/Ubuntu.
# NOTE(review): whether the apt-provided pip targets Python 3.10 depends on the
# distribution's default interpreter — confirm.
!apt-get install python3-pip -y

In [None]:
# Bootstrap pip for whichever interpreter `python3` resolves to (Python 3.10
# after the update-alternatives cell) using the standard-library ensurepip.
!python3 -m ensurepip --upgrade
# Bring pip, setuptools and wheel up to date for that interpreter.
!python3 -m pip install --upgrade pip setuptools wheel

In [None]:
# Print the version that the `python3` command now resolves to.
!python3 --version
# Output recorded by the author: Python 3.10.12
# After verifying, restart the Colab runtime before continuing.

In [None]:
# Clone the project repository into the current working directory.
!git clone https://github.com/T-Larm/aml-2025-mistake-detection-gp.git

In [None]:
# Enter the cloned repository, update it from `main`, and fetch its submodules.
# The %cd path is relative: run this once, from the directory the clone was made in.
%cd aml-2025-mistake-detection-gp
!git pull origin main
!git submodule update --init --recursive

In [None]:
# Install the project's Python dependencies.
# The relative path assumes the working directory is the repository root
# (set by the preceding %cd cell).
!pip install -r requirements.txt

In [None]:
# Mount Google Drive at /content/drive so the Drive-based feature, checkpoint
# and output paths used by this notebook are reachable.
from google.colab import drive
drive.mount('/content/drive')

## 2. Path Configuration

**⚠️ Modify these paths according to your Google Drive structure!**

In [None]:
import os

# ================= PATH CONFIGURATION =================
# Adjust the Drive locations below to match your own Google Drive layout.

# Root of the cloned project inside the Colab VM
PROJECT_ROOT = "/content/aml-2025-mistake-detection-gp"

# Files that ship with the repository. The annotations come from the submodule,
# at annotations/annotation_json/complete_step_annotations.json
ANNOTATIONS_PATH = os.path.join(PROJECT_ROOT, "annotations/annotation_json/complete_step_annotations.json")
SPLIT_FILE = os.path.join(PROJECT_ROOT, "er_annotations/recordings_combined_splits.json")

# EgoVLP features stored on Google Drive -- change this to match your Drive
EGOVLP_FEATURES_DIR = "/content/drive/MyDrive/AMLproject/our_features/gopro/segments/egovlp"

# Results are written to Drive so they outlive the Colab session
OUTPUT_DIR = "/content/drive/MyDrive/AMLproject/extension1_outputs"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Echo the configured locations, one "<caption>: <path>" line each
print("=== Path Configuration ===")
for caption, location in (
    ("Project root", PROJECT_ROOT),
    ("Annotations", ANNOTATIONS_PATH),
    ("EgoVLP features", EGOVLP_FEATURES_DIR),
    ("Output dir", OUTPUT_DIR),
):
    print(f"{caption}: {location}")

In [None]:
# Report, for each configured input, whether it is present on disk.
print("=== Verifying Paths ===")

# Annotation JSON
if not os.path.exists(ANNOTATIONS_PATH):
    print(f"‚ùå Annotations file NOT found: {ANNOTATIONS_PATH}")
else:
    print(f"‚úÖ Annotations file found")

# EgoVLP feature directory: count the .npz archives and show the first few
if not os.path.exists(EGOVLP_FEATURES_DIR):
    print(f"‚ùå EgoVLP features NOT found: {EGOVLP_FEATURES_DIR}")
else:
    files = os.listdir(EGOVLP_FEATURES_DIR)
    npz_files = list(filter(lambda name: name.endswith('.npz'), files))
    print(f"‚úÖ EgoVLP features found: {len(npz_files)} .npz files")
    print(f"   Sample files: {npz_files[:5]}")

# Train/val/test split definition
if not os.path.exists(SPLIT_FILE):
    print(f"‚ùå Split file NOT found: {SPLIT_FILE}")
else:
    print(f"‚úÖ Split file found")

In [None]:
# Inspect one EgoVLP feature archive: list the arrays it contains together
# with their shape and dtype.
import numpy as np

if os.path.isdir(EGOVLP_FEATURES_DIR):
    # Sorted so the same archive is inspected on every run.
    sample_files = sorted(
        f for f in os.listdir(EGOVLP_FEATURES_DIR) if f.endswith('.npz')
    )[:1]
else:
    # The directory check in the previous cell only prints; do not crash here either.
    print(f"EgoVLP features directory not found: {EGOVLP_FEATURES_DIR}")
    sample_files = []

if sample_files:
    sample_path = os.path.join(EGOVLP_FEATURES_DIR, sample_files[0])
    # np.load on an .npz returns an NpzFile that keeps the archive open;
    # the context manager closes it once the summary has been printed.
    with np.load(sample_path) as data:
        print(f"Sample file: {sample_files[0]}")
        print(f"Keys: {list(data.keys())}")
        for key in data.keys():
            array = data[key]  # each access re-reads the member, so read it once
            print(f"  {key}: shape = {array.shape}, dtype = {array.dtype}")

## 3. Load Step Localization Module

In [None]:
# Make the cloned project importable and load the step-localization helpers.
import sys
# Note: each run of this cell appends PROJECT_ROOT to sys.path once more.
sys.path.append(PROJECT_ROOT)

from extension.step_localization import (
    StepLocalizer,
    PredictedBoundaryLocalizer,
    prepare_dataset_for_task_verification,
    compare_gt_vs_predicted
)
import json
import numpy as np

print("‚úÖ Step localization module loaded successfully!")

## 4. Route A: Ground Truth Boundaries

This is the **upper bound** baseline. Using perfect step boundaries from annotations.

In [None]:
# Route A localizer: built from the annotation file and the EgoVLP feature
# directory configured above.
gt_localizer_kwargs = dict(
    annotations_path=ANNOTATIONS_PATH,
    features_dir=EGOVLP_FEATURES_DIR,
    fps=1.0,  # EgoVLP features are extracted at 1 FPS
    feature_key='arr_0',
)
gt_localizer = StepLocalizer(**gt_localizer_kwargs)

In [None]:
# Determine which recordings have both step annotations and EgoVLP features.
with open(ANNOTATIONS_PATH, 'r') as annotations_file:
    annotations = json.load(annotations_file)

# Recording ids that have a feature archive on disk.
# Filename pattern: "9_8_360p_224.mp4_1s_1s.npz" -> recording id "9_8"
available_features = {
    '_'.join(filename.split('_')[:2])
    for filename in os.listdir(EGOVLP_FEATURES_DIR)
    if filename.endswith('.npz')
}

# Recordings present in both sources
annotated_ids = set(annotations.keys())
common_ids = annotated_ids.intersection(available_features)
print(f"Videos with both annotations and features: {len(common_ids)}")
# A set of strings iterates in an order that can change between kernel
# restarts; sort so the printed sample is the same on every run.
print(f"Sample IDs: {sorted(common_ids)[:10]}")

In [None]:
# Smoke test: run the GT localizer on one recording and print its steps.
if not common_ids:
    raise RuntimeError(
        "No recordings have both annotations and features; check the configured paths."
    )

# Sorted so the same recording is picked on every run (set order is not stable
# across kernel restarts).
test_id = sorted(common_ids)[0]
print(f"\n=== Processing video: {test_id} ===")

video_data = gt_localizer.process_video(test_id)

if video_data:
    print(f"\nVideo: {video_data.recording_id}")
    print(f"Activity: {video_data.activity_name}")
    print(f"Number of steps: {len(video_data.steps)}")
    print(f"Video label (0=correct, 1=has errors): {video_data.video_label}")

    print("\nSteps:")
    for i, step in enumerate(video_data.steps):
        error_str = "‚ùå ERROR" if step.has_errors else "‚úì"
        print(f"  [{i+1}] Step {step.step_id}: {step.start_time:.1f}s - {step.end_time:.1f}s {error_str}")
        print(f"       {step.description[:60]}...")
        print(f"       Embedding shape: {step.embedding.shape}")
else:
    # Make the empty result visible instead of printing nothing.
    print(f"No data returned for video {test_id}")

### 4.1 Process All Available Videos (Route A)

In [None]:
# Run the GT localizer over every recording that has features.
print(f"Processing {len(common_ids)} videos with GT boundaries...")

# Pass the ids in sorted order rather than set order, which can change between
# kernel restarts. Presumably the result dict (and hence the row order of the
# saved dataset) follows the input order — confirm against process_all_videos.
gt_results = gt_localizer.process_all_videos(sorted(common_ids))

In [None]:
# Summary statistics for Route A (ground-truth boundaries).
num_steps_list = [len(v.steps) for v in gt_results.values()]
labels = [v.video_label for v in gt_results.values()]

print("\n=== Route A Statistics (GT Boundaries) ===")
print(f"Total videos processed: {len(gt_results)}")
if labels:
    n_videos = len(labels)
    n_with_errors = sum(labels)  # labels are 0/1, so the sum counts error videos
    n_without_errors = n_videos - n_with_errors
    print(f"Videos with errors: {n_with_errors} ({n_with_errors/n_videos*100:.1f}%)")
    print(f"Videos without errors: {n_without_errors} ({n_without_errors/n_videos*100:.1f}%)")
    print(f"Avg steps per video: {np.mean(num_steps_list):.1f}")
    print(f"Min/Max steps: {min(num_steps_list)} / {max(num_steps_list)}")
else:
    # Percentages and min/max are undefined without any processed video.
    print("No videos were processed; statistics unavailable.")

### 4.2 Prepare Dataset for Substep 2

In [None]:
# Pack the per-video step embeddings into fixed-size arrays for Substep 2.

# Every video is padded to the longest step sequence in the dataset
max_steps = max(map(lambda vd: len(vd.steps), gt_results.values()))
print(f"Max steps in dataset: {max_steps}")

# Per-video accumulators, filled in the iteration order of gt_results
all_embeddings, all_labels, all_masks, all_ids = [], [], [], []

for recording_id, video_data in gt_results.items():
    # The localizer returns the padded embeddings, a mask and a third value
    # that is not needed here
    padded, valid, _ = gt_localizer.get_step_embeddings_matrix(
        video_data, pad_to_length=max_steps
    )
    all_ids.append(recording_id)
    all_labels.append(video_data.video_label)
    all_embeddings.append(padded)
    all_masks.append(valid)

# Stack the per-video pieces along a new leading axis
gt_dataset = dict(
    embeddings=np.stack(all_embeddings, axis=0),  # (N, max_steps, 256)
    labels=np.array(all_labels),                  # (N,)
    masks=np.stack(all_masks, axis=0),            # (N, max_steps)
    recording_ids=all_ids,
    max_steps=max_steps,
)

print(f"\n=== Dataset Ready for Substep 2 ===")
for caption, field in (("Embeddings", 'embeddings'), ("Labels", 'labels'), ("Masks", 'masks')):
    print(f"{caption} shape: {gt_dataset[field].shape}")

In [None]:
# Save the Route A dataset to Google Drive as a single .npz archive.
output_path = os.path.join(OUTPUT_DIR, "gt_step_embeddings.npz")
np.savez(
    output_path,
    embeddings=gt_dataset['embeddings'],
    labels=gt_dataset['labels'],
    masks=gt_dataset['masks'],
    # Stored as a fixed-width unicode array rather than dtype=object: object
    # arrays are pickled inside the archive and can then only be read back
    # with np.load(..., allow_pickle=True).
    recording_ids=np.array(gt_dataset['recording_ids'], dtype=str),
    max_steps=gt_dataset['max_steps']
)
print(f"‚úÖ Dataset saved to: {output_path}")

## 5. Route B: HiERO Model-based Boundaries

This evaluates the **end-to-end system** using step boundaries predicted by HiERO model.

Uses hierarchical clustering from HiERO for better step detection.

### 5.1 Setup HiERO Environment

**⚠️ IMPORTANT**: Run this cell ONCE at the beginning. It will install all HiERO dependencies from requirements.txt.

**Note**: If you see version conflicts (e.g., networkx==3.5), the installer will auto-resolve to a compatible version.

In [None]:
# Setup HiERO Environment (Fixed Dependencies)
import os

# Clone HiERO repository
if not os.path.exists('/content/HiERO'):
    !git clone https://github.com/T-Larm/HiERO_for_egovlp.git /content/HiERO
    print("‚úÖ HiERO repository cloned")
else:
    print("‚úÖ HiERO repository already exists")

print("\n" + "="*60)
print("Installing HiERO Environment (This may take 3-5 minutes)")
print("="*60)

# Install from HiERO's requirements.txt directly
# This ensures all dependencies are properly resolved
if os.path.exists('/content/HiERO/requirements.txt'):
    print("\nüì¶ Installing from HiERO requirements.txt...")
    print("   (networkx version will be auto-corrected if needed)\n")
    
    # Install with PyG extra index
    !pip install -q -r /content/HiERO/requirements.txt \
        -f https://data.pyg.org/whl/torch-2.4.0+cu124.html \
        --extra-index-url https://download.pytorch.org/whl/cu124 \
        || echo "‚ö†Ô∏è Some packages may have version conflicts, continuing..."
    
    print("\n‚úÖ HiERO requirements installed")
else:
    # Fallback: install core dependencies manually
    print("‚ö†Ô∏è No requirements.txt found, installing core dependencies...")
    !pip install -q torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1 --index-url https://download.pytorch.org/whl/cu124
    !pip install -q torch-geometric torch-scatter torch-sparse -f https://data.pyg.org/whl/torch-2.4.0+cu124.html
    !pip install -q einops torch_kmeans tqdm PyYAML networkx scikit-learn timm transformers

print("\n‚úÖ Environment setup complete!")
print("\nüìã Verify installation:")
!pip list | grep -E "torch|networkx|einops|sklearn"

In [None]:
# HiERO route switch and checkpoint location.
USE_HIERO = True  # Set to False to skip HiERO route

# Path of the HiERO checkpoint on Drive -- MODIFY THIS!
HIERO_CHECKPOINT = "/content/drive/MyDrive/AMLproject/hiero_egovlp/hiero_egovlp.pth"

# The route stays enabled only when the checkpoint file is actually present.
if not USE_HIERO:
    print("‚ö†Ô∏è Skipping HiERO route")
elif os.path.exists(HIERO_CHECKPOINT):
    print("‚úÖ HiERO route enabled")
    print(f"Checkpoint found: {HIERO_CHECKPOINT}")
    checkpoint_size = os.path.getsize(HIERO_CHECKPOINT) / (1024*1024)  # bytes -> MB
    print(f"Checkpoint size: {checkpoint_size:.1f} MB")
else:
    print(f"‚ùå Checkpoint not found: {HIERO_CHECKPOINT}")
    print("‚ö†Ô∏è Please update HIERO_CHECKPOINT path")
    USE_HIERO = False

### 5.2 Load HiERO Model

In [None]:
# Load the HiERO checkpoint, build the model from its config and load the
# weights. Sets USE_HIERO to False when the weights cannot be loaded.
if USE_HIERO:
    import sys
    # Put the HiERO checkout first on sys.path so `models.hiero` resolves to it.
    sys.path.insert(0, '/content/HiERO')

    import torch
    import yaml
    from collections import OrderedDict
    from pathlib import Path
    from models.hiero import HiERO
    from torch_geometric.data import Data, Batch

    # Check device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    # Load checkpoint onto the CPU first; the model is moved to `device` later.
    # SECURITY NOTE: torch.load unpickles the file, which can execute arbitrary
    # code — only load checkpoints obtained from a trusted source.
    checkpoint = torch.load(HIERO_CHECKPOINT, map_location='cpu')
    print(f"‚úÖ Checkpoint loaded")
    print(f"Checkpoint keys: {list(checkpoint.keys())}")

    # Config lookup order: checkpoint -> HiERO repo YAML -> built-in defaults
    if 'config' in checkpoint:
        config = checkpoint['config']
        print("‚úÖ Config loaded from checkpoint")
    else:
        config_path = '/content/HiERO/configs/egovlp.yaml'
        if os.path.exists(config_path):
            with open(config_path, 'r') as f:
                config = yaml.safe_load(f)
            print(f"‚úÖ Config loaded from {config_path}")
        else:
            print("‚ö†Ô∏è Using default config")
            config = {
                'model': {
                    'conv': {'name': 'TDGC', 'hidden_size': 256},
                    'k': 2.0,
                    'n_layers': 2,
                    'hidden_size': 256,
                    'depth': 3,
                    'dropout': 0.1,
                    'pool': 'batch_subsampling',
                    'n_clusters': 8,
                    'clustering_sample_points': 32,
                    'clustering_at_inference': False
                }
            }

    # Initialize HiERO model
    print("\nInitializing HiERO model...")
    model_config = config.get('model', {})

    # A bare string conv config is expanded to the dict form {'name': ...}
    if 'conv' in model_config:
        conv_config = model_config['conv']
        if isinstance(conv_config, str):
            model_config['conv'] = {'name': conv_config}

    hiero_model = HiERO(
        input_size=256,  # EgoVLP features are 256-dim
        **model_config
    )
    print(f"‚úÖ Model initialized")

    # Locate the state dict inside the checkpoint
    if 'state_dict' in checkpoint:
        state_dict = checkpoint['state_dict']
    elif 'model' in checkpoint:
        state_dict = checkpoint['model']
    else:
        state_dict = checkpoint

    # Strip a leading 'module.' (added by DataParallel) from parameter names.
    # Only the prefix is removed: str.replace would also delete 'module.'
    # occurring in the middle of a key. This is done outside the strict-load
    # `try` so that a failure here cannot fall through to a "successful"
    # non-strict load of a partially built dict.
    prefix = 'module.'
    try:
        new_state_dict = OrderedDict(
            (k[len(prefix):] if k.startswith(prefix) else k, v)
            for k, v in state_dict.items()
        )
    except Exception as e:
        print(f"‚ùå Failed to load weights: {e}")
        USE_HIERO = False
    else:
        try:
            hiero_model.load_state_dict(new_state_dict, strict=True)
            print("‚úÖ Model weights loaded (strict mode)")
        except Exception as e:
            print(f"‚ö†Ô∏è Strict loading failed: {e}")
            try:
                load_result = hiero_model.load_state_dict(new_state_dict, strict=False)
                print("‚úÖ Model weights loaded (non-strict mode)")
                # Non-strict loading silently skips mismatched names; report
                # them so a (partly) randomly initialised model is noticed.
                n_missing = len(load_result.missing_keys)
                n_unexpected = len(load_result.unexpected_keys)
                print(f"   Missing keys: {n_missing}, unexpected keys: {n_unexpected}")
                if n_missing:
                    print(f"   First missing keys: {load_result.missing_keys[:5]}")
                if n_unexpected:
                    print(f"   First unexpected keys: {load_result.unexpected_keys[:5]}")
            except Exception as e2:
                print(f"‚ùå Failed to load weights: {e2}")
                USE_HIERO = False

    if USE_HIERO:
        hiero_model = hiero_model.to(device)
        hiero_model.eval()

        # Count parameters
        total_params = sum(p.numel() for p in hiero_model.parameters())
        trainable_params = sum(p.numel() for p in hiero_model.parameters() if p.requires_grad)

        print(f"\n‚úÖ HiERO model ready!")
        print(f"   Total parameters: {total_params/1e6:.2f}M")
        print(f"   Trainable parameters: {trainable_params/1e6:.2f}M")
        print(f"   Device: {device}")

### 5.3 HiERO Step Detection Function

In [None]:
if USE_HIERO:
    from sklearn.cluster import SpectralClustering
    import torch.nn.functional as F

    def _labels_to_segments(labels, min_step_len):
        """Turn a per-index label sequence into contiguous (start, end) segments.

        Indices are inclusive. A label change closes the running segment only
        once that segment spans at least ``min_step_len`` entries; shorter runs
        keep absorbing the following entries. Every index ends up in exactly
        one segment: a tail shorter than ``min_step_len`` is appended to the
        previous segment, or forms the only segment if there is none.
        """
        n = len(labels)
        if n == 0:
            return []

        segments = []
        seg_start = 0
        seg_label = labels[0]
        for i in range(1, n):
            if labels[i] != seg_label and i - seg_start >= min_step_len:
                segments.append((seg_start, i - 1))
                seg_start = i
                seg_label = labels[i]

        if n - seg_start >= min_step_len or not segments:
            segments.append((seg_start, n - 1))
        else:
            # Tail too short to be a step of its own: extend the last segment
            # instead of dropping those entries.
            prev_start, _ = segments[-1]
            segments[-1] = (prev_start, n - 1)
        return segments

    def detect_steps_with_hiero(features, model, device, n_clusters='auto',
                                use_spectral=True, min_step_len=5):
        """
        Detect step boundaries using HiERO model.

        The features are passed through the model as a chain graph, the
        resulting per-node features are clustered, and runs of cluster labels
        are turned into temporal segments. If the forward pass raises, the
        input features are clustered instead.

        Args:
            features: (T, D) numpy array - EgoVLP features (D is 256 here)
            model: HiERO model
            device: torch device
            n_clusters: number of clusters ('auto' or int); clamped to the
                number of available feature rows
            use_spectral: use spectral clustering (like HiERO paper),
                otherwise KMeans
            min_step_len: minimum number of entries a segment must span
                before a label change may close it (default 5)

        Returns:
            boundaries: list of inclusive (start, end) index tuples. They index
                the rows of the clustered feature matrix.
                NOTE(review): these equal frame indices only if the model
                returns one row per input frame — confirm against HiERO's
                pooling behaviour.
            step_embeddings: (num_steps, feature_dim) array, the mean of the
                clustered features over each segment
        """
        T, D = features.shape
        if T == 0:
            # Nothing to segment
            return [], np.zeros((0, D))

        # Auto-estimate clusters based on video length
        if n_clusters == 'auto':
            # Heuristic: ~1 step per 30 seconds (at 1 FPS)
            n_clusters = max(3, min(T // 30, 15))
        # Cannot ask for more clusters than there are samples
        n_clusters = max(1, min(int(n_clusters), T))

        print(f"  Processing {T} frames ‚Üí {n_clusters} clusters")

        # Convert to torch
        x = torch.from_numpy(features).float().to(device)

        # Temporal chain graph: bidirectional edges between consecutive frames,
        # laid out as (i, i+1), (i+1, i), ... with shape (2, 2*(T-1)).
        src = torch.arange(T - 1, dtype=torch.long)
        dst = src + 1
        edge_index = torch.stack(
            [torch.stack([src, dst]), torch.stack([dst, src])], dim=2
        ).reshape(2, 2 * (T - 1)).contiguous().to(device)

        # Create PyG graph data (single graph: every node in batch 0)
        graph_data = Data(x=x, edge_index=edge_index)
        graph_data.batch = torch.zeros(T, dtype=torch.long, device=device)

        # HiERO forward pass
        with torch.no_grad():
            try:
                output = model(graph_data)

                # Extract features from output
                if isinstance(output, Data):
                    hiero_features = output.x
                elif isinstance(output, dict):
                    hiero_features = output.get('x', output.get('features', x))
                else:
                    hiero_features = output

                # Never keep more rows than input frames
                if len(hiero_features) > T:
                    hiero_features = hiero_features[:T]

                hiero_features = hiero_features.cpu().numpy()
                print(f"  ‚úì HiERO forward pass: {hiero_features.shape}")

            except Exception as e:
                # Deliberate best-effort: fall back to the raw input features
                print(f"  ‚ö†Ô∏è HiERO forward failed ({e}), using original features")
                hiero_features = features

        n_nodes = len(hiero_features)
        n_clusters = min(n_clusters, n_nodes)

        # Normalize features for clustering
        features_norm = hiero_features / (np.linalg.norm(hiero_features, axis=1, keepdims=True) + 1e-8)

        # Clustering on HiERO features
        if n_clusters < 2:
            # Too few samples to cluster: everything is one group
            labels = np.zeros(n_nodes, dtype=int)
        elif use_spectral:
            # Spectral clustering (as in HiERO paper) on cosine similarity.
            # A precomputed affinity must be non-negative, so negative
            # similarities are clipped to zero.
            affinity = np.clip(features_norm @ features_norm.T, 0.0, None)
            clustering = SpectralClustering(
                n_clusters=n_clusters,
                affinity='precomputed',
                random_state=42,
                assign_labels='kmeans'
            )
            labels = clustering.fit_predict(affinity)
            print(f"  ‚úì Spectral clustering done")
        else:
            # KMeans clustering
            from sklearn.cluster import KMeans
            kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
            labels = kmeans.fit_predict(hiero_features)
            print(f"  ‚úì KMeans clustering done")

        # Convert cluster labels to temporal segments covering every index
        boundaries = _labels_to_segments(labels, min_step_len)

        # Extract step-level embeddings (average pooling over each segment)
        if boundaries:
            step_embeddings = np.stack(
                [hiero_features[start:end + 1].mean(axis=0) for start, end in boundaries],
                axis=0,
            )
        else:
            step_embeddings = np.zeros((0, hiero_features.shape[-1]))

        print(f"  ‚úì Detected {len(boundaries)} steps")
        return boundaries, step_embeddings

    print("‚úÖ HiERO detection function defined")

### 5.4 Process Videos with HiERO

In [None]:
# Route B: run HiERO-based step detection on every recording in common_ids.
if USE_HIERO:
    from tqdm import tqdm

    hiero_results = {}   # recording id -> detection result dict
    failed_videos = []   # recording ids that could not be processed

    # Index the feature archives once by recording id, using the same filename
    # rule as the cell that collects the available features
    # ("9_8_360p_224.mp4_1s_1s.npz" -> "9_8"). A plain startswith(recording_id)
    # test would let "1_1" match "1_10_...", and listing the directory inside
    # the loop repeats the same work for every video.
    feature_file_by_id = {}
    for filename in sorted(os.listdir(EGOVLP_FEATURES_DIR)):
        if filename.endswith('.npz'):
            feature_file_by_id.setdefault('_'.join(filename.split('_')[:2]), filename)

    print(f"\nProcessing {len(common_ids)} videos with HiERO model...")
    print("=" * 60)

    # Sorted so videos are processed (and later stored) in the same order on every run
    for recording_id in tqdm(sorted(common_ids), desc="Processing videos"):
        try:
            feature_file = feature_file_by_id.get(recording_id)
            if feature_file is None:
                print(f"‚ö†Ô∏è No features found for {recording_id}")
                failed_videos.append(recording_id)
                continue

            # Load EgoVLP features; the context manager closes the archive
            feature_path = os.path.join(EGOVLP_FEATURES_DIR, feature_file)
            with np.load(feature_path) as data:
                # Try the known keys first, then fall back to the first array
                if 'arr_0' in data:
                    features = data['arr_0']
                elif 'features' in data:
                    features = data['features']
                else:
                    features = data[data.files[0]]

            # Detect steps using HiERO
            boundaries, step_embeddings = detect_steps_with_hiero(
                features,
                hiero_model,
                device,
                n_clusters='auto',
                use_spectral=True
            )

            # Video label from annotations: 1 if any annotated step has errors
            anno = annotations[recording_id]
            has_errors = any(step.get('has_errors', False) for step in anno.get('steps', []))
            video_label = 1 if has_errors else 0

            # Store results
            hiero_results[recording_id] = {
                'boundaries': boundaries,
                'step_embeddings': step_embeddings,
                'video_label': video_label,
                'activity_name': anno.get('activity_name', 'unknown'),
                'num_frames': len(features)
            }

        except Exception as e:
            # Best-effort: record the failure and continue with the next video
            print(f"\n‚ùå Error processing {recording_id}: {e}")
            failed_videos.append(recording_id)
            continue

    print("\n" + "=" * 60)
    print(f"‚úÖ Successfully processed: {len(hiero_results)} videos")
    if failed_videos:
        print(f"‚ùå Failed: {len(failed_videos)} videos")
        print(f"   Failed IDs: {failed_videos[:5]}{'...' if len(failed_videos) > 5 else ''}")

### 5.5 HiERO Statistics

In [None]:
# Summary statistics for Route B (HiERO boundaries).
if USE_HIERO:
    hiero_num_steps = [len(v['boundaries']) for v in hiero_results.values()]
    hiero_labels = [v['video_label'] for v in hiero_results.values()]

    print("\n=== Route B Statistics (HiERO Boundaries) ===")
    print(f"Total videos processed: {len(hiero_results)}")
    if hiero_labels:
        n_with_errors = sum(hiero_labels)  # labels are 0/1
        print(f"Videos with errors: {n_with_errors} ({n_with_errors/len(hiero_labels)*100:.1f}%)")
        print(f"Videos without errors: {len(hiero_labels) - n_with_errors}")
        print(f"Avg steps per video: {np.mean(hiero_num_steps):.1f}")
        print(f"Min/Max steps: {min(hiero_num_steps)} / {max(hiero_num_steps)}")
    else:
        # Every video failed: percentages and min/max are undefined.
        print("No videos were processed; statistics unavailable.")

### 5.6 Prepare HiERO Dataset for Substep 2

In [None]:
# Pack the HiERO step embeddings into fixed-size arrays for Substep 2.
if USE_HIERO:
    if not hiero_results:
        raise ValueError("No HiERO results to pack: no video was processed successfully.")

    # Every video is padded to the longest detected step sequence
    max_steps_hiero = max(len(v['boundaries']) for v in hiero_results.values())
    print(f"Max steps in HiERO dataset: {max_steps_hiero}")

    # Take the embedding width from the first non-empty result instead of
    # assuming 256, so a checkpoint with another hidden size still packs.
    embedding_dim = next(
        (v['step_embeddings'].shape[1] for v in hiero_results.values()
         if len(v['step_embeddings']) > 0),
        256,
    )

    hiero_embeddings = []
    hiero_masks = []
    hiero_labels = []
    hiero_ids = []

    for recording_id, video_result in hiero_results.items():
        step_emb = video_result['step_embeddings']  # (num_steps, embedding_dim)
        num_steps = len(step_emb)

        # Zero-pad to max_steps_hiero; the mask marks the real steps
        padded_emb = np.zeros((max_steps_hiero, embedding_dim), dtype=np.float32)
        mask = np.zeros(max_steps_hiero, dtype=bool)

        padded_emb[:num_steps] = step_emb
        mask[:num_steps] = True

        hiero_embeddings.append(padded_emb)
        hiero_masks.append(mask)
        hiero_labels.append(video_result['video_label'])
        hiero_ids.append(recording_id)

    # Stack into arrays
    hiero_dataset = {
        'embeddings': np.stack(hiero_embeddings, axis=0),  # (N, max_steps, embedding_dim)
        'labels': np.array(hiero_labels),                   # (N,)
        'masks': np.stack(hiero_masks, axis=0),             # (N, max_steps)
        'recording_ids': hiero_ids,
        'max_steps': max_steps_hiero
    }

    print(f"\n=== HiERO Dataset Ready for Substep 2 ===")
    print(f"Embeddings shape: {hiero_dataset['embeddings'].shape}")
    print(f"Labels shape: {hiero_dataset['labels'].shape}")
    print(f"Masks shape: {hiero_dataset['masks'].shape}")

In [None]:
# Persist the Route B outputs: the packed dataset (.npz) and the predicted
# boundaries (.json).
if USE_HIERO:
    # Save HiERO dataset
    hiero_output_path = os.path.join(OUTPUT_DIR, "hiero_step_embeddings.npz")
    np.savez(
        hiero_output_path,
        embeddings=hiero_dataset['embeddings'],
        labels=hiero_dataset['labels'],
        masks=hiero_dataset['masks'],
        # Stored as a fixed-width unicode array rather than dtype=object:
        # object arrays are pickled inside the archive and can then only be
        # read back with np.load(..., allow_pickle=True).
        recording_ids=np.array(hiero_dataset['recording_ids'], dtype=str),
        max_steps=hiero_dataset['max_steps']
    )
    print(f"‚úÖ HiERO dataset saved to: {hiero_output_path}")

    # Also save boundaries as JSON. Values are cast to built-in int so that
    # numpy integers do not break json.dump.
    boundaries_json = {
        rec_id: {
            'boundaries': [(int(s), int(e)) for s, e in data['boundaries']],
            'num_steps': len(data['boundaries']),
            'video_label': int(data['video_label']),
            'activity': data['activity_name']
        }
        for rec_id, data in hiero_results.items()
    }

    boundaries_path = os.path.join(OUTPUT_DIR, "hiero_step_boundaries.json")
    with open(boundaries_path, 'w') as f:
        json.dump(boundaries_json, f, indent=2)
    print(f"‚úÖ HiERO boundaries saved to: {boundaries_path}")

### 5.7 Compare GT vs HiERO

In [None]:
# Compare, per video, the number of GT steps with the number of HiERO steps.
if USE_HIERO:
    comparison = []

    for recording_id in common_ids:
        if recording_id not in hiero_results:
            continue

        gt_video = gt_results.get(recording_id)
        hiero_data = hiero_results.get(recording_id)

        if gt_video and hiero_data:
            # Distinct local names: `hiero_num_steps` is the per-video list
            # built by the statistics cell and read again by the summary cell,
            # so it must not be rebound to a single count here.
            n_gt_steps = len(gt_video.steps)
            n_hiero_steps = len(hiero_data['boundaries'])

            comparison.append({
                'recording_id': recording_id,
                'gt_steps': n_gt_steps,
                'hiero_steps': n_hiero_steps,
                'difference': n_hiero_steps - n_gt_steps,
                'video_label': hiero_data['video_label']
            })

    # Statistics
    differences = [c['difference'] for c in comparison]
    print(f"\n=== GT vs HiERO Comparison ===")
    print(f"Videos compared: {len(comparison)}")
    if differences:
        print(f"Average difference (HiERO - GT): {np.mean(differences):.2f} steps")
        print(f"Std dev: {np.std(differences):.2f}")
        print(f"Min/Max difference: {min(differences)} / {max(differences)}")
    else:
        # mean/min/max are undefined for an empty comparison
        print("No videos available in both routes; nothing to compare.")

    # Save comparison
    comparison_path = os.path.join(OUTPUT_DIR, "gt_vs_hiero_comparison.json")
    with open(comparison_path, 'w') as f:
        json.dump(comparison, f, indent=2)
    print(f"‚úÖ Comparison saved to: {comparison_path}")

## 6. Summary & Next Steps

### What we have now:
1. **GT Step Embeddings** (`gt_step_embeddings.npz`) - Upper bound baseline
   - Shape: `(N, max_steps, 256)`
   - Uses perfect step boundaries from annotations
   
2. **HiERO Step Embeddings** (`hiero_step_embeddings.npz`) - Predicted boundaries
   - Shape: `(N, max_steps, 256)`
   - Uses HiERO model for step detection
   - More realistic end-to-end system performance

3. **HiERO Boundaries** (`hiero_step_boundaries.json`)
   - Predicted step boundaries for each video
   - Ready for Substeps 3 & 4 (Task Graph matching)

### Next Steps:
1. **Substep 2**: Train Transformer classifier on step embeddings
   - Test with both GT (upper bound) and HiERO (realistic) embeddings
2. **Substep 3**: Encode task graph nodes, match with HiERO visual features
3. **Substep 4**: Train GNN classifier on matched task graph

In [None]:
# Final summary: list the files written to OUTPUT_DIR and recap both routes.
print("="*60)
print("Extension Substep 1 Complete!")
print("="*60)
print(f"\nüìä Results saved to: {OUTPUT_DIR}")
print(f"\nüìÅ Files created:")
for f in sorted(os.listdir(OUTPUT_DIR)):  # sorted for a stable listing
    fpath = os.path.join(OUTPUT_DIR, f)
    size = os.path.getsize(fpath) / (1024*1024)  # MB
    print(f"   - {f} ({size:.2f} MB)")

print(f"\nüéØ Summary:")
print(f"   - GT baseline: {len(gt_results)} videos (upper bound)")
if USE_HIERO:
    print(f"   - HiERO predictions: {len(hiero_results)} videos (realistic)")
    # Recompute the per-video step counts from hiero_results: the global
    # `hiero_num_steps` is rebound to a single integer by the GT-vs-HiERO
    # comparison cell, which would make this "average" the last video's count.
    hiero_step_counts = [len(v['boundaries']) for v in hiero_results.values()]
    hiero_avg_steps = np.mean(hiero_step_counts) if hiero_step_counts else float('nan')
    print(f"   - Average steps - GT: {np.mean(num_steps_list):.1f}, HiERO: {hiero_avg_steps:.1f}")

---

## 💡 Alternative: Install ALL dependencies at once (Recommended)

If you want to set up the **complete environment in one go**, uncomment and run this cell instead of installing dependencies in multiple steps.

In [None]:
# # OPTION: One-shot environment setup
# # Uncomment this cell to install everything at once
# 
# print("="*70)
# print("ONE-SHOT ENVIRONMENT SETUP")
# print("="*70)
# 
# # 1. Clone repositories
# !git clone https://github.com/T-Larm/aml-2025-mistake-detection-gp.git /content/aml-2025-mistake-detection-gp
# %cd /content/aml-2025-mistake-detection-gp
# !git submodule update --init --recursive
# 
# !git clone https://github.com/T-Larm/HiERO_for_egovlp.git /content/HiERO
# 
# # 2. Install project requirements
# print("\nüì¶ Installing project requirements...")
# !pip install -q -r requirements.txt
# 
# # 3. Install HiERO requirements
# print("\nüì¶ Installing HiERO requirements...")
# !pip install -q -r /content/HiERO/requirements.txt \
#     -f https://data.pyg.org/whl/torch-2.4.0+cu124.html \
#     --extra-index-url https://download.pytorch.org/whl/cu124 \
#     || echo "‚ö†Ô∏è Version conflicts auto-resolved"
# 
# # 4. Mount Drive
# from google.colab import drive
# drive.mount('/content/drive')
# 
# print("\n‚úÖ Complete environment ready!")
# print("\n‚è≠Ô∏è  You can now skip to Section 2 (Path Configuration)")