## 1. Environment Setup

In [None]:
# Clone HiERO repository
!git clone https://github.com/sapeirone/HiERO.git
%cd HiERO

In [None]:
# Also clone your project repo for annotations
!git clone https://github.com/T-Larm/aml-2025-mistake-detection-gp.git /content/aml-project
%cd /content/aml-project
!git submodule update --init --recursive

In [None]:
# Install HiERO dependencies
%cd /content/HiERO
!pip install -r requirements.txt -f https://data.pyg.org/whl/torch-2.4.0+cu124.html --extra-index-url https://download.pytorch.org/whl/

In [None]:
# Mount Google Drive so the paths under /content/drive become available
from google.colab import drive

drive.mount(mountpoint='/content/drive')

## 2. Path Configuration

In [None]:
import os
import sys

# ================= PATH CONFIGURATION =================
HIERO_ROOT = "/content/HiERO"
PROJECT_ROOT = "/content/aml-project"
DRIVE_ROOT = "/content/drive/MyDrive/AMLproject"

# Add HiERO to Python path.
# Guarded so that re-running this cell does not keep stacking duplicate
# entries onto sys.path.
if HIERO_ROOT not in sys.path:
    sys.path.insert(0, HIERO_ROOT)

# EgoVLP features on Google Drive
EGOVLP_FEATURES_DIR = os.path.join(DRIVE_ROOT, "Captain_Cook_dataset/features/segments/egovlp")

# Annotations from your project
ANNOTATIONS_PATH = os.path.join(PROJECT_ROOT, "annotations/annotation_json/complete_step_annotations.json")
SPLIT_FILE = os.path.join(PROJECT_ROOT, "er_annotations/recordings_combined_splits.json")

# HiERO pretrained model - MODIFY THIS PATH!
# Option 1: If you have it in your HiERO_for_egovlp folder
HIERO_CHECKPOINT = os.path.join(DRIVE_ROOT, "HiERO_for_egovlp/hiero_egovlp/hiero_egovlp.pth")
# Option 2: Or download from HiERO's model zoo (see README)
# HIERO_CHECKPOINT = "/content/HiERO/checkpoints/hiero_egovlp.pth"

# Output directory (created on first run, reused afterwards)
OUTPUT_DIR = os.path.join(DRIVE_ROOT, "substep1_hiero_outputs")
os.makedirs(OUTPUT_DIR, exist_ok=True)

print("=== Path Configuration ===")
print(f"HiERO root: {HIERO_ROOT}")
print(f"Project root: {PROJECT_ROOT}")
print(f"EgoVLP features: {EGOVLP_FEATURES_DIR}")
print(f"HiERO checkpoint: {HIERO_CHECKPOINT}")
print(f"Output: {OUTPUT_DIR}")

In [None]:
# Sanity-check the configured locations before any heavy work starts
print("=== Verifying Paths ===")

# Feature directory: report how many .npz files it contains
if not os.path.exists(EGOVLP_FEATURES_DIR):
    print(f"‚ùå Features not found: {EGOVLP_FEATURES_DIR}")
else:
    npz_files = [name for name in os.listdir(EGOVLP_FEATURES_DIR) if name.endswith('.npz')]
    print(f"‚úÖ EgoVLP features: {len(npz_files)} files")

# Annotation file: presence check only
if not os.path.exists(ANNOTATIONS_PATH):
    print(f"‚ùå Annotations not found: {ANNOTATIONS_PATH}")
else:
    print(f"‚úÖ Annotations found")

# Checkpoint: report its size in MB, or hint at how to fix the path
if not os.path.exists(HIERO_CHECKPOINT):
    print(f"‚ö†Ô∏è HiERO checkpoint not found: {HIERO_CHECKPOINT}")
    print(f"   You may need to download it or adjust the path")
else:
    size_mb = os.path.getsize(HIERO_CHECKPOINT) / (1024*1024)
    print(f"‚úÖ HiERO checkpoint found ({size_mb:.1f} MB)")

## 3. Load HiERO Model

In [None]:
import torch
import torch.nn as nn
import numpy as np
from models.hiero import HiERO
import yaml

# Select the compute device: CUDA when a GPU is visible, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

# Name the GPU (device index 0) when running on CUDA
if device.type == 'cuda':
    print(f"GPU: {torch.cuda.get_device_name(0)}")

In [None]:
# Read the checkpoint onto the CPU so its layout can be inspected first.
# NOTE(review): torch.load unpickles the file - only open checkpoints from a trusted source.
checkpoint = torch.load(HIERO_CHECKPOINT, map_location='cpu')

# One line per top-level entry: item count for dicts, the type for everything else
print("Checkpoint keys:")
for key, value in checkpoint.items():
    summary = f"{len(value)} items" if isinstance(value, dict) else f"{type(value)}"
    print(f"  {key}: {summary}")

In [None]:
# Use the config embedded in the checkpoint when present; otherwise read the
# default YAML file shipped in the HiERO repository
if 'config' not in checkpoint:
    config_path = os.path.join(HIERO_ROOT, 'configs/egovlp.yaml')
    with open(config_path, 'r') as config_file:
        config = yaml.safe_load(config_file)
    print(f"Config loaded from: {config_path}")
else:
    config = checkpoint['config']
    print("Config loaded from checkpoint")

# Show only the first 500 characters of the YAML dump to keep the output short
print("\nConfig structure:")
config_preview = yaml.dump(config, default_flow_style=False, indent=2)
print(config_preview[:500] + "...")

In [None]:
# Initialize HiERO model
# Extract model parameters from config. The mapping is copied so the config is
# not mutated, and the EgoVLP default for input_size is only supplied when the
# config does not define it (passing the keyword twice raises a TypeError).
model_config = dict(config.get('model', {}) or {})
model_config.setdefault('input_size', 256)  # EgoVLP feature dimension

# Build model
model = HiERO(**model_config)

# Load pretrained weights: accept the common checkpoint layouts
if 'state_dict' in checkpoint:
    state_dict = checkpoint['state_dict']
elif 'model' in checkpoint:
    state_dict = checkpoint['model']
else:
    state_dict = checkpoint

# Load weights (may need to handle key mismatches).
# load_state_dict signals key/shape mismatches with RuntimeError; anything
# else is a genuine error and is allowed to propagate.
try:
    model.load_state_dict(state_dict, strict=True)
    print("‚úÖ Model weights loaded successfully (strict)")
except RuntimeError as e:
    print(f"‚ö†Ô∏è Strict loading failed: {e}")
    print("Trying non-strict loading...")
    missing, unexpected = model.load_state_dict(state_dict, strict=False)
    print(f"Missing keys: {len(missing)}")
    print(f"Unexpected keys: {len(unexpected)}")
    # A non-strict load "succeeds" even when nothing matched, which would leave
    # the model randomly initialised - make that case impossible to overlook.
    n_matched = len(model.state_dict()) - len(missing)
    if n_matched == 0:
        print("WARNING: no checkpoint tensor matched the model; it is running with random weights.")
        print(f"   Sample model keys:      {list(missing)[:3]}")
        print(f"   Sample checkpoint keys: {list(unexpected)[:3]}")

model = model.to(device)
model.eval()

print(f"\n‚úÖ HiERO model ready on {device}")
print(f"Model parameters: {sum(p.numel() for p in model.parameters()) / 1e6:.2f}M")

## 4. Load Annotations & Data

In [None]:
import json

# Read the step annotations
with open(ANNOTATIONS_PATH, 'r') as annotation_file:
    annotations = json.load(annotation_file)

print(f"Total annotated videos: {len(annotations)}")

# A recording id is the first two '_'-separated tokens of a feature file name
available_features = {
    '_'.join(name.split('_')[:2])
    for name in os.listdir(EGOVLP_FEATURES_DIR)
    if name.endswith('.npz')
}

# Keep only recordings that have both an annotation entry and a feature file
annotated_ids = set(annotations.keys())
common_ids = annotated_ids & available_features

print(f"Videos with both annotations and features: {len(common_ids)}")
print(f"Sample IDs: {list(common_ids)[:5]}")

In [None]:
# Alternative: Process without annotations (if you don't have labels yet)
USE_ANNOTATIONS = True  # Set to False to skip annotations

if not USE_ANNOTATIONS:
    # No labels: every recording that has a feature file is processed
    print("‚ö†Ô∏è Running without annotations - video labels will be set to -1")
    annotations = None
    common_ids = {
        '_'.join(name.split('_')[:2])
        for name in os.listdir(EGOVLP_FEATURES_DIR)
        if name.endswith('.npz')
    }
    print(f"Total videos with features: {len(common_ids)}")
else:
    # Read the step annotations
    with open(ANNOTATIONS_PATH, 'r') as annotation_file:
        annotations = json.load(annotation_file)
    print(f"Total annotated videos: {len(annotations)}")

    # A recording id is the first two '_'-separated tokens of a feature file name
    available_features = {
        '_'.join(name.split('_')[:2])
        for name in os.listdir(EGOVLP_FEATURES_DIR)
        if name.endswith('.npz')
    }

    # Keep only recordings that have both an annotation entry and a feature file
    annotated_ids = set(annotations.keys())
    common_ids = annotated_ids & available_features
    print(f"Videos with both annotations and features: {len(common_ids)}")

print(f"Sample IDs: {list(common_ids)[:5]}")

**Note**: Annotations are only needed for:
1. **Video labels** (required for Substep 2 classifier training)
2. Ground-truth (GT) step boundaries (optional, for visualization/evaluation)

If you only want to output step embeddings without labels, set `USE_ANNOTATIONS = False` in the cell above to skip loading annotations.

## 5. HiERO-based Step Localization

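Pass each video's EgoVLP features through the HiERO model, then cluster the resulting per-frame features with K-Means; a step boundary is placed where the cluster label changes.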

In [None]:
from torch_geometric.data import Data, Batch
import torch.nn.functional as F

def process_video_with_hiero(features, model, device, n_clusters=None, min_segment_len=5):
    """
    Process video features through HiERO model to detect step boundaries.

    The features are wrapped in a chain graph (each frame linked to its
    neighbours in both directions), passed through the model, and the
    resulting per-frame features are clustered with K-Means. A boundary is
    placed where the cluster label changes, provided the running segment has
    reached `min_segment_len` frames. The returned segments always cover the
    clustered frames from first to last: a tail that is too short to stand
    alone is merged into the preceding segment instead of being dropped.

    Args:
        features: (T, 256) numpy array
        model: HiERO model
        device: torch device
        n_clusters: number of steps (None for auto: T // 30 clamped to [2, 15]);
            capped at the number of frames being clustered
        min_segment_len: minimum number of frames a segment must reach before
            a label change may close it

    Returns:
        boundaries: list of (start, end) tuples (inclusive frame indices);
            empty only when `features` has no rows
        hierarchical_features: processed features from HiERO, or the input
            features if the forward pass failed
    """
    T = features.shape[0]

    # Nothing to segment
    if T == 0:
        return [], features

    # Auto-estimate clusters
    if n_clusters is None:
        n_clusters = max(2, min(T // 30, 15))

    # Convert to torch tensor
    x = torch.from_numpy(features).float().to(device)  # (T, 256)

    # Create temporal graph (connect consecutive frames, both directions).
    # reshape(-1, 2) keeps the result 2-D even when there are no edges (T == 1),
    # so edge_index is always of shape (2, num_edges).
    edge_pairs = [pair for i in range(T - 1) for pair in ((i, i + 1), (i + 1, i))]
    edge_index = torch.tensor(edge_pairs, dtype=torch.long).reshape(-1, 2).t().to(device)

    # Create PyG Data object
    graph_data = Data(x=x, edge_index=edge_index)
    # Add batch tensor manually (instead of using Batch.from_data_list)
    graph_data.batch = torch.zeros(T, dtype=torch.long, device=device)

    # Forward pass through HiERO
    with torch.no_grad():
        try:
            # HiERO forward pass
            output = model(graph_data)

            # Extract processed features
            if isinstance(output, dict):
                hierarchical_features = output.get('features', output.get('x', x))
            else:
                hierarchical_features = output

            hierarchical_features = hierarchical_features.cpu().numpy()

        except Exception as e:
            # Deliberate best effort: keep going on the raw input features
            print(f"Warning: HiERO forward pass failed: {e}")
            print("Falling back to input features")
            hierarchical_features = features

    # Apply clustering on hierarchical features
    from sklearn.cluster import KMeans

    frame_features = hierarchical_features[:T]  # Ensure at most T rows
    # K-Means cannot fit more clusters than there are samples
    n_clusters = max(1, min(n_clusters, len(frame_features)))

    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    labels = kmeans.fit_predict(frame_features)

    # Find boundaries where labels change
    boundaries = []
    current_start = 0
    current_label = labels[0]

    for i in range(1, len(labels)):
        # A label change only closes the segment once it is long enough;
        # otherwise the segment keeps growing until it reaches the minimum.
        if labels[i] != current_label and i - current_start >= min_segment_len:
            boundaries.append((current_start, i - 1))
            current_start = i
            current_label = labels[i]

    # Add last segment
    last_frame = len(labels) - 1
    if len(labels) - current_start >= min_segment_len or not boundaries:
        boundaries.append((current_start, last_frame))
    else:
        # Tail too short to stand alone: extend the previous segment over it
        # so no frames are left uncovered.
        previous_start, _ = boundaries.pop()
        boundaries.append((previous_start, last_frame))

    return boundaries, hierarchical_features

print("‚úÖ HiERO processing function defined")

### 5.1 Test on One Video

In [None]:
# Test on one video (sorted so the same video is picked on every run)
test_id = sorted(common_ids)[0]
print(f"Testing HiERO on: {test_id}")

# Load features.
# Match on the exact recording id (first two '_'-separated tokens of the file
# name); a plain prefix test would let "1_1" also match "1_10_...".
feature_files = sorted(
    f for f in os.listdir(EGOVLP_FEATURES_DIR)
    if f.endswith('.npz') and '_'.join(f.split('_')[:2]) == test_id
)

# Defined up-front so it exists even when no feature file or no annotations are available
gt_steps = []

if feature_files:
    feature_path = os.path.join(EGOVLP_FEATURES_DIR, feature_files[0])
    # Context manager closes the .npz archive once the array has been read
    with np.load(feature_path) as data:
        features = data['arr_0']  # (T, 256)

    print(f"Features shape: {features.shape}")
    print(f"Duration: {features.shape[0]} seconds\n")

    # Get GT for comparison (annotations is None when running without labels)
    if annotations is not None and test_id in annotations:
        gt_steps = annotations[test_id].get('steps', [])
    print(f"Ground truth: {len(gt_steps)} steps")

    # Process with HiERO
    print("\nProcessing with HiERO model...")
    pred_boundaries, hiero_features = process_video_with_hiero(
        features, model, device
    )

    print(f"\n‚úÖ HiERO detected: {len(pred_boundaries)} steps")
    for i, (start, end) in enumerate(pred_boundaries[:10]):
        print(f"  Step {i+1}: {start}s - {end}s ({end-start+1} frames)")

    print(f"\nHierarchical features shape: {hiero_features.shape}")
else:
    print(f"No features found for {test_id}")

### 5.2 Visualize Results

In [None]:
import matplotlib.pyplot as plt

def compare_features_and_boundaries(original_features, hiero_features, 
                                   gt_boundaries, pred_boundaries, title=""):
    """
    Visualize original vs HiERO-processed features and boundaries.

    Draws three stacked panels sharing the x axis: the per-frame L2 norm of
    the original features, the per-frame L2 norm of the HiERO features, and
    the ground-truth spans / predicted boundary starts over the HiERO curve.

    Args:
        original_features: 2-D array, one row per frame
        hiero_features: 2-D array of processed features; only the first T rows
            are used, and it may have fewer than T rows
        gt_boundaries: sequence of (start, end) ground-truth spans
        pred_boundaries: sequence of (start, end) predicted segments
        title: prefix for the first panel's title
    """
    fig, axes = plt.subplots(3, 1, figsize=(15, 9), sharex=True)

    T = original_features.shape[0]
    time = np.arange(T)

    # Original features
    orig_norm = np.linalg.norm(original_features, axis=1)
    axes[0].plot(time, orig_norm, alpha=0.7, label='Original EgoVLP')
    axes[0].set_ylabel('Feature magnitude')
    axes[0].set_title(f'{title} - Original EgoVLP Features')
    axes[0].legend()
    axes[0].grid(True, alpha=0.3)

    # HiERO features.
    # The processed features can have fewer rows than the input, so they get
    # their own (possibly shorter) time axis instead of assuming length T.
    hiero_norm = np.linalg.norm(hiero_features[:T], axis=1)
    hiero_time = time[:len(hiero_norm)]
    axes[1].plot(hiero_time, hiero_norm, alpha=0.7, color='orange', label='HiERO processed')
    axes[1].set_ylabel('Feature magnitude')
    axes[1].set_title('HiERO Hierarchical Features')
    axes[1].legend()
    axes[1].grid(True, alpha=0.3)

    # Boundaries comparison
    # GT boundaries: only the first line carries a legend label, selected by
    # position so repeated start values cannot produce duplicate entries.
    for idx, (start, end) in enumerate(gt_boundaries):
        axes[2].axvspan(start, end, alpha=0.15, color='green')
        axes[2].axvline(start, color='green', linestyle='--', alpha=0.4, 
                       label='GT' if idx == 0 else '')

    # HiERO predictions
    for idx, (start, end) in enumerate(pred_boundaries):
        axes[2].axvline(start, color='red', linestyle='-', alpha=0.6, linewidth=2,
                       label='HiERO' if idx == 0 else '')

    axes[2].plot(hiero_time, hiero_norm, alpha=0.4, color='gray')
    axes[2].set_xlabel('Time (seconds)')
    axes[2].set_ylabel('Feature magnitude')
    axes[2].set_title('Boundary Comparison: GT (green) vs HiERO (red)')
    # A legend is only drawn when at least one labelled boundary line exists
    if len(gt_boundaries) or len(pred_boundaries):
        axes[2].legend()
    axes[2].grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

# Visualize: turn each annotated step into an integer (start, end) pair,
# using 0 for a missing 'start_time' / 'end_time'
gt_frame_boundaries = []
for step in gt_steps:
    step_start = int(step.get('start_time', 0))
    step_end = int(step.get('end_time', 0))
    gt_frame_boundaries.append((step_start, step_end))

compare_features_and_boundaries(
    original_features=features,
    hiero_features=hiero_features,
    gt_boundaries=gt_frame_boundaries,
    pred_boundaries=pred_boundaries,
    title=test_id,
)

## 6. Process All Videos

In [None]:
from tqdm import tqdm

def process_all_videos_hiero(video_ids, model, device):
    """
    Process all videos with HiERO model.

    For every recording id the matching .npz feature file is read from
    EGOVLP_FEATURES_DIR, segmented with `process_video_with_hiero`, and one
    embedding per detected segment is computed as the mean of the processed
    features over that segment. Recordings without a feature file or without
    detected segments are skipped; any other per-video failure is printed and
    that video is skipped.

    Reads the notebook globals EGOVLP_FEATURES_DIR and `annotations`. When
    `annotations` is None (run without labels) the video label is -1 and the
    activity name is 'unknown'.

    Args:
        video_ids: iterable of recording ids (first two '_'-separated tokens
            of the feature file names)
        model: HiERO model
        device: torch device

    Returns:
        dict with keys 'boundaries', 'step_embeddings', 'video_labels' and
        'activity_names', each mapping recording id -> value

    Raises:
        OSError: if EGOVLP_FEATURES_DIR cannot be listed
    """
    results = {
        'boundaries': {},
        'step_embeddings': {},
        'video_labels': {},
        'activity_names': {},
    }

    # Index the feature directory once instead of re-listing it per video.
    # Files are keyed by their exact recording id; a prefix test would let
    # "1_1" also pick up the files of "1_10".
    files_by_recording = {}
    for fname in sorted(os.listdir(EGOVLP_FEATURES_DIR)):
        if fname.endswith('.npz'):
            files_by_recording.setdefault('_'.join(fname.split('_')[:2]), []).append(fname)

    for recording_id in tqdm(video_ids, desc="Processing with HiERO"):
        try:
            # Find feature file
            feature_files = files_by_recording.get(recording_id, [])

            if not feature_files:
                continue

            # Load features (context manager closes the .npz archive)
            feature_path = os.path.join(EGOVLP_FEATURES_DIR, feature_files[0])
            with np.load(feature_path) as data:
                features = data['arr_0']  # (T, 256)

            # Process with HiERO
            boundaries, hiero_features = process_video_with_hiero(
                features, model, device
            )

            # np.stack cannot build an array from zero segments
            if not boundaries:
                print(f"Skipping {recording_id}: no step segments detected")
                continue

            # Extract step embeddings using HiERO features:
            # mean over each (inclusive) segment
            step_embeddings = np.stack(
                [hiero_features[start:end + 1].mean(axis=0) for start, end in boundaries],
                axis=0,
            )  # (num_steps, feature_dim)

            # Get video label
            if annotations is None:
                # Running without annotations: no label available
                video_label = -1
                activity_name = 'unknown'
            else:
                anno = annotations[recording_id]
                has_errors = any(step.get('has_errors', False) for step in anno.get('steps', []))
                video_label = 1 if has_errors else 0
                activity_name = anno.get('activity_name', 'unknown')

            # Store results
            results['boundaries'][recording_id] = boundaries
            results['step_embeddings'][recording_id] = step_embeddings
            results['video_labels'][recording_id] = video_label
            results['activity_names'][recording_id] = activity_name

        except Exception as e:
            # Best effort: report the failure and move on to the next video
            print(f"Error processing {recording_id}: {e}")
            continue

    return results

# Process all videos.
# The ids are sorted because set iteration order for strings differs between
# interpreter sessions; sorting keeps the row order of every saved output
# identical from run to run.
print(f"Processing {len(common_ids)} videos with HiERO model...")
all_results = process_all_videos_hiero(sorted(common_ids), model, device)

print(f"\n‚úÖ Successfully processed {len(all_results['boundaries'])} videos")

## 7. Statistics & Analysis

In [None]:
# Statistics
num_steps_list = [len(bounds) for bounds in all_results['boundaries'].values()]
labels = list(all_results['video_labels'].values())
n_labelled = len(labels)

print("=== HiERO-based Substep 1 Results ===")
print(f"Total videos processed: {len(all_results['boundaries'])}")

if n_labelled == 0:
    # Avoids a division by zero and min()/max() on an empty list
    print("\nNo videos were processed - nothing to summarise.")
else:
    print(f"\nVideo labels distribution:")
    print(f"  - Correct executions (label=0): {labels.count(0)} ({labels.count(0)/n_labelled*100:.1f}%)")
    print(f"  - Has errors (label=1): {labels.count(1)} ({labels.count(1)/n_labelled*100:.1f}%)")
    # Only shown when some videos carry the "no label" marker
    if labels.count(-1):
        print(f"  - Unlabelled (label=-1): {labels.count(-1)} ({labels.count(-1)/n_labelled*100:.1f}%)")
    print(f"\nStep detection statistics:")
    print(f"  - Avg steps per video: {np.mean(num_steps_list):.1f}")
    print(f"  - Min/Max steps: {min(num_steps_list)} / {max(num_steps_list)}")
    print(f"  - Median steps: {np.median(num_steps_list):.0f}")

In [None]:
# Two side-by-side panels: steps-per-video histogram and label counts
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
steps_ax, labels_ax = axes

# Left: how many steps were detected per video
steps_ax.hist(num_steps_list, bins=20, edgecolor='black', alpha=0.7, color='orange')
steps_ax.set_xlabel('Number of steps')
steps_ax.set_ylabel('Number of videos')
steps_ax.set_title('HiERO: Distribution of Steps per Video')
steps_ax.grid(True, alpha=0.3)

# Right: number of videos per label (0 = correct, 1 = has errors)
label_counts = [labels.count(value) for value in (0, 1)]
labels_ax.bar(['Correct (0)', 'Has Errors (1)'], label_counts, color=['green', 'red'], alpha=0.7)
labels_ax.set_ylabel('Number of videos')
labels_ax.set_title('Video Label Distribution')
labels_ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

## 8. Prepare Data for Teammates (Substeps 2, 3, 4)

In [None]:
# Prepare padded embeddings for Substep 2
# default=0 keeps this cell from raising when no video was processed
max_steps = max((len(bounds) for bounds in all_results['boundaries'].values()), default=0)
print(f"Max steps in dataset: {max_steps}")

# Create padded arrays
recording_ids = list(all_results['step_embeddings'].keys())
n_videos = len(recording_ids)

# Take the embedding width from the data rather than assuming 256, so the
# padding also works when the processed features have a different width.
if recording_ids:
    embedding_dim = all_results['step_embeddings'][recording_ids[0]].shape[-1]
else:
    embedding_dim = 256

padded_embeddings = np.zeros((n_videos, max_steps, embedding_dim), dtype=np.float32)
masks = np.zeros((n_videos, max_steps), dtype=bool)
video_labels = np.zeros(n_videos, dtype=np.int64)

for i, rec_id in enumerate(recording_ids):
    embeddings = all_results['step_embeddings'][rec_id]
    num_steps = len(embeddings)

    # Real steps fill the leading rows; the mask marks which rows are real
    padded_embeddings[i, :num_steps] = embeddings
    masks[i, :num_steps] = True
    video_labels[i] = all_results['video_labels'][rec_id]

print(f"\n=== Data for Substep 2 (HiERO-based) ===")
print(f"Embeddings shape: {padded_embeddings.shape}")
print(f"Masks shape: {masks.shape}")
print(f"Labels shape: {video_labels.shape}")

In [None]:
# Save for Substep 2
substep2_output = os.path.join(OUTPUT_DIR, "hiero_substep2_data.npz")
np.savez(
    substep2_output,
    embeddings=padded_embeddings,
    masks=masks,
    labels=video_labels,
    # Stored as a fixed-width string array: an object array would be pickled
    # and could only be read back with np.load(..., allow_pickle=True).
    recording_ids=np.array(recording_ids, dtype=str),
    max_steps=max_steps
)
print(f"‚úÖ Saved: {substep2_output}")
print(f"   Size: {os.path.getsize(substep2_output) / (1024*1024):.2f} MB")

In [None]:
# Write the per-video boundaries, step count, label and activity as JSON
# for Substeps 3 & 4
substep3_output = os.path.join(OUTPUT_DIR, "hiero_step_boundaries.json")

boundaries_json = {}
for rec_id, bounds in all_results['boundaries'].items():
    # Cast to plain int so the values are JSON-serialisable
    int_bounds = [(int(s), int(e)) for s, e in bounds]
    boundaries_json[rec_id] = {
        'boundaries': int_bounds,
        'num_steps': len(bounds),
        'video_label': int(all_results['video_labels'][rec_id]),
        'activity': all_results['activity_names'][rec_id],
    }

with open(substep3_output, 'w') as out_file:
    json.dump(boundaries_json, out_file, indent=2)

print(f"‚úÖ Saved: {substep3_output}")

In [None]:
# Store the unpadded step embeddings, one array per recording id
substep3_embeddings = os.path.join(OUTPUT_DIR, "hiero_step_embeddings.npz")

# Each recording id becomes the name of its array inside the archive
embeddings_by_recording = dict(all_results['step_embeddings'])
np.savez(substep3_embeddings, **embeddings_by_recording)

print(f"‚úÖ Saved: {substep3_embeddings}")

## 9. Final Summary

In [None]:
# Closing report: headline numbers, the files written, and a short recap
banner = "=" * 60
print(banner)
print("EXTENSION SUBSTEP 1 COMPLETE (HiERO-based)!")
print(banner)

print(f"\nüìä Processing Summary:")
summary_lines = [
    f"   - Method: HiERO hierarchical model",
    f"   - Videos processed: {len(all_results['boundaries'])}",
    f"   - Total steps detected: {sum(num_steps_list)}",
    f"   - Avg steps per video: {np.mean(num_steps_list):.1f}",
]
for line in summary_lines:
    print(line)

# List only the deliverables that actually exist, with their size in MB
print(f"\nüìÅ Output files in: {OUTPUT_DIR}")
output_names = ('hiero_substep2_data.npz', 'hiero_step_boundaries.json', 'hiero_step_embeddings.npz')
for fname in output_names:
    fpath = os.path.join(OUTPUT_DIR, fname)
    if not os.path.exists(fpath):
        continue
    size_mb = os.path.getsize(fpath) / (1024*1024)
    print(f"   ‚úÖ {fname} ({size_mb:.2f} MB)")

print(f"\nüéØ Deliverables ready for Substeps 2, 3, 4!")
print(f"\nüí° Advantages of HiERO-based approach:")
advantage_lines = [
    f"   - Hierarchical understanding of recipe steps",
    f"   - Temporal context modeling via Graph U-Net",
    f"   - Features trained on video-text alignment",
    f"   - Better semantic grouping of actions",
]
for line in advantage_lines:
    print(line)