## 1. Setup Environment

In [None]:
# Mount Google Drive (if your data is on Drive).
# The import is guarded so the notebook can still be run top-to-bottom outside
# Google Colab, where the `google.colab` package is not installed.
try:
    from google.colab import drive
except ImportError:
    print("google.colab not available - skipping Google Drive mount.")
else:
    drive.mount('/content/drive')

# Optional: Change working directory to your project folder
# import os
# os.chdir('/content/drive/MyDrive/aml-2025-mistake-detection-gp')

In [None]:
# Report the installed PyTorch build and, when a CUDA device is visible,
# the name and total memory of device 0.
import torch

cuda_ok = torch.cuda.is_available()
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {cuda_ok}")
if cuda_ok:
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")
    print(f"CUDA memory: {gpu_props.total_memory / 1e9:.2f} GB")

## 2. Clone Repository and Install Dependencies

In [None]:
# Clone your repository (if not already done)
# Uncomment if you need to clone
# !git clone https://github.com/your-username/aml-2025-mistake-detection-gp.git
# %cd aml-2025-mistake-detection-gp

# Or if working from Drive, just cd to the directory
# %cd /content/drive/MyDrive/aml-2025-mistake-detection-gp

In [None]:
# Install dependencies
!pip install -q torch torchvision torchaudio
!pip install -q numpy pandas matplotlib seaborn
!pip install -q scikit-learn tqdm

print("✅ Dependencies installed!")

## 3. Import Libraries and Define Helper Functions

In [None]:
import json
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Subset, Dataset
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import confusion_matrix, classification_report
from tqdm.notebook import tqdm
import os
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import pandas as pd

# Plot styling: activate matplotlib's 'seaborn-v0_8-darkgrid' style sheet and
# make seaborn's "husl" palette the default color cycle for later figures.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

print("✅ Libraries imported successfully!")

In [None]:
# Import project modules
# Make sure you're in the project directory
import sys
sys.path.append('.')

from extension.step_localization import StepLocalizer, prepare_dataset_for_task_verification
from dataloader.TaskVerificationDataset import TaskVerificationDataset
from core.models.task_verifier import TaskVerifier, SimpleMLPVerifier

print("✅ Project modules imported!")

## 4. Configuration

In [None]:
# Central experiment settings. Every later cell reads its paths and
# hyperparameters from this dictionary.
CONFIG = dict(
    # --- Paths ---
    features_dir='egovlp',
    annotations_file='annotations/annotation_json/step_annotations.json',
    split_file='er_annotations/recordings_combined_splits.json',
    split='train',  # 'train', 'val', or 'test'

    # --- Model settings ---
    model_type='transformer',  # 'transformer' or 'mlp'
    embedding_dim=1024,
    hidden_dim=512,
    num_heads=8,
    num_layers=1,
    dropout=0.3,

    # --- Training settings ---
    num_epochs=50,
    batch_size=8,
    learning_rate=1e-4,
    weight_decay=1e-5,
    patience=10,  # Early stopping patience

    # --- Device ---
    device='cuda' if torch.cuda.is_available() else 'cpu',

    # --- Results ---
    save_dir='results/task_verification',
)

# Echo the settings, one aligned "name: value" line per entry.
print("Configuration:")
for name in CONFIG:
    print(f"  {name:20s}: {CONFIG[name]}")

## 5. Data Preparation

In [None]:
# Build the step localizer from the configured annotation file and feature directory.
print("Initializing step localizer...")
localizer = StepLocalizer(
    features_dir=CONFIG['features_dir'],
    annotations_file=CONFIG['annotations_file'],
)

# Assemble the data dictionary for the configured split.
split_name = CONFIG['split']
print(f"\nPreparing dataset from '{split_name}' split...")
data_dict = prepare_dataset_for_task_verification(
    split=split_name,
    split_file=CONFIG['split_file'],
    localizer=localizer,
)

# Quick sanity report on what was loaded.
num_samples = len(data_dict['recording_ids'])
print(f"\n✅ Data loaded successfully!")
print(f"   Shape: {data_dict['embeddings'].shape}")
print(f"   Samples: {num_samples}")

In [None]:
# Wrap the prepared data in the project's dataset class. The four entries are
# passed positionally in this order: embeddings, labels, masks, recording_ids.
dataset_fields = ('embeddings', 'labels', 'masks', 'recording_ids')
full_dataset = TaskVerificationDataset(*(data_dict[field] for field in dataset_fields))

# Let the dataset print its own summary.
full_dataset.print_statistics()

In [None]:
# Two-panel overview of the dataset: class balance (left) and the
# distribution of per-video mask sums (right).
stats = full_dataset.get_statistics()

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: number of videos per class.
ax = axes[0]
labels = ['No Errors', 'Has Errors']
counts = [stats['num_negative'], stats['num_positive']]
colors = ['#2ecc71', '#e74c3c']
ax.bar(labels, counts, color=colors, alpha=0.7, edgecolor='black')
ax.set_ylabel('Number of Videos', fontsize=12)
ax.set_title('Class Distribution', fontsize=14, fontweight='bold')
ax.grid(axis='y', alpha=0.3)
for bar_pos, count in enumerate(counts):
    # Write each count just above its bar.
    ax.text(bar_pos, count + 0.5, str(count), ha='center', fontsize=12, fontweight='bold')

# Right panel: histogram of the mask sum of every video, one bin per integer value.
ax = axes[1]
actual_steps = full_dataset.masks.sum(dim=1).numpy()
step_bins = range(int(actual_steps.min()), int(actual_steps.max()) + 2)
mean_steps = actual_steps.mean()
ax.hist(actual_steps, bins=step_bins, alpha=0.7, edgecolor='black', color='#3498db')
ax.set_xlabel('Number of Steps', fontsize=12)
ax.set_ylabel('Number of Videos', fontsize=12)
ax.set_title('Steps per Video Distribution', fontsize=14, fontweight='bold')
ax.grid(axis='y', alpha=0.3)
ax.axvline(mean_steps, color='red', linestyle='--', linewidth=2, label=f'Mean: {mean_steps:.1f}')
ax.legend()

plt.tight_layout()
plt.show()

## 6. Recipe Grouping for Leave-One-Recipe-Out CV

In [None]:
def get_recipe_groups(recording_ids, annotations_file):
    """Group recordings by recipe (``activity_id``).

    Args:
        recording_ids: Iterable of recording identifiers to group.
        annotations_file: Path to a JSON file that maps each recording id to a
            dict containing ``activity_id`` and, optionally, ``activity_name``.

    Returns:
        Dict mapping each ``activity_id`` to
        ``{'name': <str>, 'recordings': [<recording id>, ...]}``. Recordings keep
        the order in which they appear in ``recording_ids``; ids that are not
        present in the annotations file are left out.
    """
    # Read explicitly as UTF-8 instead of relying on the platform default encoding.
    with open(annotations_file, 'r', encoding='utf-8') as f:
        annotations = json.load(f)

    recipe_groups = {}
    for rec_id in recording_ids:
        if rec_id not in annotations:
            continue

        entry = annotations[rec_id]
        activity_id = entry['activity_id']
        if activity_id not in recipe_groups:
            # The display name falls back to the activity id. It is always
            # stored as a string so that callers can format or slice it even
            # when the id itself is not a string.
            recipe_groups[activity_id] = {
                'name': str(entry.get('activity_name', activity_id)),
                'recordings': []
            }
        recipe_groups[activity_id]['recordings'].append(rec_id)

    return recipe_groups

# Group recordings
recipe_groups = get_recipe_groups(data_dict['recording_ids'], CONFIG['annotations_file'])
recipe_ids = list(recipe_groups.keys())

# Recordings without an annotation entry belong to no recipe group and are
# therefore part of neither the train nor the test set of any fold; report them.
num_grouped = sum(len(group['recordings']) for group in recipe_groups.values())
num_ungrouped = len(data_dict['recording_ids']) - num_grouped
if num_ungrouped > 0:
    print(f"⚠️  {num_ungrouped} recordings have no annotation entry and are excluded from all folds")

print(f"\nFound {len(recipe_ids)} unique recipes:")
print("="*60)
for recipe_id in recipe_ids:
    info = recipe_groups[recipe_id]
    # Convert with str() first: the `s` format code raises ValueError for
    # non-string values (in case activity ids are not strings) and leaves
    # strings unchanged.
    print(f"{str(recipe_id):15s} - {str(info['name']):30s} ({len(info['recordings'])} videos)")
print("="*60)

## 7. Training and Evaluation Functions

In [None]:
def train_one_epoch(model, dataloader, criterion, optimizer, device):
    """Run one optimization pass over ``dataloader`` and return the mean batch loss.

    Args:
        model: Module called as ``model(embeddings, masks)``.
        dataloader: Iterable of batches; each batch is a mapping with the keys
            ``'embeddings'``, ``'label'`` and ``'mask'`` holding tensors.
        criterion: Loss called as ``criterion(outputs, labels)``.
        optimizer: Optimizer whose ``zero_grad()`` and ``step()`` are called
            once per batch.
        device: Device the batch tensors are moved to.

    Returns:
        float: Sum of the per-batch loss values divided by the number of batches.

    Raises:
        ValueError: If ``dataloader`` yields no batches (previously this
            surfaced as an uninformative ``ZeroDivisionError``).
    """
    model.train()
    total_loss = 0.0
    num_batches = 0

    for batch in tqdm(dataloader, desc="Training", leave=False):
        embeddings = batch['embeddings'].to(device)
        labels = batch['label'].to(device)
        masks = batch['mask'].to(device)

        optimizer.zero_grad()
        outputs = model(embeddings, masks)
        loss = criterion(outputs, labels)

        loss.backward()
        # Cap the global gradient norm at 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

    if num_batches == 0:
        raise ValueError("train_one_epoch received an empty dataloader (no batches to train on)")

    return total_loss / num_batches


def evaluate(model, dataloader, device, threshold=0.5):
    """Evaluate ``model`` on ``dataloader`` and compute binary classification metrics.

    Args:
        model: Module called as ``model(embeddings, masks)``; its outputs are
            used directly as positive-class probabilities.
        dataloader: Iterable of batches; each batch is a mapping with the keys
            ``'embeddings'``, ``'label'`` and ``'mask'`` holding tensors.
        device: Device the batch tensors are moved to.
        threshold: Probabilities strictly greater than this value are predicted
            as class 1. Defaults to 0.5.

    Returns:
        Tuple ``(metrics, all_preds, all_labels, all_probs)`` where ``metrics``
        is a dict with the keys ``'accuracy'``, ``'precision'``, ``'recall'``,
        ``'f1'`` and ``'auc'``, and the other three are 1-D NumPy arrays with
        one entry per evaluated sample.
    """
    model.eval()
    all_preds = []
    all_labels = []
    all_probs = []

    with torch.no_grad():
        for batch in dataloader:
            embeddings = batch['embeddings'].to(device)
            labels = batch['label'].to(device)
            masks = batch['mask'].to(device)

            outputs = model(embeddings, masks)
            # Flatten to 1-D so that 0-d outputs (e.g. a squeezed batch of one)
            # and (batch, 1) outputs are handled the same way as (batch,) ones.
            probs = outputs.detach().cpu().numpy().reshape(-1)
            preds = (probs > threshold).astype(int)
            labels_np = labels.cpu().numpy().reshape(-1)

            all_preds.extend(preds)
            all_labels.extend(labels_np)
            all_probs.extend(probs)

    all_preds = np.array(all_preds)
    all_labels = np.array(all_labels)
    all_probs = np.array(all_probs)

    metrics = {
        'accuracy': accuracy_score(all_labels, all_preds),
        'precision': precision_score(all_labels, all_preds, zero_division=0),
        'recall': recall_score(all_labels, all_preds, zero_division=0),
        'f1': f1_score(all_labels, all_preds, zero_division=0),
    }

    # ROC AUC is only defined when both classes are present. Otherwise 0.0 is
    # stored as a placeholder -- it is not a measured score.
    if len(np.unique(all_labels)) > 1:
        metrics['auc'] = roc_auc_score(all_labels, all_probs)
    else:
        metrics['auc'] = 0.0

    return metrics, all_preds, all_labels, all_probs

print("✅ Training functions defined!")

## 8. Leave-One-Recipe-Out Cross-Validation

In [None]:
# Leave-one-recipe-out cross-validation: each recipe in turn is held out as the
# test set while a fresh model is trained on the recordings of all other recipes.

# Storage for results
all_fold_results = []
fold_predictions = {}  # Store predictions for later analysis

device = CONFIG['device']
eval_every = 5  # the held-out recipe is evaluated every `eval_every` epochs
base_seed = CONFIG.get('seed', 42)  # optional CONFIG key; falls back to 42
all_recording_ids = data_dict['recording_ids']

print(f"\n{'='*60}")
print(f"Starting Leave-One-Recipe-Out Cross-Validation")
print(f"{'='*60}")
print(f"Total recipes: {len(recipe_ids)}")
print(f"Device: {device}")
print(f"Model: {CONFIG['model_type']}")
print(f"{'='*60}\n")

# Loop through each recipe as test set
for fold_idx, test_recipe in enumerate(recipe_ids):
    print(f"\n{'='*60}")
    print(f"Fold {fold_idx + 1}/{len(recipe_ids)}")
    print(f"{'='*60}")
    print(f"Test recipe: {test_recipe} ({recipe_groups[test_recipe]['name']})")

    # Split data. Sets give constant-time membership tests when selecting indices.
    test_recordings = recipe_groups[test_recipe]['recordings']
    test_id_set = set(test_recordings)
    train_id_set = set()
    for recipe in recipe_ids:
        if recipe != test_recipe:
            train_id_set.update(recipe_groups[recipe]['recordings'])

    train_indices = [i for i, rid in enumerate(all_recording_ids) if rid in train_id_set]
    test_indices = [i for i, rid in enumerate(all_recording_ids) if rid in test_id_set]

    print(f"Train samples: {len(train_indices)}, Test samples: {len(test_indices)}")

    if len(test_indices) == 0:
        print("⚠️  Empty test set, skipping fold")
        continue
    if len(train_indices) == 0:
        # Happens when there is only one recipe; a shuffling DataLoader cannot
        # be built over an empty dataset.
        print("⚠️  Empty train set, skipping fold")
        continue

    # Seed per fold so that weight initialization and batch shuffling are
    # repeatable across runs.
    torch.manual_seed(base_seed + fold_idx)

    # Create data loaders
    train_subset = Subset(full_dataset, train_indices)
    test_subset = Subset(full_dataset, test_indices)

    train_loader = DataLoader(train_subset, batch_size=CONFIG['batch_size'], shuffle=True)
    test_loader = DataLoader(test_subset, batch_size=CONFIG['batch_size'], shuffle=False)

    # Initialize model
    if CONFIG['model_type'] == 'transformer':
        model = TaskVerifier(
            embedding_dim=CONFIG['embedding_dim'],
            hidden_dim=CONFIG['hidden_dim'],
            num_heads=CONFIG['num_heads'],
            num_layers=CONFIG['num_layers'],
            dropout=CONFIG['dropout']
        ).to(device)
    else:
        model = SimpleMLPVerifier(
            embedding_dim=CONFIG['embedding_dim'],
            hidden_dim=CONFIG['hidden_dim'],
            dropout=CONFIG['dropout']
        ).to(device)

    print(f"Model parameters: {model.get_num_parameters():,}")

    # Loss and optimizer
    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(),
                                  lr=CONFIG['learning_rate'],
                                  weight_decay=CONFIG['weight_decay'])

    # Training loop
    # NOTE(review): the "best" epoch is chosen by F1 on the held-out test recipe
    # itself, so the reported best metrics are optimistic. An unbiased estimate
    # would need a separate validation subset for model selection.
    best_test_f1 = 0
    best_metrics = None
    best_preds = best_labels = best_probs = None
    patience_counter = 0

    training_history = {'loss': [], 'test_f1': [], 'test_auc': []}

    for epoch in range(CONFIG['num_epochs']):
        train_loss = train_one_epoch(model, train_loader, criterion, optimizer, device)
        training_history['loss'].append(train_loss)

        # Evaluate every `eval_every` epochs
        if (epoch + 1) % eval_every == 0:
            test_metrics, preds, labels, probs = evaluate(model, test_loader, device)
            training_history['test_f1'].append(test_metrics['f1'])
            training_history['test_auc'].append(test_metrics['auc'])

            print(f"Epoch {epoch+1:3d}/{CONFIG['num_epochs']} - "
                  f"Loss: {train_loss:.4f} - "
                  f"F1: {test_metrics['f1']:.4f} - "
                  f"AUC: {test_metrics['auc']:.4f}")

            if test_metrics['f1'] > best_test_f1:
                best_test_f1 = test_metrics['f1']
                best_metrics = test_metrics
                best_preds = preds
                best_labels = labels
                best_probs = probs
                patience_counter = 0
            else:
                patience_counter += 1

            # NOTE(review): patience is counted in evaluations, not epochs, i.e.
            # training stops after `patience` consecutive non-improving
            # evaluations (= patience * eval_every epochs). Confirm this is the
            # intended unit.
            if patience_counter >= CONFIG['patience']:
                print(f"⚠️  Early stopping at epoch {epoch+1}")
                break

    # Store results
    # No evaluation ever exceeded F1 = 0 (or none was run): fall back to the
    # metrics of the final model state.
    if best_metrics is None:
        best_metrics, best_preds, best_labels, best_probs = evaluate(model, test_loader, device)

    print(f"\n{'='*40}")
    print(f"Fold {fold_idx + 1} Best Results:")
    print(f"{'='*40}")
    for metric, value in best_metrics.items():
        print(f"  {metric:12s}: {value:.4f}")
    print(f"{'='*40}")

    fold_result = {
        'fold': fold_idx + 1,
        'test_recipe': test_recipe,
        'test_recipe_name': recipe_groups[test_recipe]['name'],
        'num_train': len(train_indices),
        'num_test': len(test_indices),
        'metrics': best_metrics,
        'training_history': training_history
    }
    all_fold_results.append(fold_result)

    fold_predictions[test_recipe] = {
        'predictions': best_preds,
        'labels': best_labels,
        'probabilities': best_probs,
        'recording_ids': test_recordings
    }

print(f"\n\n{'='*60}")
print("✅ Cross-Validation Complete!")
print(f"{'='*60}")

## 9. Results Analysis

In [None]:
# Aggregate results
print(f"\n{'='*60}")
print("LEAVE-ONE-RECIPE-OUT CROSS-VALIDATION RESULTS")
print(f"{'='*60}")
print(f"Completed folds: {len(all_fold_results)}/{len(recipe_ids)}\n")

if not all_fold_results:
    raise RuntimeError(
        "No cross-validation fold completed; run the cross-validation cell "
        "before aggregating results."
    )

# ROC AUC is undefined for folds whose held-out recipe contains a single class;
# evaluate() stores a 0.0 placeholder for them. Those folds are left out of the
# AUC statistics so the placeholders do not drag the average down.
auc_folds = [
    fold for fold in all_fold_results
    if len(np.unique(fold_predictions[fold['test_recipe']]['labels'])) > 1
]
if not auc_folds:
    # No fold has a defined AUC: keep the placeholders rather than an empty list.
    auc_folds = all_fold_results

avg_metrics = {}
for metric in all_fold_results[0]['metrics'].keys():
    folds_used = auc_folds if metric == 'auc' else all_fold_results
    # Plain Python floats keep the summary JSON-serializable whatever NumPy
    # scalar type the metric functions return.
    values = [float(fold['metrics'][metric]) for fold in folds_used]
    avg_metrics[metric] = {
        'mean': float(np.mean(values)),
        'std': float(np.std(values)),
        'min': float(np.min(values)),
        'max': float(np.max(values)),
        'values': values
    }

    print(f"{metric.upper():12s}: "
          f"{avg_metrics[metric]['mean']:.4f} ± {avg_metrics[metric]['std']:.4f} "
          f"(min: {avg_metrics[metric]['min']:.4f}, max: {avg_metrics[metric]['max']:.4f})")

if len(auc_folds) < len(all_fold_results):
    print(f"\nAUC statistics use {len(auc_folds)}/{len(all_fold_results)} folds "
          f"(the others contain a single class in their test set)")

print(f"{'='*60}\n")

In [None]:
# Visualize results across folds in a 2x2 figure:
# metric box plots, per-recipe F1, first-fold training curve, aggregated confusion matrix.
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# 1. Metrics box plot
ax = axes[0, 0]
metrics_to_plot = ['accuracy', 'precision', 'recall', 'f1', 'auc']
data_to_plot = [avg_metrics[m]['values'] for m in metrics_to_plot]
bp = ax.boxplot(data_to_plot, patch_artist=True, showmeans=True)
# Tick labels are set separately: the `labels=` keyword of boxplot() was renamed
# to `tick_labels=` in Matplotlib 3.9, whereas set_xticklabels() works on both
# older and newer versions.
ax.set_xticklabels([m.upper() for m in metrics_to_plot])
for patch in bp['boxes']:
    patch.set_facecolor('#3498db')
    patch.set_alpha(0.6)
ax.set_ylabel('Score', fontsize=12)
ax.set_title('Metrics Distribution Across Folds', fontsize=14, fontweight='bold')
ax.grid(axis='y', alpha=0.3)
ax.set_ylim([0, 1])

# 2. Per-recipe F1 scores (red < 0.5 <= orange < 0.7 <= green)
ax = axes[0, 1]
recipe_names = [fold['test_recipe_name'][:20] for fold in all_fold_results]
f1_scores = [fold['metrics']['f1'] for fold in all_fold_results]
colors_bar = ['#e74c3c' if f1 < 0.5 else '#f39c12' if f1 < 0.7 else '#2ecc71' for f1 in f1_scores]
bars = ax.barh(recipe_names, f1_scores, color=colors_bar, alpha=0.7, edgecolor='black')
ax.set_xlabel('F1 Score', fontsize=12)
ax.set_title('F1 Score per Recipe', fontsize=14, fontweight='bold')
ax.axvline(avg_metrics['f1']['mean'], color='blue', linestyle='--', linewidth=2, label='Mean F1')
ax.legend()
ax.grid(axis='x', alpha=0.3)

# 3. Training curves (first fold as example)
ax = axes[1, 0]
if len(all_fold_results) > 0:
    history = all_fold_results[0]['training_history']
    # The test F1 was recorded every 5 epochs, so its x positions are 5, 10, ...
    epochs_eval = list(range(5, len(history['loss']) + 1, 5))
    ax.plot(range(1, len(history['loss']) + 1), history['loss'],
            'b-', linewidth=2, label='Training Loss', alpha=0.7)
    ax.set_xlabel('Epoch', fontsize=12)
    ax.set_ylabel('Loss', fontsize=12, color='b')
    ax.tick_params(axis='y', labelcolor='b')
    ax.grid(alpha=0.3)

    # Second y-axis so loss and F1 can share the epoch axis.
    ax2 = ax.twinx()
    ax2.plot(epochs_eval, history['test_f1'], 'r-o', linewidth=2,
             label='Test F1', markersize=6, alpha=0.7)
    ax2.set_ylabel('F1 Score', fontsize=12, color='r')
    ax2.tick_params(axis='y', labelcolor='r')
    ax.set_title(f"Training Curve (Fold 1: {all_fold_results[0]['test_recipe_name']})",
                 fontsize=14, fontweight='bold')
    ax.legend(loc='upper left')
    ax2.legend(loc='upper right')

# 4. Confusion matrix (aggregated over the stored predictions of all folds)
ax = axes[1, 1]
all_preds_agg = []
all_labels_agg = []
for recipe, pred_data in fold_predictions.items():
    all_preds_agg.extend(pred_data['predictions'])
    all_labels_agg.extend(pred_data['labels'])

# labels=[0, 1] forces a 2x2 matrix even when one class never occurs, so the
# matrix always matches the two tick labels passed to the heatmap below.
cm = confusion_matrix(
    np.asarray(all_labels_agg).astype(int),
    np.asarray(all_preds_agg).astype(int),
    labels=[0, 1],
)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax,
            xticklabels=['No Errors', 'Has Errors'],
            yticklabels=['No Errors', 'Has Errors'],
            cbar_kws={'label': 'Count'})
ax.set_xlabel('Predicted', fontsize=12)
ax.set_ylabel('Actual', fontsize=12)
ax.set_title('Aggregated Confusion Matrix', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

In [None]:
# Per-fold results table: one row per completed fold, metrics formatted to
# four decimals.
metric_columns = [
    ('Accuracy', 'accuracy'),
    ('Precision', 'precision'),
    ('Recall', 'recall'),
    ('F1', 'f1'),
    ('AUC', 'auc'),
]

results_data = []
for fold in all_fold_results:
    row = {
        'Fold': fold['fold'],
        'Recipe': fold['test_recipe_name'][:25],
        'Train': fold['num_train'],
        'Test': fold['num_test'],
    }
    for column, metric_key in metric_columns:
        row[column] = f"{fold['metrics'][metric_key]:.4f}"
    results_data.append(row)

df_results = pd.DataFrame(results_data)
print("\nDetailed Results per Fold:")
print(df_results.to_string(index=False))

# Summary table: mean / std / min / max of each metric across folds.
rule = "=" * 80
print("\n" + rule)
print("Summary Statistics:")
print(rule)
stat_columns = [('Mean', 'mean'), ('Std', 'std'), ('Min', 'min'), ('Max', 'max')]
summary_data = []
for metric in ['accuracy', 'precision', 'recall', 'f1', 'auc']:
    summary_row = {'Metric': metric.upper()}
    for column, stat_key in stat_columns:
        summary_row[column] = f"{avg_metrics[metric][stat_key]:.4f}"
    summary_data.append(summary_row)
df_summary = pd.DataFrame(summary_data)
print(df_summary.to_string(index=False))
print(rule)

## 10. Save Results

In [None]:
# Save the run configuration, the averaged metrics and every fold's results as
# JSON, and the per-fold table as a CSV file next to it.
os.makedirs(CONFIG['save_dir'], exist_ok=True)

# Take the timestamp once so the value stored in the file and the one embedded
# in the file name always agree.
run_time = datetime.now()

results = {
    'timestamp': run_time.strftime('%Y-%m-%d %H:%M:%S'),
    'configuration': CONFIG,
    # The raw per-fold 'values' lists are dropped; they are already contained
    # in 'fold_results'.
    'average_metrics': {k: {key: val for key, val in v.items() if key != 'values'}
                       for k, v in avg_metrics.items()},
    'fold_results': all_fold_results
}

results_file = os.path.join(
    CONFIG['save_dir'],
    f"loro_cv_{CONFIG['model_type']}_{CONFIG['split']}_{run_time.strftime('%Y%m%d_%H%M%S')}.json"
)

with open(results_file, 'w') as f:
    json.dump(results, f, indent=2)

print(f"✅ Results saved to: {results_file}")

# Save results table as CSV.
# splitext() swaps only the final extension; str.replace() would also rewrite
# a '.json' occurring elsewhere in the path (e.g. in the directory name).
csv_file = os.path.splitext(results_file)[0] + '.csv'
df_results.to_csv(csv_file, index=False)
print(f"✅ Results table saved to: {csv_file}")

## 11. Conclusion

### Key Findings

Run this cell to see a summary of your results:

In [None]:
# Final report: headline metrics plus the best- and worst-scoring recipes by F1.
banner = "=" * 80

print("\n" + banner)
print("TASK VERIFICATION - FINAL SUMMARY")
print(banner)
print(f"\nModel: {CONFIG['model_type'].upper()}")
print(f"Total Recipes: {len(recipe_ids)}")
print(f"Total Videos: {len(data_dict['recording_ids'])}")
print(f"\nOverall Performance:")
for heading, metric_key in (("F1 Score:", 'f1'), ("AUC Score:", 'auc'), ("Accuracy:", 'accuracy')):
    metric_summary = avg_metrics[metric_key]
    # Headings are padded to a common width so the numbers line up.
    print(f"  {heading:<10s} {metric_summary['mean']:.4f} ± {metric_summary['std']:.4f}")


def _fold_f1(fold_result):
    """Return the F1 score stored for one fold."""
    return fold_result['metrics']['f1']


# Identify best and worst recipes
best_fold = max(all_fold_results, key=_fold_f1)
worst_fold = min(all_fold_results, key=_fold_f1)

for caption, chosen_fold in (("Best Recipe:", best_fold), ("Worst Recipe:", worst_fold)):
    print(f"\n{caption}")
    print(f"  {chosen_fold['test_recipe_name']} (F1: {_fold_f1(chosen_fold):.4f})")

print(f"\n{banner}")
print("✅ Task Verification Training Complete!")
print(banner)

## Next Steps

1. **Try different models**: Change `CONFIG['model_type']` to 'mlp' and rerun
2. **Hyperparameter tuning**: Adjust learning rate, batch size, hidden dimensions
3. **Feature analysis**: Investigate which steps contribute most to predictions
4. **Error analysis**: Examine misclassified videos
5. **Ensemble methods**: Combine transformer and MLP models
6. **Use validation split**: Try training on train+val splits together