# HiERO Step Localization (Clean Version)

**Function**: Use the HiERO model to perform step segmentation on EgoVLP features.
**Features**:
1. Resolves the environment dependency conflicts (Hydra/NetworkX).
2. Segments steps with spectral clustering on the HiERO features.
3. Works around the PyTorch `weights_only` checkpoint-loading error automatically.

In [None]:
# 1. Environment Setup
import os

# Clone HiERO repository
if not os.path.exists('/content/HiERO'):
    !git clone https://github.com/T-Larm/HiERO_for_egovlp.git /content/HiERO

# Install core dependencies (using verified stable versions)
print("üì¶ Installing dependencies...")
!pip install -q hydra-core omegaconf einops torch_kmeans networkx

# Install PyTorch Geometric (auto-match current Colab Torch version)
!pip install -q torch-geometric
!pip install -q torch-scatter torch-sparse -f https://data.pyg.org/whl/torch-2.5.1+cu124.html

print("‚úÖ Environment installation complete! Please proceed to the next step.")

In [None]:
# 2. Mount Drive and Path Configuration
# Mounts Google Drive, puts the HiERO checkout on sys.path, and defines the
# input/output locations used by the following cells.
# NOTE: `os` is not imported here; this cell relies on the import made by the
# environment-setup cell having been executed in the same kernel.
import sys
from google.colab import drive

# Mount Google Drive (only attempted when /content/drive does not exist yet)
if not os.path.exists('/content/drive'):
    drive.mount('/content/drive', force_remount=True)

# Add HiERO to system path (inserted first so it takes precedence on import;
# the model-loading cell imports `models.hiero` from this checkout)
if '/content/HiERO' not in sys.path:
    sys.path.insert(0, '/content/HiERO')

# ================= Path Configuration =================
# 1. Annotation file path: the project repository is cloned on first use.
#    To read annotations from another location (e.g. Drive), edit PROJECT_ROOT.
PROJECT_ROOT = "/content/aml-2025-mistake-detection-gp"
if not os.path.exists(PROJECT_ROOT):
    # IPython substitutes $PROJECT_ROOT with the Python variable defined above
    !git clone https://github.com/T-Larm/aml-2025-mistake-detection-gp.git $PROJECT_ROOT

ANNOTATIONS_PATH = os.path.join(PROJECT_ROOT, "annotations/annotation_json/complete_step_annotations.json")

# 2. Feature and model paths on Google Drive (adjust to your own Drive layout)
EGOVLP_FEATURES_DIR = "/content/drive/MyDrive/AMLproject/our_features/gopro/segments/egovlp"
HIERO_CHECKPOINT = "/content/drive/MyDrive/AMLproject/Captain_Cook_dataset/hiero_egovlp/hiero_egovlp.pth"
OUTPUT_DIR = "/content/drive/MyDrive/AMLproject/extension1_outputs"

# Create the output directory up front so the save steps cannot fail on a missing folder
os.makedirs(OUTPUT_DIR, exist_ok=True)
print(f"üìÇ Reading features: {EGOVLP_FEATURES_DIR}")
print(f"üìÇ Output results: {OUTPUT_DIR}")

In [None]:
# 3. Load HiERO Model (Fixed Version)
# Builds the HiERO network from the configuration stored in the checkpoint (or
# a built-in fallback configuration) and loads its weights.
# Defines `device` and, when the checkpoint file exists, `hiero_model`.
import torch
import yaml
import hydra
from models.hiero import HiERO

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

if os.path.exists(HIERO_CHECKPOINT):
    print("üöÄ Loading model...")

    # NOTE(security): weights_only=False makes torch.load unpickle arbitrary
    # Python objects. Only load checkpoints that come from a trusted source.
    try:
        checkpoint = torch.load(HIERO_CHECKPOINT, map_location='cpu', weights_only=False)
    except TypeError:
        # torch.load in this PyTorch build does not accept `weights_only`
        checkpoint = torch.load(HIERO_CHECKPOINT, map_location='cpu')

    # Process configuration (fall back to a default when the checkpoint has none)
    if 'config' in checkpoint:
        config = checkpoint['config']
    else:
        config = {'model': {'conv': 'TDGC', 'hidden_size': 256, 'k': 2.0, 'n_layers': 2}}

    model_config = config.get('model', {}).copy()

    # Clean up Hydra residual parameters; they are not constructor arguments
    for key in ['_target_', '_recursive_', '_convert_']:
        model_config.pop(key, None)

    # A bare string for `conv` is wrapped into the {'name': ...} mapping form
    if isinstance(model_config.get('conv'), str):
        model_config['conv'] = {'name': model_config['conv']}

    # Initialize. If the stored config already carries `input_size`, use it
    # rather than passing the keyword twice (which would raise TypeError).
    input_size = model_config.pop('input_size', 256)
    hiero_model = HiERO(input_size=input_size, **model_config)

    # Load weights. Strip only a *leading* "module." prefix; a plain
    # str.replace would also mangle keys that merely contain that substring
    # (e.g. "submodule.weight").
    state_dict = checkpoint.get('state_dict', checkpoint.get('model', checkpoint))
    prefix = 'module.'
    new_state_dict = {
        (k[len(prefix):] if k.startswith(prefix) else k): v
        for k, v in state_dict.items()
    }

    # strict=False tolerates key mismatches, so report them instead of
    # silently continuing with partially (or not at all) loaded weights.
    load_result = hiero_model.load_state_dict(new_state_dict, strict=False)
    if load_result.missing_keys or load_result.unexpected_keys:
        print(f"   Warning: {len(load_result.missing_keys)} missing and "
              f"{len(load_result.unexpected_keys)} unexpected keys while loading weights")
        if load_result.missing_keys:
            print(f"     missing (first 5): {list(load_result.missing_keys)[:5]}")
        if load_result.unexpected_keys:
            print(f"     unexpected (first 5): {list(load_result.unexpected_keys)[:5]}")

    hiero_model = hiero_model.to(device).eval()
    print("‚úÖ HiERO model loaded successfully!")
else:
    print(f"‚ùå Model file not found: {HIERO_CHECKPOINT}")

In [None]:
# 4. Define core segmentation function (includes Spectral Clustering and post-processing)
from sklearn.cluster import SpectralClustering
from torch_geometric.data import Data
import numpy as np

def _labels_to_boundaries(labels, min_len=5):
    """Convert per-frame cluster labels into inclusive (start, end) segments.

    A boundary is emitted only where the label changes between two consecutive
    frames and the segment accumulated so far is at least ``min_len`` frames
    long; otherwise the change is ignored and the frames keep accumulating in
    the current segment. A trailing segment shorter than ``min_len`` is dropped.
    """
    boundaries = []
    n_frames = len(labels)
    start = 0
    for i in range(1, n_frames):
        # Compare with the previous frame so cuts only land on real label changes
        if labels[i] != labels[i - 1] and i - start >= min_len:
            boundaries.append((start, i - 1))
            start = i
    if n_frames - start >= min_len:
        boundaries.append((start, n_frames - 1))
    return boundaries


def detect_steps_with_hiero(features, model, device, n_clusters='auto', use_spectral=True, stride_seconds=None, fps=30):
    """Segment one video into steps by clustering HiERO-refined features.

    The features are wrapped in a temporal chain graph, passed through
    ``model``, and the resulting node features are grouped with spectral
    clustering on their cosine similarity. Consecutive frames with the same
    label form a step.

    Args:
        features: 2-D numpy array of shape (T, D), one row per feature window.
        model: HiERO model, called with a torch_geometric ``Data`` object.
        device: torch device on which the graph tensors are created.
        n_clusters: Number of clusters, or 'auto' to use
            ``max(3, min(T // 30, 15))``. Capped at T in either case.
        use_spectral: Currently unused; spectral clustering is always applied.
        stride_seconds: Feature extraction stride in seconds (e.g., 1.0 for 1s). If None, uses stride=16 frames.
        fps: Video frame rate (default: 30)

    Returns:
        Tuple ``(boundaries, step_embeddings)``: ``boundaries`` is a list of
        inclusive ``(start, end)`` index pairs into the T rows, and
        ``step_embeddings`` is an array with one mean feature vector per
        boundary (shape ``(0, feature_dim)`` when no step was found).
    """
    T, D = features.shape

    # 1. Auto-estimate number of clusters
    if n_clusters == 'auto':
        n_clusters = max(3, min(T // 30, 15))
    # Clustering cannot produce more clusters than there are samples
    n_clusters = max(1, min(int(n_clusters), T))

    # 2. Construct graph data (with all required attributes for HiERO)
    x = torch.from_numpy(features).float().to(device)

    # Bidirectional chain i <-> i+1, laid out as (i, i+1), (i+1, i), ... per i.
    # The explicit column count keeps the reshape valid when there are no edges.
    n_links = max(T - 1, 0)
    src = torch.arange(n_links, dtype=torch.long)
    forward = torch.stack([src, src + 1])
    backward = torch.stack([src + 1, src])
    edge_index = (
        torch.stack([forward, backward], dim=2)
        .reshape(2, 2 * n_links)
        .contiguous()
        .to(device)
    )

    # Calculate stride_frames based on metadata or default
    if stride_seconds is not None:
        # Use stride from file metadata (e.g., 1s_1s.npz means 1 second stride)
        stride_frames = stride_seconds * fps
    else:
        # Default EgoVLP config: stride=16 frames
        stride_frames = 16

    # Create Data object with the attributes the model is given alongside x/edge_index
    graph_data = Data(x=x, edge_index=edge_index)
    graph_data.batch = torch.zeros(T, dtype=torch.long, device=device)
    # Window-centre timestamps in seconds
    graph_data.pos = ((0.5 + torch.arange(T, dtype=torch.float, device=device)) * stride_frames / fps).unsqueeze(1)
    graph_data.indices = torch.arange(T, dtype=torch.long, device=device)  # Frame indices
    graph_data.mask = torch.ones(T, dtype=torch.bool, device=device)  # Valid frame mask

    # 3. Model inference
    with torch.no_grad():
        try:
            output = model(graph_data)

            # Accept a plain tensor, a list/tuple of outputs, or a Data-like
            # object and keep the first T node features in every case.
            if isinstance(output, torch.Tensor):
                hiero_features = output[:T]
            elif hasattr(output, '__len__') and len(output) > 0:
                first_output = output[0] if isinstance(output, (list, tuple)) else output
                if isinstance(first_output, Data):
                    hiero_features = first_output.x[:T]
                elif hasattr(first_output, '__getitem__'):
                    hiero_features = first_output[:T]
                else:
                    hiero_features = output.x[:T]
            elif hasattr(output, 'x'):
                hiero_features = output.x[:T]
            else:
                hiero_features = output[:T]

            hiero_features = hiero_features.cpu().numpy()

        except Exception as e:
            # Deliberate best-effort: cluster the raw input features instead
            print(f"  ‚ö†Ô∏è HiERO inference failed: {type(e).__name__}: {str(e)}")
            print(f"     Using original features instead.")
            hiero_features = features

    # 4. Spectral Clustering on cosine similarity
    features_norm = hiero_features / (np.linalg.norm(hiero_features, axis=1, keepdims=True) + 1e-8)
    affinity = features_norm @ features_norm.T

    # Ensure affinity is valid (no NaN/Inf)
    if np.isnan(affinity).any() or np.isinf(affinity).any():
        print(f"  ‚ö†Ô∏è Invalid affinity matrix, using identity")
        affinity = np.eye(len(features_norm))

    # A precomputed affinity must be symmetric and non-negative
    affinity = (affinity + affinity.T) / 2
    affinity = np.clip(affinity, 0, 1)

    clustering = SpectralClustering(n_clusters=n_clusters, affinity='precomputed', random_state=42, assign_labels='kmeans')
    labels = clustering.fit_predict(affinity)

    # 5. Convert to boundaries (with minimum-length post-processing)
    boundaries = _labels_to_boundaries(labels, min_len=5)

    # Extract one embedding per step: the mean feature over its frames
    step_embeddings = [hiero_features[s:e + 1].mean(axis=0) for s, e in boundaries]

    if step_embeddings:
        return boundaries, np.stack(step_embeddings)
    # Keep the real feature width so empty results stack with non-empty ones
    return boundaries, np.zeros((0, hiero_features.shape[1]), dtype=hiero_features.dtype)

print("‚úÖ Core algorithm function defined")

In [None]:
# 5. Batch Process Videos (Main Loop) - with JSON + padded NPZ outputs
import json
import re
from tqdm import tqdm

# Prepare ID list
with open(ANNOTATIONS_PATH, 'r', encoding='utf-8') as f:
    annotations = json.load(f)

# Map each video ID (the first two underscore-separated tokens of the file
# name) to its feature file. The directory is listed once, and the loop below
# looks files up by exact ID: a prefix match would confuse e.g. "1_7" with
# "1_70". Sorting makes the choice deterministic if several files share an ID.
feature_files = {}
for npz_name in sorted(os.listdir(EGOVLP_FEATURES_DIR)):
    if not npz_name.endswith('.npz'):
        continue
    name_parts = npz_name.split('_')
    if len(name_parts) < 2:
        continue  # no "<a>_<b>_..." ID in this file name
    feature_files.setdefault('_'.join(name_parts[:2]), npz_name)

available_features = set(feature_files)
# Sorted so processing order and the saved arrays are reproducible across runs
common_ids = sorted(set(annotations.keys()).intersection(available_features))

print(f"üöÄ Starting processing {len(common_ids)} videos...")

hiero_results = {}
failed = []
failure_reasons = {}  # video ID -> "<ExceptionType>: <message>"

# Stride encoded in the file name (e.g., "xxx_1s_1s.npz" means stride=1s)
stride_pattern = re.compile(r'_(\d+\.?\d*)s_')

for vid in tqdm(common_ids):
    try:
        # Load features (the archive is closed as soon as the array is read)
        f_name = feature_files[vid]
        with np.load(os.path.join(EGOVLP_FEATURES_DIR, f_name)) as data:
            feats = data['arr_0'] if 'arr_0' in data else data['features']

        # Clean NaNs
        if np.isnan(feats).any():
            feats = np.nan_to_num(feats)

        # Parse stride from filename; None lets the detector use its default
        stride_match = stride_pattern.search(f_name)
        stride_seconds = float(stride_match.group(1)) if stride_match else None

        # Run HiERO
        bounds, embs = detect_steps_with_hiero(
            feats, hiero_model, device,
            stride_seconds=stride_seconds, fps=30
        )

        # Convert outputs to JSON/NumPy friendly formats
        embs = np.asarray(embs, dtype=np.float32)
        # Plain [start, end] lists of Python ints (json cannot encode NumPy integers)
        bounds = [[int(start), int(end)] for start, end in bounds]

        # Record results; a video is labelled 1 if any annotated step has errors
        anno = annotations[vid]
        has_error = any(s.get('has_errors', False) for s in anno.get('steps', []))

        hiero_results[vid] = {
            'boundaries': bounds,
            'step_embeddings': embs,  # keep raw per-video (variable length)
            'video_label': 1 if has_error else 0,
            'activity': anno.get('activity_name', 'unknown')
        }

    except Exception as e:
        # Keep going with the remaining videos, but remember why this one failed
        failed.append(vid)
        failure_reasons[vid] = f"{type(e).__name__}: {e}"

print(f"\n‚úÖ Processing complete! Success: {len(hiero_results)}, Failed: {len(failed)}")
for failed_vid in failed:
    print(f"   failed {failed_vid}: {failure_reasons[failed_vid]}")

# ---------------------------
# A) Raw per-video results as an NPZ archive
#    (variable-length embeddings are kept as-is, which is handy for debugging)
# ---------------------------
raw_npz_path = os.path.join(OUTPUT_DIR, 'hiero_step_embeddings_raw.npz')
np.savez(
    raw_npz_path,
    results=hiero_results,
    failed=np.array(failed, dtype=object),
)
print(f"üíæ Raw results saved to: {raw_npz_path}")

# ---------------------------
# B) Step boundaries plus video-level metadata as JSON
#    (embeddings are left out; they are not JSON-serializable)
# ---------------------------
json_fields = ("boundaries", "video_label", "activity")
bound_json = {
    vid: {field: item[field] for field in json_fields}
    for vid, item in hiero_results.items()
}

json_path = os.path.join(OUTPUT_DIR, "hiero_step_boundaries.json")
with open(json_path, "w") as f:
    json.dump(bound_json, f, indent=2)
print(f"üíæ Boundaries JSON saved to: {json_path}")

# ---------------------------
# C) Padded embeddings NPZ: one fixed-size tensor for all videos plus a mask
#    marking which step slots hold real embeddings
# ---------------------------
vids = list(hiero_results.keys())
if not vids:
    raise RuntimeError("No successful videos to save. Check 'failed' list and upstream errors.")

# Normalise every video's embeddings to a 2-D float32 array (steps, emb_dim)
per_video_embs = []
for vid in vids:
    arr = np.asarray(hiero_results[vid]["step_embeddings"], dtype=np.float32)
    per_video_embs.append(arr.reshape(1, -1) if arr.ndim == 1 else arr)

# Embedding width comes from the first video; the step axis is padded to the longest video
emb_dim = per_video_embs[0].shape[-1]
max_steps = max(arr.shape[0] for arr in per_video_embs)

N = len(vids)
hiero_emb = np.zeros((N, max_steps, emb_dim), dtype=np.float32)
hiero_mask = np.zeros((N, max_steps), dtype=np.bool_)
labels = np.array(
    [int(hiero_results[vid]["video_label"]) for vid in vids],
    dtype=np.int64,
)

# Copy each video's steps into the leading slots of its row and flag them as valid
for row, arr in enumerate(per_video_embs):
    n_steps = arr.shape[0]
    hiero_emb[row, :n_steps, :] = arr
    hiero_mask[row, :n_steps] = True

npz_path = os.path.join(OUTPUT_DIR, "hiero_step_embeddings.npz")
np.savez(
    npz_path,
    video_ids=np.array(vids, dtype=object),
    step_embeddings=hiero_emb,
    step_mask=hiero_mask,
    labels=labels,
)
print(f"üíæ Padded embeddings NPZ saved to: {npz_path}")
print(f"üìê Shapes: step_embeddings={hiero_emb.shape}, step_mask={hiero_mask.shape}, labels={labels.shape}")
