<a href="https://colab.research.google.com/github/Sa74ll/ELM_challenge/blob/main/02_eval_offline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%cd /content
!git clone https://github.com/huggingface/lerobot.git
%cd /content/lerobot
!pip install -e .


In [None]:
# Log in to the Hugging Face Hub through its command-line tool.
# NOTE(review): presumably needed to download the checkpoint/dataset used below —
# confirm whether those repositories are actually gated or private.
!huggingface-cli login

In [None]:
# Install the local lerobot checkout in editable mode together with its
# optional "smolvla" extra.
%cd /content/lerobot
!pip install -e ".[smolvla]"

In [None]:
from pathlib import Path
import torch
from torch.utils.data import DataLoader, Subset
import numpy as np

from lerobot.datasets.lerobot_dataset import LeRobotDataset, LeRobotDatasetMetadata
from lerobot.policies.smolvla.modeling_smolvla import SmolVLAPolicy
from lerobot.policies.factory import make_pre_post_processors

# CONFIG — values a reader may want to tune
DATASET_REPO = "lerobot/svla_so101_pickplace"  # Hub id of the evaluation dataset
BATCH_SIZE = 24                                # samples per DataLoader batch
# Run on the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")


Load policy

In [None]:
# Load the policy checkpoint from the Hugging Face Hub, plus the dataset statistics.

policy = SmolVLAPolicy.from_pretrained(
    "Sa74ll/smolvla_so101_color_aug_best_model1"  # best checkpoint saved on the Hub
)
policy.to(DEVICE)
policy.eval()  # evaluation mode; no training happens in this notebook

meta = LeRobotDatasetMetadata(DATASET_REPO)
# Only the preprocessor is kept; the second returned processor is discarded.
preprocessor, _ = make_pre_post_processors(
    policy.config,
    dataset_stats=meta.stats,
)

# Values needed later to build the action timestamps.
chunk_size = policy.config.chunk_size
fps = meta.fps
print("chunk_size:", chunk_size)
print("fps:", fps)


Loading  HuggingFaceTB/SmolVLM2-500M-Video-Instruct weights ...
Reducing the number of VLM layers to 16 ...
chunk_size: 50
fps: 30


In [None]:
# Per-dimension action statistics of the dataset, converted to tensors.
# min/max feed the tolerance metric, mean/std undo the action normalisation.
action_stats = meta.stats["action"]
action_min, action_max, action_mean, action_std = (
    torch.tensor(action_stats[stat_name])
    for stat_name in ("min", "max", "mean", "std")
)


In [None]:

# Time offsets (in seconds, relative to the sampled frame) requested per key.
# The policy predicts `chunk_size` actions, so the dataset has to return the same
# number of ground-truth actions: one per frame, i.e. spaced 1 / fps apart.
action_offsets = [step / fps for step in range(chunk_size)]

delta_timestamps = {
    # observations: current frame only
    "observation.state": [0.0],
    "observation.images.up": [0.0],
    "observation.images.side": [0.0],
    # actions: current frame plus the following chunk_size - 1 frames
    "action": action_offsets,
}

In [None]:
# Validation split: every frame that belongs to an episode with index >= 40.

# A plain dataset (no delta_timestamps) is loaded first, only to read the
# episode index of each frame.
base_ds = LeRobotDataset(DATASET_REPO, video_backend="pyav")
episode_idx = np.array(base_ds.hf_dataset["episode_index"])
val_indices = [pos for pos in range(len(episode_idx)) if episode_idx[pos] >= 40]

# The dataset used for evaluation returns action chunks via delta_timestamps.
val_full = LeRobotDataset(DATASET_REPO, delta_timestamps=delta_timestamps, video_backend="pyav")
val_ds = Subset(val_full, val_indices)
val_loader = DataLoader(
    val_ds,
    batch_size=BATCH_SIZE,
    shuffle=False,      # keep dataset order
    num_workers=4,
    pin_memory=True,
)
print(f"val samples: {len(val_ds)}")


val samples: 2759


In [None]:
# helpers
def fix_keys(batch):
    """Rename the dataset's camera keys to the names passed on to the policy.

    "observation.images.up" becomes "observation.images.camera1" and
    "observation.images.side" becomes "observation.images.camera2".
    Keys that are absent are skipped. The mapping is modified in place and
    also returned.
    """
    camera_renames = (
        ("observation.images.up", "observation.images.camera1"),
        ("observation.images.side", "observation.images.camera2"),
    )
    for dataset_key, policy_key in camera_renames:
        if dataset_key in batch:
            batch[policy_key] = batch.pop(dataset_key)
    return batch

def unnormalize_pred(pred_norm: torch.Tensor) -> torch.Tensor:
    """Map normalised actions back to raw dataset units.

    Inverts mean/std normalisation with the module-level ``action_mean`` and
    ``action_std`` tensors: ``pred_norm * action_std + action_mean``.

    Args:
        pred_norm: Normalised actions whose last dimension broadcasts against
            the action statistics.

    Returns:
        The actions in raw dataset space, on the same device as ``pred_norm``.
    """
    # Move the statistics to the prediction's device so the helper also works
    # on GPU tensors; for CPU inputs this is a no-op.
    std = action_std.to(pred_norm.device)
    mean = action_mean.to(pred_norm.device)
    return pred_norm * std + mean



In [None]:
# eval: predict an action chunk for every validation sample and keep the first
# predicted action next to the first ground-truth action, both in raw dataset units.

# Seed torch's global RNG so that repeated runs of this cell are comparable.
# NOTE(review): assumes any sampling noise inside the policy comes from torch's
# global RNG — confirm.
EVAL_SEED = 0
torch.manual_seed(EVAL_SEED)

# Per-batch results are collected under their own names, so the final
# `all_*_raw` names only ever hold tensors.
pred_batches = []
gt_batches = []

with torch.no_grad():
    for raw in val_loader:
        # Ground truth in RAW (dataset) space, copied before the preprocessor runs.
        # Only the first of the 50 actions is kept: (B, T, 6) -> (B, 6)
        gt_raw = raw["action"][:, 0, :].clone()

        raw = fix_keys(raw)
        batch = preprocessor(raw)

        # Make sure every tensor of the batch is on the evaluation device.
        for k, v in list(batch.items()):
            if torch.is_tensor(v):
                batch[k] = v.to(DEVICE)

        # Model inference -> normalised action sequence
        pred_seq = policy.predict_action_chunk(batch)    # (B, 50, 6)
        pred_step0 = pred_seq[:, 0, :].cpu()             # (B, 6) normalised

        # Bring the prediction back to RAW space
        pred_raw = unnormalize_pred(pred_step0)          # (B, 6)

        pred_batches.append(pred_raw)
        gt_batches.append(gt_raw)

all_preds_raw = torch.cat(pred_batches, dim=0)
all_gts_raw = torch.cat(gt_batches, dim=0)

print("Collected preds:", all_preds_raw.shape, "Ground truths:", all_gts_raw.shape)




Collected preds: torch.Size([2759, 6]) Ground truths: torch.Size([2759, 6])


In [None]:
# Metrics: a prediction counts as a success for a joint when its absolute error
# is at most 5% of that joint's own range (max - min from the dataset statistics).

joint_ranges = action_max - action_min
tol = 0.05 * joint_ranges                         # per-joint tolerance

abs_err = (all_preds_raw - all_gts_raw).abs()     # (N,6)
within_5pr = torch.le(abs_err, tol)               # boolean mask, True = within tolerance

# Share of samples within tolerance, per joint, as a percentage.
per_joint_success = within_5pr.float().mean(dim=0).mul(100.0)
overall_mean = per_joint_success.mean().item()

print("\n========== EVAL (per-joint 5%) ==========")
for j, s in enumerate(per_joint_success.tolist()):
    print(f"joint {j}: {s:.2f}%")

print(f"\nAverage per-joint success (5%): {overall_mean:.2f}%")
print("=========================================\n")


joint 0: 45.81%
joint 1: 47.88%
joint 2: 70.06%
joint 3: 77.46%
joint 4: 60.86%
joint 5: 63.43%

Average per-joint success (5%): 60.92%

