In [1]:
import os, random, math, json, shutil
from PIL import Image, ImageDraw, ImageFont, ImageFilter
import numpy as np
import torch, torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.metrics import confusion_matrix, accuracy_score
import matplotlib.pyplot as plt

In [2]:
# Config & output directories
OUT_ROOT = "vla_benchmark_output"
DATA_DIR = os.path.join(OUT_ROOT, "dataset")
IMG_DIR = os.path.join(DATA_DIR, "images")
os.makedirs(IMG_DIR, exist_ok=True)
ARTIFACTS_DIR = os.path.join(OUT_ROOT, "artifacts")
os.makedirs(ARTIFACTS_DIR, exist_ok=True)

IMG_SIZE = 128
ACTIONS = ["turn_left", "turn_right", "go_straight", "stop", "slow_down"]
DATASET_SIZE = 1000
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x20749973cf0>

In [3]:
# Text templates (diverse)
INSTR_TEMPLATES = {
    "turn_left": ["Turn left", "Take a left", "Left at the intersection", "Make a left turn soon"],
    "turn_right": ["Turn right", "Take a right", "Right at the intersection", "Make a right turn soon"],
    "go_straight": ["Go straight", "Keep going", "Continue forward", "Proceed straight ahead"],
    "stop": ["Stop", "Halt", "Bring vehicle to a stop", "Completely stop the vehicle"],
    "slow_down": ["Slow down", "Reduce speed", "Caution - slow down", "Decelerate"]
}

# Additional language variations for contradictory instructions
NEGATION_TEMPLATES = [
    "Ignore the sign, {}",
    "Despite the visual cue, {}",
    "Do not follow the sign, {}"
]

In [4]:
# Graphics helper functions
def random_color(minv=100, maxv=255):
    return tuple(np.random.randint(minv, maxv, size=3).tolist())

def make_background(w, h):
    # Gradient + slight road texture
    base = Image.new("RGB", (w,h), random_color(160,220))
    draw = ImageDraw.Draw(base)
    for y in range(h):
        c = int(160 + 60 * (y / h) + np.random.randint(-6,6))
        draw.line([(0,y),(w,y)], fill=(c,c,c))
    # Optional horizontal faint stripes to simulate road texture
    if random.random() < 0.6:
        for i in range(0, h, 12):
            draw.line([(0, i+np.random.randint(-2,2)), (w, i+np.random.randint(-2,2))], fill=(200,200,200,20))
    return base

def add_gaussian_noise(img, sigma=6):
    arr = np.array(img).astype(np.float32)
    noise = np.random.normal(0, sigma, arr.shape)
    arr = np.clip(arr + noise, 0, 255).astype(np.uint8)
    return Image.fromarray(arr)

def random_affine(img):
    # Apply small affine transform (shear/scale/translate)
    w,h = img.size
    max_shear = 0.15
    max_translate = 0.08
    sx = random.uniform(-max_shear, max_shear)
    sy = random.uniform(-max_shear, max_shear)
    tx = random.uniform(-max_translate, max_translate) * w
    ty = random.uniform(-max_translate, max_translate) * h
    # Matrix for PIL transform: (a, b, c, d, e, f)
    a = 1 + sx
    b = sy
    c = tx
    d = -sy
    e = 1 + sx
    f = ty
    return img.transform(img.size, Image.AFFINE, (a,b,c,d,e,f), resample=Image.BICUBIC)

# Draw arrow polygon (centered) and rotate/scale/paste
def draw_arrow(img, angle_deg, color=(10,10,10), thickness=None, length=None, center_jitter=8):
    w,h = img.size
    cx = w//2 + random.randint(-center_jitter, center_jitter)
    cy = h//2 + random.randint(-center_jitter, center_jitter)
    length = length or random.uniform(30,60)
    thickness = thickness or random.randint(5,12)
    arrow = Image.new("RGBA", img.size, (0,0,0,0))
    d = ImageDraw.Draw(arrow)
    # Draw shaft
    rad = math.radians(angle_deg)
    x2 = cx + length * math.cos(rad)
    y2 = cy + length * math.sin(rad)
    d.line([(cx,cy),(x2,y2)], fill=color+(255,), width=thickness)
    # Arrow head
    head_len = max(10, int(length * 0.25))
    left = (x2 - head_len*math.cos(rad-0.6), y2 - head_len*math.sin(rad-0.6))
    right = (x2 - head_len*math.cos(rad+0.6), y2 - head_len*math.sin(rad+0.6))
    d.polygon([ (x2,y2), left, right ], fill=color+(255,))
    arrow = arrow.rotate(random.uniform(-7,7), expand=False)
    img.paste(arrow, (0,0), arrow)

def draw_stop_sign(img):
    w,h = img.size
    cx = w//2 + random.randint(-10,10)
    cy = h//2 + random.randint(-10,10)
    r = random.randint(18,32)
    sign = Image.new("RGBA", img.size, (0,0,0,0))
    d = ImageDraw.Draw(sign)
    poly = []
    rot = random.random()*2*math.pi
    for i in range(8):
        ang = 2*math.pi*(i/8) + rot
        x = cx + r*math.cos(ang)
        y = cy + r*math.sin(ang)
        poly.append((x,y))
    d.polygon(poly, fill=(180+random.randint(-30,30), 0, 0, 255))
    # STOP text
    try:
        font = ImageFont.truetype("DejaVuSans-Bold.ttf", random.randint(12,18))
    except:
        font = ImageFont.load_default()
    bbox = font.getbbox("STOP")
    tw, th = bbox[2]-bbox[0], bbox[3]-bbox[1]
    d.text((cx-tw/2, cy-th/2), "STOP", fill=(255,255,255,255), font=font)
    sign = sign.rotate(random.uniform(-25,25), expand=False)
    img.paste(sign, (0,0), sign)

def draw_obstacle(img):
    w,h = img.size
    cx = w//2 + random.randint(-18,18)
    cy = h//2 + random.randint(-18,18)
    r = random.randint(10,28)
    obs = Image.new("RGBA", img.size, (0,0,0,0))
    d = ImageDraw.Draw(obs)
    color = (90+random.randint(-25,25), 90+random.randint(-25,25), 90+random.randint(-25,25), 255)
    d.ellipse([(cx-r,cy-r),(cx+r,cy+r)], fill=color)
    # Optional stripes
    if random.random() < 0.5:
        d.line([(cx-r, cy), (cx+r,cy)], width=max(1, r//6), fill=(60,60,60,255))
    obs = obs.rotate(random.uniform(-10,10), expand=False)
    img.paste(obs, (0,0), obs)

def draw_random_distractors(img, n=2):
    w,h = img.size
    for _ in range(n):
        typ = random.choice(["circle","tri","rect","small_arrow"])
        layer = Image.new("RGBA", img.size, (0,0,0,0))
        d = ImageDraw.Draw(layer)
        cx = random.randint(10,w-10); cy = random.randint(10,h-10)
        s = random.randint(6,24)
        color = (random.randint(50,200), random.randint(50,200), random.randint(50,200), 200)
        if typ == "circle":
            d.ellipse([(cx-s,cy-s),(cx+s,cy+s)], fill=color)
        elif typ == "tri":
            d.polygon([(cx,cy-s),(cx-s,cy+s),(cx+s,cy+s)], fill=color)
        elif typ == "rect":
            d.rectangle([(cx-s,cy-s),(cx+s,cy+s)], fill=color)
        elif typ == "small_arrow":
            # Small arrow as distractor
            draw_arrow(layer, random.choice([0,90,180,270]), color=(30,30,30), thickness=4, length=18)
        img.paste(layer, (0,0), layer)

def occlude_random(img):
    w,h = img.size
    if random.random() < 0.35:
        layer = Image.new("RGBA", img.size, (0,0,0,0))
        d = ImageDraw.Draw(layer)
        x1 = random.randint(0, w//2); y1 = random.randint(0, h//2)
        x2 = random.randint(w//2, w); y2 = random.randint(h//2, h)
        d.rectangle([(x1,y1),(x2,y2)], fill=(random.randint(100,200),)*3 + (255,))
        img.paste(layer, (0,0), layer)

In [5]:
# Synthetic scene generator
def generate_scene(action, make_contradictory=False):
    img = make_background(IMG_SIZE, IMG_SIZE)
    # Add lane markings sometimes
    if random.random() < 0.5:
        draw = ImageDraw.Draw(img)
        for i in range(0, IMG_SIZE, 20):
            x = IMG_SIZE//2 + random.randint(-6,6)
            draw.rectangle([(x, i), (x+3, i+10)], fill=(230,230,230))
    # Place primary object (will add distractors too)
    if action == "turn_left":
        draw_arrow(img, 180, color=(10,10,10))
    elif action == "turn_right":
        draw_arrow(img, 0, color=(10,10,10))
    elif action == "go_straight":
        draw_arrow(img, 270, color=(10,10,10))
    elif action == "stop":
        draw_stop_sign(img)
    elif action == "slow_down":
        draw_obstacle(img)
    # Add distractors and multiple objects
    if random.random() < 0.7:
        draw_random_distractors(img, n=random.randint(0,2))
    # Sometimes add an extra driving-related object (to make multi-object scenes)
    if random.random() < 0.35:
        extra = random.choice(["stop","turn_left","turn_right","slow_down"])
        if extra == "stop":
            draw_stop_sign(img)
        elif extra == "turn_left":
            draw_arrow(img, 180)
        elif extra == "turn_right":
            draw_arrow(img, 0)
        else:
            draw_obstacle(img)
    # Occlusion and noise
    occlude_random(img)
    if random.random() < 0.5:
        img = img.filter(ImageFilter.GaussianBlur(radius=random.uniform(0,1.8)))
    img = add_gaussian_noise(img, sigma=random.randint(3,10))
    # Random affine perspective
    if random.random() < 0.5:
        img = random_affine(img)
    return img

In [6]:
# Dataset generation with instruction logic
def generate_dataset(n_samples=DATASET_SIZE, out_dir=IMG_DIR):
    samples = []
    for i in range(n_samples):
        # Create class
        action = random.choice(ACTIONS)
        # Decide whether this sample will be contradictory (language overrides vision)
        is_contradictory = (random.random() < 0.05)  # ~5% contradictory
        img = generate_scene(action)
        # Choose instruction: sometimes contradictory, sometimes normal, sometimes more verbose
        instr = random.choice(INSTR_TEMPLATES[action])
        if random.random() < 0.15:
            # Verbose paraphrase
            instr = instr + ". " + random.choice(["Careful.", "Proceed with caution.", "Follow instruction."])
        if is_contradictory:
            # Pick a different ground-truth action that the instruction mandates
            alt = random.choice([a for a in ACTIONS if a != action])
            # Make instruction assert alt but maybe with negation flips
            base = random.choice(INSTR_TEMPLATES[alt])
            # Sometimes prepend negation to create complex language
            if random.random() < 0.5:
                instr = random.choice(NEGATION_TEMPLATES).format(base)
            else:
                instr = base
            ground_truth_action = alt
        else:
            ground_truth_action = action
        fname = f"img_{i:05d}.png"
        path = os.path.join(out_dir, fname)
        img.save(path)
        samples.append({"image": fname, "instruction": instr, "action": ground_truth_action, "orig_visual": action, "contradictory": is_contradictory})
    return samples

print("Generating dataset (this may take a moment)...")
samples = generate_dataset()
with open(os.path.join(DATA_DIR, "manifest.json"), "w") as f:
    json.dump(samples, f, indent=2)
print("Saved manifest with", len(samples), "samples to", DATA_DIR)

Generating dataset (this may take a moment)...
Saved manifest with 1000 samples to vla_benchmark_output\dataset


In [7]:
# Dataset class with preprocessing contained
class VLADataset(Dataset):
    def __init__(self, samples, img_dir, max_tokens=12):
        self.samples = samples
        self.img_dir = img_dir
        self.max_tokens = max_tokens
        self.vocab = self.build_vocab(samples)
        
    def build_vocab(self, samples):
        vocab = {"<pad>":0, "<unk>":1}
        idx = 2
        for s in samples:
            tokens = s["instruction"].lower().replace(",", "").replace(".", "").split()
            for t in tokens:
                if t not in vocab:
                    vocab[t] = idx; idx += 1
        return vocab
    
    def encode_text(self, text):
        toks = text.lower().replace(",", "").replace(".", "").split()
        ids = [self.vocab.get(t, self.vocab["<unk>"]) for t in toks][:self.max_tokens]
        if len(ids) < self.max_tokens:
            ids += [self.vocab["<pad>"]] * (self.max_tokens - len(ids))
        return torch.tensor(ids, dtype=torch.long)
    
    def preprocess_image(self, path):
        img = Image.open(path).convert("RGB").resize((IMG_SIZE, IMG_SIZE))
        arr = np.array(img).astype(np.float32)/255.0
        arr = (arr - 0.5)/0.5
        return torch.tensor(arr).permute(2,0,1)
    
    def __len__(self):
        return len(self.samples)
    
    def __getitem__(self, idx):
        s = self.samples[idx]
        img_t = self.preprocess_image(os.path.join(self.img_dir, s["image"]))
        text_ids = self.encode_text(s["instruction"])
        label = ACTIONS.index(s["action"])
        return img_t, text_ids, label, s

In [8]:
# Create train & validation dataloaders
dataset = VLADataset(samples, IMG_DIR)
train_n = int(0.8 * len(dataset)); val_n = len(dataset) - train_n
train_ds, val_ds = random_split(dataset, [train_n, val_n], generator=torch.Generator().manual_seed(SEED))
BATCH_SIZE = 64
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False)
print(f"Dataset ready. Vocab size: {len(dataset.vocab)}. Train/Val sizes: {len(train_ds)}/{len(val_ds)}")

Dataset ready. Vocab size: 44. Train/Val sizes: 800/200


In [9]:
# Models: small CNN, tiny text encoder, vision-only model, fused model
class SmallCNN(nn.Module):
    def __init__(self, out_dim=256):
        super().__init__()
        self.net = nn.Sequential(
            nn.Conv2d(3, 32, 5, stride=2, padding=2), nn.ReLU(), nn.MaxPool2d(2),
            nn.Conv2d(32, 64, 3, stride=1, padding=1), nn.ReLU(), nn.MaxPool2d(2),
            nn.Conv2d(64, 128, 3, stride=1, padding=1), nn.ReLU(), nn.AdaptiveAvgPool2d((1,1)),
            nn.Flatten(), nn.Linear(128, out_dim), nn.ReLU()
        )
    def forward(self, x):
        return self.net(x)

class TinyTextEncoder(nn.Module):
    def __init__(self, vocab_size, emb_dim=128, max_tokens=12):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.max_tokens = max_tokens
    
    def forward(self, x):
        e = self.emb(x)  # (B, T, emb)
        return e.mean(dim=1)  # (B, emb)

class VisionOnlyModel(nn.Module):
    def __init__(self, num_actions=len(ACTIONS)):
        super().__init__()
        self.vision = SmallCNN(out_dim=256)
        self.classifier = nn.Sequential(nn.Linear(256,128), nn.ReLU(), nn.Linear(128, num_actions))
    
    def forward(self, img, text_ids=None):
        v = self.vision(img)
        return self.classifier(v)

class FusedVLA(nn.Module):
    def __init__(self, vocab_size, vision_dim=256, text_dim=128, num_actions=len(ACTIONS)):
        super().__init__()
        self.vision = SmallCNN(out_dim=vision_dim)
        self.text = TinyTextEncoder(vocab_size, text_dim)
        fusion_dim = vision_dim + text_dim
        self.fuse = nn.Sequential(nn.Linear(fusion_dim, 512),
                                  nn.ReLU(),
                                  nn.Dropout(0.3),
                                  nn.Linear(512,256),
                                  nn.ReLU(),
                                  nn.Dropout(0.2),
                                  nn.Linear(256,128),
                                  nn.ReLU())

        self.classifier = nn.Linear(128, num_actions)
    
    def forward(self, img, text_ids):
        v = self.vision(img)
        t = self.text(text_ids)
        x = torch.cat([v,t], dim=1)
        x = self.fuse(x)
        return self.classifier(x)

In [10]:
# Training & evaluation utilities
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

def train_model(model, train_loader, val_loader, epochs=12, lr=1e-3, name="model"):
    model = model.to(device)
    opt = torch.optim.Adam(model.parameters(), lr=lr)
    crit = nn.CrossEntropyLoss()
    history = {"train_loss":[], "train_acc":[], "val_loss":[], "val_acc":[]}
    for ep in range(1, epochs+1):
        # Train
        model.train()
        t_loss = 0.0; t_preds=[]; t_labels=[]
        for imgs, txts, labels, _ in train_loader:
            imgs = imgs.to(device); txts = txts.to(device); labels = labels.to(device)
            opt.zero_grad()
            logits = model(imgs, txts) if txts is not None else model(imgs)
            loss = crit(logits, labels)
            loss.backward(); opt.step()
            t_loss += loss.item() * imgs.size(0)
            t_preds += logits.argmax(dim=1).cpu().tolist()
            t_labels += labels.cpu().tolist()
        train_loss = t_loss / len(train_loader.dataset)
        train_acc = accuracy_score(t_labels, t_preds)

        # Validate
        model.eval()
        v_loss = 0.0; v_preds=[]; v_labels=[]
        with torch.no_grad():
            for imgs, txts, labels, _ in val_loader:
                imgs = imgs.to(device); txts = txts.to(device); labels = labels.to(device)
                logits = model(imgs, txts) if txts is not None else model(imgs)
                loss = crit(logits, labels)
                v_loss += loss.item() * imgs.size(0)
                v_preds += logits.argmax(dim=1).cpu().tolist()
                v_labels += labels.cpu().tolist()
        val_loss = v_loss / len(val_loader.dataset)
        val_acc = accuracy_score(v_labels, v_preds)
        history["train_loss"].append(train_loss); history["train_acc"].append(train_acc)
        history["val_loss"].append(val_loss); history["val_acc"].append(val_acc)
        print(f"{name} Epoch {ep}/{epochs}  train_loss={train_loss:.4f} train_acc={train_acc:.4f}  val_loss={val_loss:.4f} val_acc={val_acc:.4f}")
    # Save model
    ckpt = os.path.join(ARTIFACTS_DIR, f"{name}.pth")
    torch.save({"state":model.state_dict(), "vocab": dataset.vocab}, ckpt)
    print("Saved", ckpt)
    return model, history

def evaluate_and_plot(model, loader, title_prefix="model_eval", name="model"):
    model.eval()
    preds=[]; labels=[]
    samples_info=[]
    with torch.no_grad():
        for imgs, txts, lbls, meta in loader:
            imgs = imgs.to(device); txts = txts.to(device); lbls = lbls.to(device)
            logits = model(imgs, txts) if txts is not None else model(imgs)
            preds += logits.argmax(dim=1).cpu().tolist()
            labels += lbls.cpu().tolist()
            samples_info += meta
    acc = accuracy_score(labels, preds)
    cm = confusion_matrix(labels, preds, labels=list(range(len(ACTIONS))))
    # Plot confusion matrix
    fig, ax = plt.subplots(figsize=(6,5))
    im = ax.imshow(cm, interpolation='nearest')
    ax.set_xticks(range(len(ACTIONS))); ax.set_yticks(range(len(ACTIONS)))
    ax.set_xticklabels(ACTIONS, rotation=45, ha='right'); ax.set_yticklabels(ACTIONS)
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j,i, int(cm[i,j]), ha='center', va='center', color='white' if cm[i,j] > cm.max()/2 else 'black')
    plt.title(f"{title_prefix} (acc={acc:.3f})")
    plt.colorbar(im)
    out_png = os.path.join(ARTIFACTS_DIR, f"{name}_confusion.png")
    plt.tight_layout(); plt.savefig(out_png); plt.close()
    print(f"Saved confusion matrix to {out_png} (acc={acc:.3f})")
    # Return raw metrics
    return {"acc": acc, "cm": cm, "preds": preds, "labels": labels, "samples_info": samples_info}

Using device: cuda


In [11]:
# Train 2 models: vision-only & fused
vmodel = VisionOnlyModel(num_actions=len(ACTIONS))
fmodel = FusedVLA(vocab_size=len(dataset.vocab), vision_dim=256, text_dim=128, num_actions=len(ACTIONS))

print("Training vision-only baseline...")
vmodel, vhist = train_model(vmodel, train_loader, val_loader, epochs=12, lr=1e-3, name="vision_only")

print("\nTraining fused VLA model...")
fmodel, fhist = train_model(fmodel, train_loader, val_loader, epochs=12, lr=1e-3, name="fused_vla")

Training vision-only baseline...
vision_only Epoch 1/12  train_loss=1.6024 train_acc=0.2462  val_loss=1.5727 val_acc=0.3000
vision_only Epoch 2/12  train_loss=1.4980 train_acc=0.3500  val_loss=1.4786 val_acc=0.3300
vision_only Epoch 3/12  train_loss=1.4344 train_acc=0.3300  val_loss=1.4346 val_acc=0.3300
vision_only Epoch 4/12  train_loss=1.4084 train_acc=0.3625  val_loss=1.4218 val_acc=0.3400
vision_only Epoch 5/12  train_loss=1.4064 train_acc=0.3650  val_loss=1.4421 val_acc=0.2800
vision_only Epoch 6/12  train_loss=1.4133 train_acc=0.3225  val_loss=1.4321 val_acc=0.2950
vision_only Epoch 7/12  train_loss=1.4087 train_acc=0.3762  val_loss=1.4375 val_acc=0.3350
vision_only Epoch 8/12  train_loss=1.3999 train_acc=0.3475  val_loss=1.4506 val_acc=0.2550
vision_only Epoch 9/12  train_loss=1.3932 train_acc=0.3700  val_loss=1.4258 val_acc=0.3350
vision_only Epoch 10/12  train_loss=1.3887 train_acc=0.3713  val_loss=1.4329 val_acc=0.3750
vision_only Epoch 11/12  train_loss=1.3718 train_acc=0.3

In [12]:
# # Create test dataloader
def generate_test_dataset(n_samples=200, out_dir=os.path.join(DATA_DIR, "test")):
    os.makedirs(out_dir, exist_ok=True)
    test_samples = []

    for i in range(n_samples):
        action = random.choice(ACTIONS)

        # Test images can be slightly different distribution
        img = generate_scene(action, make_contradictory=False)

        instr = random.choice(INSTR_TEMPLATES[action])

        fname = f"test_{i:05d}.png"
        img.save(os.path.join(out_dir, fname))

        test_samples.append({
            "image": fname,
            "instruction": instr,
            "action": action
        })
    
    manifest_path = os.path.join(DATA_DIR, "manifest_test.json")
    with open(manifest_path, "w") as f:
        json.dump(test_samples, f, indent=2)

    print(f"Generated test dataset with {len(test_samples)} samples at {out_dir}")
    return test_samples

test_samples = generate_test_dataset()
test_ds = VLADataset(test_samples, img_dir=os.path.join(DATA_DIR, "test"))
test_loader = DataLoader(test_ds, batch_size=64, shuffle=False)
print(f"Dataset ready. Test size: {len(test_ds)}")

Generated test dataset with 200 samples at vla_benchmark_output\dataset\test
Dataset ready. Test size: 200


In [13]:
# Evaluate & save plots
print("Evaluating models and saving confusion matrices...")
ve = evaluate_and_plot(vmodel, test_loader, title_prefix="Vision-only eval", name="vision_only")
fe = evaluate_and_plot(fmodel, test_loader, title_prefix="Fused VLA eval", name="fused_vla")

# Save a short note/artifact description
note = {
    "description": "VLA benchmark with synthetic but challenging dataset (distractors, contradictions, occlusions)",
    "dataset_manifest": os.path.join(DATA_DIR, "manifest.json"),
    "artifacts": {
        "vision_only_checkpoint": os.path.join(ARTIFACTS_DIR, "vision_only.pth"),
        "fused_vla_checkpoint": os.path.join(ARTIFACTS_DIR, "fused_vla.pth"),
        "confusion_matrices": [
            os.path.join(ARTIFACTS_DIR, "vision_only_confusion.png"),
            os.path.join(ARTIFACTS_DIR, "fused_vla_confusion.png")
        ]
    },
    "notes": "To upgrade, replace TinyTextEncoder and SmallCNN with pretrained models."
}
with open(os.path.join(ARTIFACTS_DIR, "note.json"), "w") as f:
    json.dump(note, f, indent=2)
print("Wrote note.json to", ARTIFACTS_DIR)

Evaluating models and saving confusion matrices...
Saved confusion matrix to vla_benchmark_output\artifacts\vision_only_confusion.png (acc=0.380)
Saved confusion matrix to vla_benchmark_output\artifacts\fused_vla_confusion.png (acc=0.410)
Wrote note.json to vla_benchmark_output\artifacts


In [14]:
# Where outputs are
print("Outputs:", OUT_ROOT)
print("Dataset manifest:", os.path.join(DATA_DIR, "manifest.json"))
print("Artifacts:", ARTIFACTS_DIR)

Outputs: vla_benchmark_output
Dataset manifest: vla_benchmark_output\dataset\manifest.json
Artifacts: vla_benchmark_output\artifacts
