In [None]:
#!/usr/bin/env python3
import json
import random
import os
from collections import defaultdict
import copy

# Seed the global RNG so the shuffles performed further down are repeatable.
random_seed = 42
random.seed(random_seed)

# Annotation variant to split: "segmentation" or "classification".
# The value selects the input annotation file and is also embedded in the
# names of the split files written at the end of the script.
task = "segmentation"
input_json_path = f'../../Annotations/aaai26_main_annotation_{task}.json'
output_dir = '../../Annotations/splits/keysteps/'

# ensure output directory exists
os.makedirs(output_dir, exist_ok=True)

# load the full annotation; JSON text is UTF-8, so do not rely on the
# platform's locale-dependent default encoding
with open(input_json_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# One independent container per split; each owns its own "subjects" list.
train_set, val_set, test_set = ({"subjects": []} for _ in range(3))

# Target fraction of each trial's keysteps per split.
# NOTE: split_keysteps only reads train_ratio and val_ratio; the test split
# receives whatever is left over, so test_ratio is informational here.
train_ratio, val_ratio, test_ratio = 0.6, 0.2, 0.2

def split_keysteps(keysteps, train_frac=None, val_frac=None, rng=None):
    """Shuffle and split keysteps into train/val/test lists.

    Args:
        keysteps: Iterable of keystep entries. It is copied before
            shuffling, so the caller's object is never reordered.
        train_frac: Fraction of items assigned to train. ``None`` (the
            default) uses the module-level ``train_ratio``.
        val_frac: Fraction of items assigned to val. ``None`` (the
            default) uses the module-level ``val_ratio``.
        rng: Object exposing ``shuffle`` (for example ``random.Random``).
            ``None`` (the default) uses the global ``random`` module, i.e.
            the generator seeded at the top of the script.

    Returns:
        A ``(train, val, test)`` tuple of lists. The train and val sizes
        are truncated toward zero and every remaining item goes to test,
        so short inputs skew toward test (e.g. with 0.6/0.2 a single-item
        input ends up entirely in test).
    """
    if train_frac is None:
        train_frac = train_ratio
    if val_frac is None:
        val_frac = val_ratio
    shuffler = random if rng is None else rng

    ks = list(keysteps)  # private copy; also accepts non-list iterables
    shuffler.shuffle(ks)

    n = len(ks)
    n_train = int(train_frac * n)
    val_end = n_train + int(val_frac * n)
    # remainder → test
    return ks[:n_train], ks[n_train:val_end], ks[val_end:]

def build_subject_entry(subject, scenario_map):
    """
    Assemble the output record for one subject in one split.

    ``subject`` supplies "subject_id" (required) and "expertise_level"
    (optional, defaults to ""). ``scenario_map`` maps scenario_id → list of
    trial dicts; each pair becomes one entry of the "scenarios" list, in the
    mapping's iteration order. The trial lists are referenced, not copied.
    """
    record = {
        "subject_id":      subject["subject_id"],
        "expertise_level": subject.get("expertise_level", ""),
        "scenarios":       [],
    }
    record["scenarios"].extend(
        {"scenario_id": sid, "trials": trial_list}
        for sid, trial_list in scenario_map.items()
    )
    return record

# Walk every subject, split each trial's keysteps, and collect the pieces
# per split, grouped by scenario.
for subject in data.get('subjects', []):
    # per-split maps: scenario_id → list of trial copies for that split
    train_map = defaultdict(list)
    val_map = defaultdict(list)
    test_map = defaultdict(list)
    split_maps = (train_map, val_map, test_map)

    for scenario in subject.get('scenarios', []):
        scen_id = scenario['scenario_id']
        for trial in scenario.get('trials', []):
            ks_list = trial.get('keysteps', [])
            if ks_list:
                # split_keysteps returns (train, val, test), matching the
                # order of split_maps
                for bucket, ks_subset in zip(split_maps, split_keysteps(ks_list)):
                    if not ks_subset:
                        # a split that received no keysteps gets no trial
                        continue
                    # clone the whole trial, then swap in only this
                    # split's keysteps
                    trial_clone = copy.deepcopy(trial)
                    trial_clone['keysteps'] = ks_subset
                    bucket[scen_id].append(trial_clone)

    # a subject is added to a split only if it contributed at least one trial
    for bucket, target in zip(split_maps, (train_set, val_set, test_set)):
        if bucket:
            target['subjects'].append(build_subject_entry(subject, bucket))

# helper to save each split
def save_split(split_data, name):
    """Write one split to ``output_dir`` as indented JSON and print a summary.

    ``name`` (e.g. 'train') and the module-level ``task`` are embedded in the
    output file name; ``split_data`` must contain a 'subjects' list, whose
    length is reported in the printed message.
    """
    filename = f'aaai26_{name}_split_{task}.json'
    path = os.path.join(output_dir, filename)
    with open(path, 'w') as out_file:
        json.dump(split_data, out_file, indent=4)
    subject_count = len(split_data['subjects'])
    print(f"Saved {name} split with {subject_count} subjects to {path}")

# persist the three splits, in train → val → test order
for split_name, split_contents in (
    ('train', train_set),
    ('val', val_set),
    ('test', test_set),
):
    save_split(split_contents, split_name)


Saved train split with 46 subjects to ../../Annotations/splits/keysteps/aaai26_train_split_classification.json
Saved val split with 45 subjects to ../../Annotations/splits/keysteps/aaai26_val_split_classification.json
Saved test split with 62 subjects to ../../Annotations/splits/keysteps/aaai26_test_split_classification.json
