In [1]:
import json
import random
import os

# Locations of the source annotation file and of the directory that receives the split files
input_json_path = '../../Annotations/main_annotation.json'  # Replace with your input JSON file path
output_dir = '../../Annotations/splits/keysteps'  # Output directory for split files


# Create the output directory if needed; exist_ok avoids the check-then-create race
# of testing os.path.exists() first
os.makedirs(output_dir, exist_ok=True)

# Load the dataset. JSON text is UTF-8 by specification, so name the encoding
# explicitly instead of relying on the platform default.
with open(input_json_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# One independent container per split, each starting with an empty subject list
train_set, val_set, test_set = ({"subjects": []} for _ in range(3))

# Split ratio for train, validation, and test
train_ratio = 0.6
val_ratio = 0.2
test_ratio = 0.2  # not read by split_keysteps: the test split simply receives the remainder

# Function to split the keysteps of a single trial
def split_keysteps(keysteps, rng=None):
    """Randomly partition *keysteps* into train, validation and test lists.

    The input is left untouched: a shuffled copy is sliced instead, so the
    caller's annotation data keeps its original order.

    Args:
        keysteps: Sequence of keystep entries to distribute.
        rng: Optional object exposing ``shuffle`` (e.g. ``random.Random(seed)``)
            for reproducible splits. Defaults to the module-level ``random``
            generator.

    Returns:
        A ``(train_keysteps, val_keysteps, test_keysteps)`` tuple of lists.
        Train and validation sizes are ``int(ratio * len(keysteps))``; the
        test list takes everything left over. Because ``int()`` truncates,
        very short inputs favour the test list (a single keystep lands
        entirely in test).
    """
    shuffled = list(keysteps)
    (random if rng is None else rng).shuffle(shuffled)

    num_train = int(train_ratio * len(shuffled))
    num_val = int(val_ratio * len(shuffled))
    val_end = num_train + num_val

    return shuffled[:num_train], shuffled[num_train:val_end], shuffled[val_end:]

# Distribute every trial's keysteps across the three splits, subject by subject
for subject in data['subjects']:
    train_trials, val_trials, test_trials = [], [], []

    for trial in subject['trials']:
        # Trials with no keysteps contribute nothing to any split
        if not trial.get('keysteps'):
            continue

        train_keysteps, val_keysteps, test_keysteps = split_keysteps(trial['keysteps'])

        # Clone the trial once per non-empty portion, swapping in that portion's keysteps
        for bucket, portion in (
            (train_trials, train_keysteps),
            (val_trials, val_keysteps),
            (test_trials, test_keysteps),
        ):
            if portion:
                bucket.append({**trial, 'keysteps': portion})

    # A subject is recorded in a split only when it has at least one trial there
    for split_set, split_trials in (
        (train_set, train_trials),
        (val_set, val_trials),
        (test_set, test_trials),
    ):
        if split_trials:
            split_set['subjects'].append({
                "subject_id": subject["subject_id"],
                "expertise_level": subject["expertise_level"],
                "trials": split_trials
            })

# Write a single split to disk as JSON
def save_split_file(split_data, split_name):
    """Serialize *split_data* to ``<split_name>_split.json`` inside ``output_dir``.

    The file is (over)written with 4-space indented JSON.
    """
    destination = os.path.join(output_dir, '{}_split.json'.format(split_name))
    with open(destination, 'w') as out_file:
        json.dump(split_data, out_file, indent=4)

# Persist the three splits in train / val / test order, then report the destination
for split_name, split_data in (('train', train_set), ('val', val_set), ('test', test_set)):
    save_split_file(split_data, split_name)

print(f"Splits saved in directory: {output_dir}")


Splits saved in directory: ../../Annotations/splits/keysteps
