# Annotation Generation Tools

In [81]:
import json
import os
from collections import OrderedDict
import fnmatch


In [82]:
# Where the generated annotation files are written.
output_dir = '../../Annotations'
temp_output_dir = './temp'

# Annotation variant to generate. Later cells branch on this value, so it is
# set exactly once and validated here instead of being overwritten by a stack
# of successive assignments where only the last one counts.
SUPPORTED_TASKS = ("classification", "segmentation", "cpr_quality")
task = "cpr_quality"
if task not in SUPPORTED_TASKS:
    raise ValueError(f"Unsupported task {task!r}; expected one of {SUPPORTED_TASKS}")

# root_dir = '/standard/UVA-DSA/NIST EMS Project Data/EgoExoEMS_CVPR2025/Dataset/Final'  # Replace with your directory path
# root_dir = '/standard/UVA-DSA/NIST EMS Project Data/DataCollection_Spring_2025/CARS/organized'  # Replace with your directory path
root_dir = '/standard/UVA-DSA/NIST EMS Project Data/EgoEMS_AAAI2026/'  # Replace with your directory path

# The output file name embeds the task so each variant gets its own file.
# output_file = f'{temp_output_dir}/main_annotation.json' # temp
output_file = f'{output_dir}/aaai26_main_annotation_{task}.json'
# output_file = f'{output_dir}/main_annotation_{task}.json'


In [83]:

# Names of stream folders that process_directory will index; a folder whose
# name is not listed here is skipped.
accepted_stream_types = [
    'ego',
    'exo',
    'clip_ego',
    'clip_exo',
    'resnet_ego',
    'resnet_exo',
    'Kinect',
    'kinect',
    'GoPro',
    'gopro',
    'distance_sensor_data',
    'smartwatch_data',
    'audio',
    'i3d_flow',
    'i3d_rgb',
    'resnet50',
    'resnet50-exo',
]

def get_stream_type(directory_name):
    """Translate a stream folder name into its canonical stream-type label.

    Folder names without a known alias are returned unchanged.
    """
    # Each canonical label is listed once together with all folder names
    # that map onto it.
    alias_groups = (
        ('exocam_rgbd', ('Kinect', 'kinect', 'exo')),
        ('egocam_rgb_audio', ('GoPro', 'gopro', 'ego')),
        ('vl6180_ToF_depth', ('distance_sensor_data',)),
        ('smartwatch_imu', ('smartwatch_data',)),
    )
    canonical_by_alias = {
        alias: canonical
        for canonical, aliases in alias_groups
        for alias in aliases
    }
    return canonical_by_alias.get(directory_name, directory_name)

def parse_file(file_path, stream_type):
    """Return file metadata if ``file_path`` belongs in ``stream_type``, else None.

    The decision is based on the lower-cased extension and, for some streams,
    on the file name without its extension:

    * ``egocam_rgb_audio``: any ``.json``; ``.mp4`` only when the name ends
      with ``rgb_final`` or ``rgb_partial``.
    * ``exocam_rgbd``: ``.hdf5`` only.
    * ``exocam_rgb``: ``.mp4`` only.
    * ``vl6180_ToF_depth``: ``.csv`` only.
    * ``audio``: ``.mp3`` or ``.wav``.
    * ``smartwatch_imu``: ``.csv`` whose name contains ``sync``.
    * ``i3d_flow`` / ``i3d_rgb``: ``.npy`` whose name contains ``flow`` /
      ``rgb`` respectively.
    * resnet / clip feature streams: ``.npy`` only.

    Any other stream type accepts every file.

    Returns:
        ``{"file_id": <name without extension>, "file_path": file_path}`` or
        ``None`` when the file is rejected.
    """
    file_id, ext = os.path.splitext(os.path.basename(file_path))
    ext = ext.lower()

    npy_feature_streams = (
        'resnet_ego', 'resnet_exo', 'resnet50', 'resnet50-exo',
        'clip_ego', 'clip_exo',
    )

    if stream_type == 'egocam_rgb_audio':
        if ext == '.json':
            keep = True
        elif ext == '.mp4':
            keep = file_id.endswith(("rgb_final", "rgb_partial"))
        else:
            keep = False
    elif stream_type == 'exocam_rgbd':
        keep = ext == '.hdf5'
    elif stream_type == 'exocam_rgb':
        keep = ext == '.mp4'
    elif stream_type == 'vl6180_ToF_depth':
        keep = ext == '.csv'
    elif stream_type == 'audio':
        keep = ext in ('.mp3', '.wav')
    elif stream_type == 'smartwatch_imu':
        keep = ext == '.csv' and 'sync' in file_id
    elif stream_type in ('i3d_flow', 'i3d_rgb'):
        # The feature file name must mention the modality it was computed from.
        required_marker = 'flow' if stream_type == 'i3d_flow' else 'rgb'
        keep = ext == '.npy' and required_marker in file_id
    elif stream_type in npy_feature_streams:
        keep = ext == '.npy'
    else:
        # Stream types without a rule keep every file.
        keep = True

    if not keep:
        return None
    return {"file_id": file_id, "file_path": file_path}

def normalize_subject_key(subject_key):
    """Return the subject key unchanged.

    This is a pass-through hook: callers route subject folder names through
    it, but no rewriting is currently applied.
    """
    return subject_key

def process_files(files):
    """Drop ``.txt`` and ``.jpg`` entries from a list of file names.

    When the list contains neither kind of file it is returned as-is (the
    same object); otherwise a new filtered list is returned.
    """
    ignored_patterns = ('*.txt', '*.jpg')
    contains_ignored = any(
        fnmatch.fnmatch(name, pattern)
        for name in files
        for pattern in ignored_patterns
    )
    if not contains_ignored:
        return files
    return [name for name in files if not name.endswith(('.txt', '.jpg'))]

def process_directory(root_path):
    """Index every accepted stream folder found under ``root_path``.

    The expected layout is
    ``<root_path>/<subject>/<scenario>/<trial>/<stream>/<files>``.
    Folders at any other depth are ignored. The depth is measured relative to
    ``root_path`` (rather than as an absolute number of path components), so
    the function works for any root location.

    A stream folder is skipped when the subject name starts with ``ld``, when
    the folder name is not in ``accepted_stream_types``, or when it holds no
    files after ``process_files`` filtering.

    Args:
        root_path: Directory that directly contains the subject folders.

    Returns:
        A list of subject ``OrderedDict`` objects
        (``subject_id``, ``expertise_level``, ``scenarios``), each scenario
        holding ``trials`` and each trial holding ``streams``, ``keysteps``
        and ``interventions``. Subjects, scenarios and trials are sorted by
        their ids.
    """
    # subject / scenario / trial / stream
    levels_below_root = 4

    subjects = []
    for root, _dirs, files in os.walk(root_path):
        rel_parts = os.path.relpath(root, root_path).split(os.sep)
        if len(rel_parts) != levels_below_root:
            continue

        raw_subject, scen_id, trial_id, stream_name = rel_parts
        subj_key = normalize_subject_key(raw_subject)

        if subj_key.startswith('ld') or stream_name not in accepted_stream_types:
            continue

        orig_stream_type = get_stream_type(stream_name)
        files = process_files(files)
        if not files:
            continue

        # --- find or create the subject / scenario / trial entries ---
        subj = next((s for s in subjects if s['subject_id'] == subj_key), None)
        if not subj:
            # "EMT" is only a placeholder; a later cell overwrites it from the
            # expertise-level file.
            subj = OrderedDict([("subject_id", subj_key), ("expertise_level", "EMT"), ("scenarios", [])])
            subjects.append(subj)
        scen = next((sc for sc in subj['scenarios'] if sc['scenario_id'] == scen_id), None)
        if not scen:
            scen = OrderedDict([("scenario_id", scen_id), ("trials", [])])
            subj['scenarios'].append(scen)
        trial = next((t for t in scen['trials'] if t['trial_id'] == trial_id), None)
        if not trial:
            trial = OrderedDict([("trial_id", trial_id), ("streams", OrderedDict()),
                                 ("keysteps", OrderedDict()), ("interventions", OrderedDict())])
            scen['trials'].append(trial)

        streams = trial['streams']

        # --- exocam folders feed two streams: .hdf5 -> exocam_rgbd, .mp4 -> exocam_rgb ---
        if orig_stream_type == 'exocam_rgbd':
            for fname in sorted(files):
                full = os.path.join(root, fname)
                ext = os.path.splitext(fname)[1].lower()
                if ext == '.hdf5':
                    target_stream = 'exocam_rgbd'
                elif ext == '.mp4':
                    target_stream = 'exocam_rgb'
                else:
                    continue
                info = parse_file(full, target_stream)
                if info:
                    streams.setdefault(target_stream, []).append(info)
            continue  # done with this directory

        # --- every other stream maps one folder to one stream type ---
        stream_type = orig_stream_type
        for fname in sorted(files):
            full = os.path.join(root, fname)
            info = parse_file(full, stream_type)
            if info:
                streams.setdefault(stream_type, []).append(info)

    # sort for consistency
    subjects.sort(key=lambda s: s['subject_id'])
    for subj in subjects:
        subj['scenarios'].sort(key=lambda sc: sc['scenario_id'])
        for scen in subj['scenarios']:
            scen['trials'].sort(key=lambda t: t['trial_id'])
    return subjects

def generate_json_structure(root_directory, version="v1.2025.07.07"):
    """Build the top-level annotation document for ``root_directory``.

    Returns an ``OrderedDict`` with the indexed ``subjects`` first and the
    ``version`` string second.
    """
    document = OrderedDict()
    document["subjects"] = process_directory(root_directory)
    document["version"] = version
    return document

In [84]:
! pwd

/sfs/gpfs/tardis/home/cjh9fw/Desktop/2024/repos/EgoExoEMS/Tools/annotation_generation


In [85]:

json_data = generate_json_structure(root_dir)

# Re-key the top level in alphabetical order before writing it out.
json_data = {key: json_data[key] for key in sorted(json_data)}
with open(output_file, 'w') as json_file:
    json.dump(json_data, json_file, indent=4)

print(f"JSON structure saved to {output_file}")


JSON structure saved to ../../Annotations/aaai26_main_annotation_cpr_quality.json


### Populate Key Steps using VIA Annotations

In [86]:
import json

def add_keysteps_to_json(
    existing_json,
    keystep_json_path,
    subject_id="ng1",
    scenario_id="cardiac_arrest",
    trial_id="1",
    gopro_file_name="gopro_1.mp4",
    class_id_mapping_path='./class_id_mappings.json',
    task_name=None,
):
    """Load keystep annotations from a file and attach them to one trial.

    Args:
        existing_json: Annotation document (``subjects`` -> ``scenarios`` ->
            ``trials``); it is modified in place.
        keystep_json_path: Path of the annotation JSON. Its ``file`` section
            maps video ids to entries with an ``fname``; its ``metadata``
            section maps keystep ids to entries with ``vid``, ``z``
            (start, end) and ``av['1']`` (label).
        subject_id, scenario_id, trial_id: Identify the trial to update.
        gopro_file_name: Video file name used to pick the matching video id.
        class_id_mapping_path: JSON file whose ``keysteps`` section maps
            labels to class ids.
        task_name: Task to filter for; ``None`` uses the module-level
            ``task``. For ``"cpr_quality"`` only ``chest_compressions``
            keysteps are kept.

    Returns:
        ``existing_json`` (the same object), with the trial's ``keysteps``
        replaced by the list built here when the trial was found.
    """
    # Fall back to the notebook-wide task setting when no override is given.
    active_task = task if task_name is None else task_name

    # --- load annotations & mappings ---
    with open(keystep_json_path, 'r', encoding='utf-8') as f:
        keystep_data = json.load(f)
    with open(class_id_mapping_path, 'r', encoding='utf-8') as f:
        class_id_mapping = json.load(f)
    keystep_classes = class_id_mapping.get('keysteps', {})

    print("Adding keysteps to the JSON structure")
    print(f"Subject ID: {subject_id}, Trial ID: {trial_id}, GoPro file name: {gopro_file_name}")

    # Find which video id in keystep_data['file'] belongs to the GoPro file.
    # An exact file-name match anywhere in the list wins over a prefix match,
    # so a loosely matching earlier entry cannot shadow the real one.
    file_entries = keystep_data.get('file', {})
    correct_vid_id = None
    for vid_key, file_meta in file_entries.items():
        fname = file_meta.get('fname', '')
        print(f"Checking file meta: {fname} against {gopro_file_name}")
        if gopro_file_name in fname:
            print("Found matching keystep file entry:", fname)
            correct_vid_id = vid_key
            break
    if correct_vid_id is None:
        # Fallback: match on the part of the name before the first underscore.
        name_prefix = gopro_file_name.split('_')[0]
        for vid_key, file_meta in file_entries.items():
            fname = file_meta.get('fname', '')
            if name_prefix in fname:
                print("Found matching keystep file entry (partial match):", fname)
                correct_vid_id = vid_key
                break

    # build the keysteps list
    keysteps = []
    for ks_id, meta in keystep_data.get('metadata', {}).items():
        # When a video id was identified, keep only its keysteps; when none
        # was found, every keystep in the file is considered.
        print(f"Video ID {meta.get('vid')} VS correct_vid_id {correct_vid_id}")
        if correct_vid_id is not None and meta.get('vid') != correct_vid_id:
            continue

        print(f"Processing keystep {ks_id} for video {meta}")

        try:
            start_t, end_t = meta['z']
            label = meta['av']['1']
        except (KeyError, TypeError, ValueError):
            # Missing keys, or a 'z' that is not a (start, end) pair.
            print(f"  → skipping {ks_id}: missing z or av/1")
            continue

        if label not in keystep_classes:
            print(f"  → skipping {ks_id}: “{label}” not a keystep")
            continue

        if active_task == "cpr_quality" and label != "chest_compressions":
            print(f"  → skipping {ks_id}: “{label}” not a keystep for CPR quality task")
            continue

        keysteps.append({
            "keystep_id": ks_id,
            "start_t":    start_t,
            "end_t":      end_t,
            "label":      label,
            "class_id":   keystep_classes[label]
        })

    # now inject into the right trial in the hierarchy
    found = False
    for subj in existing_json.get('subjects', []):
        if subj.get('subject_id') != subject_id:
            continue
        for scen in subj.get('scenarios', []):
            if scen.get('scenario_id') != scenario_id:
                continue
            for trial in scen.get('trials', []):
                if trial.get('trial_id') == trial_id:
                    trial['keysteps'] = keysteps
                    print(f"→ Inserted {len(keysteps)} keysteps into "
                          f"subject={subject_id}, scenario={scen['scenario_id']}, trial={trial_id}")
                    found = True
    if not found:
        print(f"Warning: no matching subject={subject_id} / trial={trial_id} found in existing_json")

    return existing_json


In [87]:

# Read the annotation document and close the read handle before anything is
# written back to the same path.
with open(output_file, 'r') as json_file:
    data = json.load(json_file)
updated_json = data

for subject in updated_json.get('subjects', []):
    print("*" * 50)
    # if subject['subject_id'] != 'cars_1': continue

    for scenario in subject.get('scenarios', []):
        print(f" Scenario: {scenario['scenario_id']}")

        for trial in scenario.get('trials', []):
            print(f"  Trial: {trial['trial_id']}")

            streams = trial.get('streams', {})
            # only care about GoPro streams
            if 'egocam_rgb_audio' not in streams:
                continue

            annotation_file = None
            gopro_file_name = None

            for entry in streams['egocam_rgb_audio']:
                fp = entry.get('file_path', '')

                print(f"    File: {entry['file_id']} @ {fp}")

                # pick out the .json annotation
                if fp.endswith('.json'):
                    annotation_file = fp
                    print(f"    Annotation: {entry['file_id']} @ {fp}")

                # pick out the matching .mp4
                if fp.endswith(('rgb_final.mp4', 'rgb_partial.mp4')):
                    gopro_file_name = entry['file_id'] + '.mp4'
                    print(f"    GoPro video: {gopro_file_name}")

            if annotation_file and gopro_file_name:
                # inject keysteps
                updated_json = add_keysteps_to_json(
                    updated_json,
                    annotation_file,
                    subject_id=subject['subject_id'],
                    scenario_id=scenario['scenario_id'],
                    trial_id=trial['trial_id'],
                    gopro_file_name=gopro_file_name
                )

                # the annotation .json is not a stream file, so drop it from the listing
                trial['streams']['egocam_rgb_audio'] = [
                    f for f in trial['streams']['egocam_rgb_audio']
                    if not f['file_path'].endswith('.json')
                ]
            else:
                print(f"  → no annotation file or GoPro video found for {subject['subject_id']} / {scenario['scenario_id']} / {trial['trial_id']}")

# write back out
with open(output_file, 'w') as json_file:
    json.dump(updated_json, json_file, indent=4)

print(f"\nUpdated JSON saved to {output_file}")
print("*" * 50)


**************************************************
 Scenario: cardiac_arrest
  Trial: s11
    File: P0_cardiacarrest_s11_annotation @ /standard/UVA-DSA/NIST EMS Project Data/EgoEMS_AAAI2026/P0/cardiac_arrest/s11/ego/P0_cardiacarrest_s11_annotation.json
    Annotation: P0_cardiacarrest_s11_annotation @ /standard/UVA-DSA/NIST EMS Project Data/EgoEMS_AAAI2026/P0/cardiac_arrest/s11/ego/P0_cardiacarrest_s11_annotation.json
    File: P0_cardiacarrest_s11_ego_rgb_partial @ /standard/UVA-DSA/NIST EMS Project Data/EgoEMS_AAAI2026/P0/cardiac_arrest/s11/ego/P0_cardiacarrest_s11_ego_rgb_partial.mp4
    GoPro video: P0_cardiacarrest_s11_ego_rgb_partial.mp4
Adding keysteps to the JSON structure
Subject ID: P0, Trial ID: s11, GoPro file name: P0_cardiacarrest_s11_ego_rgb_partial.mp4
Checking file meta: GH010251_encoded_trimmed.mp4 against P0_cardiacarrest_s11_ego_rgb_partial.mp4
Video ID 1 VS correct_vid_id None
Processing keystep 1_metadata for video {'vid': '1', 'flg': 0, 'z': [0, 57.8], 'xy': [], 

## Collapse each stream's file list into a single file object

In [88]:
# Read the annotation document and close the read handle before anything is
# written back to the same path.
with open(output_file, 'r') as json_file:
    updated_json = json.load(json_file)

for subject in updated_json.get('subjects', []):
    print("*" * 50)
    # if subject['subject_id'] != 'ng8': continue

    for scenario in subject.get('scenarios', []):
        print(f" Scenario: {scenario['scenario_id']}")

        for trial in scenario.get('trials', []):
            print(f"  Trial: {trial['trial_id']}")

            # Iterate over a snapshot of the keys because the dict values are
            # replaced inside the loop.
            for stream_type in list(trial.get('streams', {}).keys()):
                files = trial['streams'][stream_type]
                print(f"   {stream_type}: {files} (count={len(files) if isinstance(files, list) else 'N/A'})")

                # A non-empty list is collapsed to its first element; any
                # further entries are discarded.
                if isinstance(files, list) and files:
                    trial['streams'][stream_type] = files[0]
                    print(f"    → flattened to: {trial['streams'][stream_type]}")

# Write the updated JSON back out
with open(output_file, 'w') as json_file:
    json.dump(updated_json, json_file, indent=4)

print(f"\nUpdated JSON saved to {output_file}")
print("*" * 50)

**************************************************
 Scenario: cardiac_arrest
  Trial: s11
   exocam_rgb: [{'file_id': 'P0_cardiacarrest_s11_exo_rgb_final', 'file_path': '/standard/UVA-DSA/NIST EMS Project Data/EgoEMS_AAAI2026/P0/cardiac_arrest/s11/exo/P0_cardiacarrest_s11_exo_rgb_final.mp4'}] (count=1)
    → flattened to: {'file_id': 'P0_cardiacarrest_s11_exo_rgb_final', 'file_path': '/standard/UVA-DSA/NIST EMS Project Data/EgoEMS_AAAI2026/P0/cardiac_arrest/s11/exo/P0_cardiacarrest_s11_exo_rgb_final.mp4'}
   exocam_rgbd: [{'file_id': 'P0_cardiacarrest_s11_exo_rgbd_ir', 'file_path': '/standard/UVA-DSA/NIST EMS Project Data/EgoEMS_AAAI2026/P0/cardiac_arrest/s11/exo/P0_cardiacarrest_s11_exo_rgbd_ir.hdf5'}] (count=1)
    → flattened to: {'file_id': 'P0_cardiacarrest_s11_exo_rgbd_ir', 'file_path': '/standard/UVA-DSA/NIST EMS Project Data/EgoEMS_AAAI2026/P0/cardiac_arrest/s11/exo/P0_cardiacarrest_s11_exo_rgbd_ir.hdf5'}
   smartwatch_imu: [{'file_id': 'P0_cardiacarrest_s11_sync_smartwatch', '

# Update Expertise Levels

In [89]:
# Load the subject -> expertise level table.
expertise_level_file = './subject_expertise_level.json'
with open(expertise_level_file) as json_file:
    expertise_level_data = json.load(json_file)['EXPERTISE_LEVELS']

print(expertise_level_data)

# Read the annotation document and close the read handle before anything is
# written back to the same path.
with open(output_file, 'r') as json_file:
    data = json.load(json_file)
updated_json = data

# Overwrite each subject's expertise level with the value from the table;
# subjects missing from the table keep their current value.
for subject in updated_json['subjects']:
    print("*" * 50)
    subject_id = subject['subject_id']
    expertise_level = expertise_level_data.get(subject_id, None)
    if expertise_level:
        subject['expertise_level'] = expertise_level
        print(f"Updated expertise level for subject {subject_id} to {expertise_level}")
    else:
        print(f"Expertise level not found for subject {subject_id}")

with open(output_file, 'w') as json_file:
    json.dump(updated_json, json_file, indent=4)
print(f"JSON structure saved to {output_file}")
print("*" * 50)

{'ms1': 'EMT', 'ms2': 'EMT', 'ng1': 'EMT', 'ng2': 'EMT', 'ng3': 'EMT', 'ng4': 'EMT', 'ng5': 'EMT', 'ng6': 'EMT', 'ng7': 'Not certified', 'ng8': 'Not certified', 'ng9': 'Not certified', 'ng10': 'EMT', 'ng11': 'Not certified', 'wa0': 'Paramedic', 'wa1': 'EMT', 'wa2': 'EMT', 'wa4': 'EMT', 'wa5': 'EMT', 'wa6': 'EMT', 'wa7': 'EMT', 'P0': 'Not certified', 'P1': 'Not certified', 'P2': 'Not certified', 'P3': 'Not certified', 'P4': 'Not certified', 'P5': 'Not certified', 'P6': 'Not certified', 'P7': 'Not certified', 'P8': 'Not certified', 'P9': 'Not certified', 'P10': 'Not certified', 'P11': 'Not certified', 'P12': 'Not certified', 'P13': 'Not certified', 'P14': 'Not certified', 'P15': 'Not certified', 'P16': 'Not certified', 'P17': 'Not certified', 'P18': 'Not certified', 'P19': 'Not certified', 'P20': 'Not certified', 'P21': 'Not certified', 'P22': 'Not certified', 'P23': 'Not certified', 'opvrs_1': 'EMT', 'opvrs_2': 'EMT', 'opvrs_3': 'EMT', 'opvrs_4': 'EMT', 'opvrs_5': 'EMT', 'opvrs_6': 'EMT

# Remove trials that have no keysteps

In [90]:
# Remove trials that have no keysteps, and subjects left without any trial.
if task == "cpr_quality":
    # Read the annotation document and close the read handle before anything
    # is written back to the same path.
    with open(output_file, 'r') as json_file:
        data = json.load(json_file)
    updated_json = data

    # Subjects that still own at least one trial after pruning.
    subjects_with_trials = []

    for subject in updated_json['subjects']:
        print("*" * 50)
        subject_has_trials = False

        for scenario in subject.get('scenarios', []):
            # Remove trials with no keysteps
            scenario['trials'] = [trial for trial in scenario['trials'] if trial['keysteps']]

            for trial in scenario['trials']:
                print(f"Trial {trial['trial_id']} has keysteps")
            if scenario['trials']:
                subject_has_trials = True

        # Decide once per subject (not once per scenario), so a subject with
        # several non-empty scenarios is not listed more than once.
        if subject_has_trials:
            subjects_with_trials.append(subject)
        else:
            print(f"Removing subject {subject['subject_id']} as all trials are empty")

    # Update the JSON structure to only include subjects with non-empty trials
    updated_json['subjects'] = subjects_with_trials

    # Save the modified JSON structure back to the file
    with open(output_file, 'w') as json_file:
        json.dump(updated_json, json_file, indent=4)
    print(f"JSON structure saved to {output_file}")
    print("*" * 50)


**************************************************
Trial s11 has keysteps
Trial s2 has keysteps
Trial s3 has keysteps
Trial s4 has keysteps
Trial s5 has keysteps
Trial s6 has keysteps
Trial s8 has keysteps
Trial s9 has keysteps
**************************************************
Trial s2 has keysteps
Trial s3 has keysteps
Trial s5 has keysteps
Trial s6 has keysteps
Trial s7 has keysteps
**************************************************
Trial s6 has keysteps
Trial s7 has keysteps
**************************************************
Trial s10 has keysteps
Trial s2 has keysteps
Trial s3 has keysteps
Trial s4 has keysteps
Trial s6 has keysteps
Trial s7 has keysteps
Trial s8 has keysteps
Trial s9 has keysteps
**************************************************
Trial s1 has keysteps
Trial s2 has keysteps
Trial s4 has keysteps
Trial s5 has keysteps
Trial s6 has keysteps
Trial s8 has keysteps
**************************************************
Trial s3 has keysteps
Trial s4 has keysteps
Trial s5 h

# Fill gaps between annotations with no_action

In [91]:

def fill_no_action_keysteps(trial, no_action_class_id=15):
    """Insert ``no_action`` keysteps into the time gaps of a trial, in place.

    The trial's keysteps are sorted by ``start_t``. Wherever a keystep starts
    strictly after the previous one ends, a filler keystep covering that gap
    is inserted between them. A trial without keysteps is left untouched.

    Args:
        trial: Trial dict; its ``keysteps`` entry is replaced by the sorted,
            gap-filled list.
        no_action_class_id: Class id stored on every inserted filler keystep.

    Returns:
        None. ``trial`` is modified in place.
    """
    keysteps = sorted(trial.get("keysteps", []), key=lambda ks: ks["start_t"])
    if not keysteps:
        return

    filled = []
    # Walk consecutive (current, next) pairs; the index of the current keystep
    # names the filler that follows it.
    for index, (curr, nxt) in enumerate(zip(keysteps, keysteps[1:])):
        filled.append(curr)
        if nxt["start_t"] > curr["end_t"]:
            filled.append({
                "keystep_id": f"no_action_{index}",
                "start_t":    curr["end_t"],
                "end_t":      nxt["start_t"],
                "label":      "no_action",
                "class_id":   no_action_class_id
            })

    # The last keystep has no successor, so it is never emitted by the loop.
    filled.append(keysteps[-1])
    trial["keysteps"] = filled

def process_subject_trials(subject):
    """Gap-fill every trial of every scenario of ``subject`` and return it."""
    every_trial = (
        trial
        for scenario in subject.get("scenarios", [])
        for trial in scenario.get("trials", [])
    )
    for trial in every_trial:
        fill_no_action_keysteps(trial)
    return subject

def process_all_subjects(data):
    """Run ``process_subject_trials`` on each entry of ``data['subjects']``.

    A document without a ``subjects`` key is returned untouched. The same
    ``data`` object is always returned.
    """
    subject_entries = data.get("subjects", [])
    for entry in subject_entries:
        process_subject_trials(entry)
    return data

# Gap filling only applies to the segmentation variant of the annotations.
if task == "segmentation":
    # Read the current annotation document.
    with open(output_file, "r") as f:
        data = json.load(f)

    # Insert the "no_action" fillers for every subject, scenario and trial.
    updated = process_all_subjects(data)

    # Overwrite the file with the gap-filled document.
    with open(output_file, "w") as f:
        json.dump(updated, f, indent=4)

    print("Gaps filled with 'no_action' keysteps successfully.")


# Remove overlapping segments


In [92]:
# import json

# def remove_overlapping_keysteps(trials):
#     for trial in trials:
#         # Sort keysteps within each trial by their start time
#         trial["keysteps"] = sorted(trial["keysteps"], key=lambda x: x['start_t'])
#         non_overlapping_keysteps = []

#         for current_keystep in trial["keysteps"]:
#             if not non_overlapping_keysteps:
#                 non_overlapping_keysteps.append(current_keystep)
#             else:
#                 last_keystep = non_overlapping_keysteps[-1]
#                 # Check for overlap
#                 if current_keystep['start_t'] < last_keystep['end_t']:
#                     # Keep the larger keystep (the one with a longer duration)
#                     current_duration = current_keystep['end_t'] - current_keystep['start_t']
#                     last_duration = last_keystep['end_t'] - last_keystep['start_t']
#                     if current_duration > last_duration:
#                         non_overlapping_keysteps[-1] = current_keystep
#                 else:
#                     non_overlapping_keysteps.append(current_keystep)

#         # Replace the keysteps in the trial with the filtered ones
#         trial["keysteps"] = non_overlapping_keysteps


# if task == "segmentation":
    # # Load the JSON file
    # with open(output_file, 'r') as f:
    #     data = json.load(f)

    # # Process each subject's trials
    # for subject in data["subjects"]:
    #     remove_overlapping_keysteps(subject["trials"])

    # # Save the modified data back to a new JSON file
    # with open(output_file, 'w') as f:
    #     json.dump(data, f, indent=4)

    # print("Overlapping keysteps have been removed and saved to 'main_annotation_no_overlap.json'.")


In [93]:

def split_keystep(last_keystep, current_keystep):
    """Trim ``last_keystep`` so it stops where ``current_keystep`` starts.

    If the current keystep starts before the last one ends, the last
    keystep's ``end_t`` is set (in place) to the current ``start_t``.
    Otherwise nothing changes. The (possibly trimmed) ``last_keystep`` object
    is returned.
    """
    boundary = current_keystep['start_t']
    overlaps = boundary < last_keystep['end_t']
    if overlaps:
        last_keystep['end_t'] = boundary
    return last_keystep

def remove_and_split_overlapping_keysteps(trials):
    """Sort each trial's keysteps by start time and trim overlaps, in place.

    For every pair of consecutive keysteps, an earlier keystep that runs past
    the start of the next one is cut short via ``split_keystep``. No keystep
    is dropped. Each trial's ``keysteps`` entry is replaced by the resulting
    list (an empty list when the trial had none).
    """
    for trial in trials:
        ordered = sorted(trial.get("keysteps", []), key=lambda ks: ks['start_t'])

        # Compare each keystep with its immediate successor.
        for position, (earlier, later) in enumerate(zip(ordered, ordered[1:])):
            if later['start_t'] < earlier['end_t']:
                ordered[position] = split_keystep(earlier, later)

        trial["keysteps"] = ordered

# Overlap trimming only applies to the segmentation variant of the annotations.
if task == "segmentation":
    # Read the current annotation document.
    with open(output_file, 'r') as f:
        data = json.load(f)

    # Gather the trial list of every scenario, then trim overlaps per list.
    trial_groups = [
        scenario.get("trials", [])
        for subject in data.get("subjects", [])
        for scenario in subject.get("scenarios", [])
    ]
    for trial_group in trial_groups:
        remove_and_split_overlapping_keysteps(trial_group)

    # Overwrite the file with the trimmed document.
    with open(output_file, 'w') as f:
        json.dump(data, f, indent=4)

    print("Overlapping keysteps have been removed and saved to", output_file)


# Load JSON and Test

In [94]:
import json

# Read the final annotation document and print a per-subject summary.
with open(output_file, 'r') as f:
    data = json.load(f)

all_subjects = data.get('subjects', [])
print("Total subjects:", len(all_subjects))

separator = "*" * 50
for subject in all_subjects:
    print(separator)
    subject_id = subject.get('subject_id')
    scenarios = subject.get('scenarios', [])
    print(f"Subject ID: {subject_id}")
    print(f"Total scenarios: {len(scenarios)}")

    for scenario in scenarios:
        scenario_id = scenario.get('scenario_id')
        trials = scenario.get('trials', [])
        print(f" Scenario ID: {scenario_id}")
        print(f"  Total trials: {len(trials)}")

        for trial in trials:
            trial_id = trial.get('trial_id')
            keysteps = trial.get('keysteps', [])
            print(f"   Trial ID: {trial_id}")
            print(f"    Total keysteps: {len(keysteps)}")

    print(separator)


Total subjects: 28
**************************************************
Subject ID: P0
Total scenarios: 1
 Scenario ID: cardiac_arrest
  Total trials: 8
   Trial ID: s11
    Total keysteps: 1
   Trial ID: s2
    Total keysteps: 1
   Trial ID: s3
    Total keysteps: 1
   Trial ID: s4
    Total keysteps: 1
   Trial ID: s5
    Total keysteps: 1
   Trial ID: s6
    Total keysteps: 1
   Trial ID: s8
    Total keysteps: 1
   Trial ID: s9
    Total keysteps: 1
**************************************************
**************************************************
Subject ID: P1
Total scenarios: 1
 Scenario ID: cardiac_arrest
  Total trials: 5
   Trial ID: s2
    Total keysteps: 1
   Trial ID: s3
    Total keysteps: 1
   Trial ID: s5
    Total keysteps: 1
   Trial ID: s6
    Total keysteps: 1
   Trial ID: s7
    Total keysteps: 1
**************************************************
**************************************************
Subject ID: P10
Total scenarios: 1
 Scenario ID: cardiac_arrest
  Tot

In [95]:
import json
import pandas as pd

def get_keystep_distribution(subjects):
    """Count keysteps by label and by class id across all trials.

    Args:
        subjects: Iterable of subject dicts
            (``scenarios`` -> ``trials`` -> ``keysteps``).

    Returns:
        ``(label_distribution, class_distribution)``: two pandas Series from
        ``value_counts()`` on the ``label`` and ``class_id`` columns. Both
        are empty when no keysteps are present.
    """
    keysteps_data = [
        {'label': keystep['label'], 'class_id': keystep['class_id']}
        for subject in subjects
        for scenario in subject.get('scenarios', [])
        for trial in scenario.get('trials', [])
        for keystep in trial.get('keysteps', [])
    ]

    # Naming the columns explicitly keeps them present even when there are no
    # rows; otherwise the column lookups below raise KeyError on empty input.
    df = pd.DataFrame(keysteps_data, columns=['label', 'class_id'])

    label_distribution = df['label'].value_counts()
    class_distribution = df['class_id'].value_counts()

    return label_distribution, class_distribution

# Read the annotation document.
with open(output_file, 'r') as f:
    data = json.load(f)

# Count keysteps per label and per class id.
label_distribution, class_distribution = get_keystep_distribution(data['subjects'])

# Print each distribution under its heading.
reports = (
    ("Keystep Distribution by Label:", label_distribution),
    ("\nKeystep Distribution by Class ID:", class_distribution),
)
for heading, distribution in reports:
    print(heading)
    print(distribution)


Keystep Distribution by Label:
label
chest_compressions    614
Name: count, dtype: int64

Keystep Distribution by Class ID:
class_id
4    614
Name: count, dtype: int64


# Plot keysteps

In [96]:
import json
import matplotlib.pyplot as plt
import pandas as pd

def plot_keysteps_for_trial(trial, subject_id, scenario_id):
    """Draw a horizontal-bar timeline of one trial's keysteps and show it.

    Each keystep becomes one bar on the row of its label, starting at
    ``start_t`` and spanning ``end_t - start_t``.
    """
    # One record per keystep, including its duration.
    segments = [
        {
            'label':    ks['label'],
            'start':    ks['start_t'],
            'end':      ks['end_t'],
            'duration': ks['end_t'] - ks['start_t']
        }
        for ks in trial.get('keysteps', [])
    ]
    df = pd.DataFrame(segments)

    plt.figure(figsize=(10, 6))
    for _, segment in df.iterrows():
        plt.barh(segment['label'], segment['duration'], left=segment['start'], height=0.4)

    # Axis labels, title and layout.
    plt.xlabel('Time (seconds)')
    plt.ylabel('Keysteps')
    plt.title(f'Subject {subject_id} Scenario {scenario_id} Trial {trial["trial_id"]} Keystep Timeline')
    plt.grid(True)
    plt.tight_layout()
    plt.show()

def plot_keysteps_for_all_trials(subjects):
    """Plot the keystep timeline of every trial of the first subject only."""
    for subject in subjects:
        subject_id = subject.get('subject_id')
        scenario_trials = (
            (scenario.get('scenario_id'), trial)
            for scenario in subject.get('scenarios', [])
            for trial in scenario.get('trials', [])
        )
        for scenario_id, trial in scenario_trials:
            plot_keysteps_for_trial(trial, subject_id, scenario_id)
        return  # stop after the first subject

# with open(output_file, 'r') as f:
#     data = json.load(f)
# plot_keysteps_for_all_trials(data.get('subjects', []))


# Legacy: Don't Execute