In [8]:
# Which annotation variant to analyse.
task = "classification" # "classification" or "segmentation"

# Directory locations used by this notebook.
root_dir = '/standard/storage/EgoExoEMS_CVPR2025/Dataset/Final'  # Replace with your directory path
output_dir = '../../Annotations'
temp_output_dir = './temp'

# Annotation file read by the analysis cells: one JSON per task under output_dir.
# Scratch alternative kept for reference:
#   output_file = f'{temp_output_dir}/main_annotation.json'
output_file = '{}/main_annotation_{}.json'.format(output_dir, task)


In [9]:
import json


# Load the annotation JSON. The context manager closes the file handle as soon
# as parsing finishes instead of leaving it to the garbage collector.
with open(output_file, 'r') as f:
    data = json.load(f)

# Print a summary for every subject: expertise level, number of trials, and
# the number of keysteps in each trial.
print("Total subjects: ", len(data['subjects']))   


for subject in data['subjects']:
    print("*" * 50)
    subject_id = subject['subject_id']
    trials = subject['trials']
    print(f"Subject ID: {subject_id}")
    print(f"Subject expertise level: {subject['expertise_level']}")
    print(f"Total trials: {len(trials)}")
    
    for trial in trials:
        print(f"Trial ID: {trial['trial_id']}")
        print(f"Total keysteps: {len(trial['keysteps'])}")
    print("*" * 50)

Total subjects:  15
**************************************************
Subject ID: ms1
Subject expertise level: EMT
Total trials: 9
Trial ID: 0
Total keysteps: 11
Trial ID: 1
Total keysteps: 18
Trial ID: 2
Total keysteps: 17
Trial ID: 3
Total keysteps: 20
Trial ID: 4
Total keysteps: 10
Trial ID: 5
Total keysteps: 19
Trial ID: 6
Total keysteps: 21
Trial ID: 7
Total keysteps: 15
Trial ID: 8
Total keysteps: 15
**************************************************
**************************************************
Subject ID: ms2
Subject expertise level: EMT
Total trials: 5
Trial ID: 0
Total keysteps: 19
Trial ID: 1
Total keysteps: 19
Trial ID: 2
Total keysteps: 18
Trial ID: 3
Total keysteps: 17
Trial ID: 4
Total keysteps: 19
**************************************************
**************************************************
Subject ID: ng1
Subject expertise level: EMT
Total trials: 2
Trial ID: 0
Total keysteps: 11
Trial ID: 1
Total keysteps: 24
**********************************************

## Find Keystep Distribution

In [47]:
import json
import pandas as pd

def get_keystep_distribution(subjects):
    """Summarise keystep counts and durations over all subjects and trials.

    Args:
        subjects: Iterable of subject dicts. Each subject needs a 'trials'
            list; each trial needs a 'keysteps' list; each keystep needs
            'label', 'class_id', 'start_t' and 'end_t'.

    Returns:
        A 4-tuple ``(label_distribution, class_distribution,
        length_stats_by_label, length_stats_by_class)``:

        * the two distributions are ``value_counts()`` Series keyed by
          label / class_id;
        * the two stats tables are DataFrames indexed by label / class_id
          with columns ``average_length``, ``min_length``, ``max_length``
          and ``std_length``.

        When there are no keysteps at all, every element is empty (the stats
        tables still carry the four column names) rather than raising
        ``KeyError`` on the missing columns.

    Raises:
        KeyError: if a subject, trial or keystep lacks one of the keys above.
    """
    # Maps the pandas aggregation name to the column name exposed to callers.
    stat_names = {
        'mean': 'average_length',
        'min': 'min_length',
        'max': 'max_length',
        'std': 'std_length',
    }

    # One record per keystep. 'length' is end_t - start_t, in the units of the
    # annotation timestamps (presumably seconds, per the report headings).
    keysteps_data = [
        {
            'label': keystep['label'],
            'class_id': keystep['class_id'],
            'length': keystep['end_t'] - keystep['start_t'],
        }
        for subject in subjects
        for trial in subject['trials']
        for keystep in trial['keysteps']
    ]

    # Explicit columns keep the frame well-formed even with zero records.
    df = pd.DataFrame(keysteps_data, columns=['label', 'class_id', 'length'])

    # How often each label / class id occurs
    label_distribution = df['label'].value_counts()
    class_distribution = df['class_id'].value_counts()

    if df.empty:
        # Nothing to aggregate: return empty stats tables with the usual schema.
        def _empty_stats(index_name):
            return pd.DataFrame(
                columns=list(stat_names.values()),
                index=pd.Index([], name=index_name),
            )

        return (label_distribution, class_distribution,
                _empty_stats('label'), _empty_stats('class_id'))

    def _length_stats(group_key):
        # mean / min / max / std of keystep length within each group
        return (
            df.groupby(group_key)['length']
            .agg(list(stat_names))
            .rename(columns=stat_names)
        )

    length_stats_by_label = _length_stats('label')
    length_stats_by_class = _length_stats('class_id')

    return label_distribution, class_distribution, length_stats_by_label, length_stats_by_class

# Read the annotation file into memory
with open(output_file, 'r') as annotation_file:
    data = json.load(annotation_file)

# Compute keystep counts and length statistics over every subject
(label_distribution,
 class_distribution,
 length_stats_by_label,
 length_stats_by_class) = get_keystep_distribution(data['subjects'])

# Print each result under its heading
for heading, table in (
    ("Keystep Distribution by Label:", label_distribution),
    ("\nKeystep Distribution by Class ID:", class_distribution),
    ("\nKeystep Length Statistics by Label (seconds):", length_stats_by_label),
    ("\nKeystep Length Statistics by Class ID (seconds):", length_stats_by_class),
):
    print(heading)
    print(table)

# Write each result to its own CSV in the working directory
for csv_path, table in (
    ('./keystep_distribution_by_label.csv', label_distribution),
    ('./keystep_distribution_by_class_id.csv', class_distribution),
    ('./keystep_length_stats_by_label.csv', length_stats_by_label),
    ('./keystep_length_stats_by_class_id.csv', length_stats_by_class),
):
    table.to_csv(csv_path)


Keystep Distribution by Label:
label
chest_compressions             292
ventilate_patient              270
place_bvm                      200
check_pulse                    103
attach_defib_pads               81
clear_for_analysis              78
administer_shock_aed            77
clear_for_shock                 74
check_responsiveness            68
turn_on_aed                     65
no_action                       64
check_breathing                 64
approach_patient                44
request_aed                     29
request_assistance              25
assess_patient                  15
obtain_ecg_recording            13
explain_procedure               13
open_airway                     13
place_right_arm_lead            12
place_left_arm_lead             11
place_left_leg_lead             11
place_right_leg_lead            11
place_v2_lead                   11
place_v1_lead                   11
place_v4_lead                   11
place_v3_lead                   11
place_v6_lead     

## Keystep distributions for different expertise levels

In [17]:
import json
import pandas as pd

def get_keystep_distribution(subjects):
    """Count keysteps per expertise level, broken down by label and by class id.

    NOTE(review): this reuses the name of the function defined in the
    "Find Keystep Distribution" cell and returns a 2-tuple instead of that
    function's 4-tuple, so whichever cell ran last wins. The name is kept so
    the call in this cell keeps working; consider renaming both together.

    Args:
        subjects: Iterable of subject dicts. Each subject needs
            'expertise_level' and a 'trials' list; each trial needs a
            'keysteps' list; each keystep needs 'label' and 'class_id'.

    Returns:
        ``(label_distribution, class_distribution)``: two DataFrames indexed
        by expertise level, with one column per label / class id and 0 where
        a combination never occurs. With no keysteps at all, both frames are
        empty instead of the call failing on missing columns.

    Raises:
        KeyError: if a subject, trial or keystep lacks one of the keys above.
    """
    # One record per keystep, tagged with the owning subject's expertise level
    keysteps_data = [
        {
            'label': keystep['label'],
            'class_id': keystep['class_id'],
            'expertise_level': subject['expertise_level'],
        }
        for subject in subjects
        for trial in subject['trials']
        for keystep in trial['keysteps']
    ]

    # Explicit columns keep the frame well-formed even with zero records.
    df = pd.DataFrame(keysteps_data, columns=['label', 'class_id', 'expertise_level'])

    if df.empty:
        # Nothing to pivot: return empty tables with the usual axis names.
        empty_index = pd.Index([], name='expertise_level')
        return (
            pd.DataFrame(index=empty_index, columns=pd.Index([], name='label')),
            pd.DataFrame(index=empty_index, columns=pd.Index([], name='class_id')),
        )

    # Rows: expertise level; columns: label / class id; missing pairs become 0
    label_distribution = df.groupby(['expertise_level', 'label']).size().unstack(fill_value=0)
    class_distribution = df.groupby(['expertise_level', 'class_id']).size().unstack(fill_value=0)

    return label_distribution, class_distribution

# Read the annotation file into memory
with open(output_file, 'r') as annotation_file:
    data = json.load(annotation_file)

# Count keysteps per expertise level, by label and by class id
label_distribution, class_distribution = get_keystep_distribution(data['subjects'])

# Print both tables under their headings
for heading, table in (
    ("Keystep Distribution by Label and Expertise Level:", label_distribution),
    ("\nKeystep Distribution by Class ID and Expertise Level:", class_distribution),
):
    print(heading)
    print(table)

# Write each table to its own CSV in the working directory
for csv_path, table in (
    ('./keystep_distribution_by_label_and_expertise_level.csv', label_distribution),
    ('./keystep_distribution_by_class_id_and_expertise_level.csv', class_distribution),
):
    table.to_csv(csv_path)

Keystep Distribution by Label and Expertise Level:
label            administer_shock_aed  approach_patient  ask_patient_age_sex  \
expertise_level                                                                
EMT                                64                40                   11   
Not certified                      13                 4                    0   

label            assess_patient  attach_defib_pads  check_breathing  \
expertise_level                                                       
EMT                          15                 67               59   
Not certified                 0                 14                5   

label            check_pulse  check_responsiveness  chest_compressions  \
expertise_level                                                          
EMT                       90                    64                 245   
Not certified             13                     4                  47   

label            clear_for_analysis  ...  plac

In [18]:
import json
import pandas as pd

def get_trial_and_keystep_counts(subjects):
    """Total the number of trials and keysteps for each expertise level.

    Args:
        subjects: Iterable of subject dicts. Each subject needs
            'expertise_level' and a 'trials' list; each trial needs a
            'keysteps' list; each keystep needs 'label'.

    Returns:
        ``(total_trials_per_expertise, total_keysteps_per_expertise)``: two
        Series indexed by expertise level. Either Series is empty when there
        is nothing to count (no subjects, or no keysteps), instead of the
        call failing on a missing 'expertise_level' column.

    Raises:
        KeyError: if a subject, trial or keystep lacks one of the keys above.
    """
    trial_data = []
    keystep_data = []

    for subject in subjects:
        expertise_level = subject['expertise_level']
        trials = subject['trials']

        # One row per subject carrying that subject's trial count
        trial_data.append({
            'expertise_level': expertise_level,
            'trial_count': len(trials)
        })

        # One row per keystep, tagged with the subject's expertise level
        keystep_data.extend(
            {'expertise_level': expertise_level, 'keystep': keystep['label']}
            for trial in trials
            for keystep in trial['keysteps']
        )

    # Explicit columns keep both frames groupable even with zero rows;
    # the cast keeps trial_count numeric when the frame is empty.
    trial_df = pd.DataFrame(
        trial_data, columns=['expertise_level', 'trial_count']
    ).astype({'trial_count': 'int64'})
    keystep_df = pd.DataFrame(keystep_data, columns=['expertise_level', 'keystep'])

    # Sum of per-subject trial counts within each expertise level
    total_trials_per_expertise = trial_df.groupby('expertise_level')['trial_count'].sum()

    # Number of keystep rows within each expertise level
    total_keysteps_per_expertise = keystep_df.groupby('expertise_level').size()

    return total_trials_per_expertise, total_keysteps_per_expertise

# Read the annotation file into memory
with open(output_file, 'r') as annotation_file:
    data = json.load(annotation_file)

# Totals per expertise level
(total_trials_per_expertise,
 total_keysteps_per_expertise) = get_trial_and_keystep_counts(data['subjects'])

# Print both totals under their headings
for heading, totals in (
    ("Total Trials per Expertise Level:", total_trials_per_expertise),
    ("\nTotal Keysteps per Expertise Level:", total_keysteps_per_expertise),
):
    print(heading)
    print(totals)

# Write each Series to its own CSV, including the header row
for csv_path, totals in (
    ('./total_trials_per_expertise_level.csv', total_trials_per_expertise),
    ('./total_keysteps_per_expertise_level.csv', total_keysteps_per_expertise),
):
    totals.to_csv(csv_path, header=True)


Total Trials per Expertise Level:
expertise_level
EMT              73
Not certified    14
Name: trial_count, dtype: int64

Total Keysteps per Expertise Level:
expertise_level
EMT              1502
Not certified     232
dtype: int64


## Get scenario distribution

In [48]:
import json

# This cell builds a DataFrame, so import pandas here rather than relying on
# an earlier cell having been run first.
import pandas as pd


# Load the annotation JSON; the context manager closes the file handle.
with open(output_file, 'r') as f:
    data = json.load(f)

# For every subject, print a short summary and record one row per trial with
# the trial's scenario.
print("Total subjects: ", len(data['subjects']))   
scenario_data = []


for subject in data['subjects']:
    print("*" * 50)
    subject_id = subject['subject_id']
    trials = subject['trials']
    print(f"Subject ID: {subject_id}")
    print(f"Subject expertise level: {subject['expertise_level']}")
    print(f"Total trials: {len(trials)}")
    
    for trial in trials:
        scenario_data.append({
            'subject_id': subject_id,
            'subject_expertise_level': subject['expertise_level'],
            'trial_id': trial['trial_id'],
            'scenario': trial['scenario']
        })
    print("*" * 50)

# Convert to DataFrame for easier analysis
df = pd.DataFrame(scenario_data)
# Save to CSV.
# NOTE(review): the scenario/expertise cell further down also writes
# './scenario_data.csv' (with an 'expertise_level' column instead of
# 'subject_expertise_level'), so whichever cell runs last overwrites the file.
df.to_csv('./scenario_data.csv', index=False)

Total subjects:  15
**************************************************
Subject ID: ms1
Subject expertise level: EMT
Total trials: 9
**************************************************
**************************************************
Subject ID: ms2
Subject expertise level: EMT
Total trials: 5
**************************************************
**************************************************
Subject ID: ng1
Subject expertise level: EMT
Total trials: 2
**************************************************
**************************************************
Subject ID: ng2
Subject expertise level: EMT
Total trials: 2
**************************************************
**************************************************
Subject ID: ng3
Subject expertise level: EMT
Total trials: 13
**************************************************
**************************************************
Subject ID: ng4
Subject expertise level: EMT
Total trials: 5
**************************************************
*

In [45]:
import json
import pandas as pd

# Position of the scenario directory in the '/'-split egocam file path.
# NOTE(review): hard-coded for the current dataset layout — confirm it still
# holds if the dataset root moves. Another cell reads trial['scenario']
# directly; verify whether path parsing is still needed here.
SCENARIO_PATH_INDEX = 8

# Load the annotation JSON; the context manager closes the file handle.
with open(output_file, 'r') as f:
    data = json.load(f)

# Rows collected while walking the annotations
scenario_data = []
keystep_data = []

for subject in data['subjects']:
    subject_id = subject['subject_id']
    expertise_level = subject['expertise_level']
    
    for trial in subject['trials']:
        trial_id = trial['trial_id']
        scenario = None

        # The scenario name is taken from the egocam stream's file path.
        # Trials without a 'streams' entry or without that stream keep
        # scenario = None.
        egocam_stream = trial.get('streams', {}).get('egocam_rgb_audio')
        if egocam_stream is not None:
            try:
                file_path = egocam_stream['file_path']
            except KeyError:
                print(f"    No file path found for stream 'egocam_rgb_audio' in trial '{trial_id}' of subject '{subject_id}'")
            else:
                scenario = file_path.split('/')[SCENARIO_PATH_INDEX]
                # Directory 'cardiac_scenario' is reported as 'chest_pain'
                if scenario == 'cardiac_scenario':
                    scenario = 'chest_pain'

        # One row per trial, even when no scenario could be determined
        scenario_data.append({
            'subject_id': subject_id,
            'expertise_level': expertise_level,
            'trial_id': trial_id,
            'scenario': scenario
        })

        # Keysteps are only counted for trials with a known scenario
        if scenario:
            for keystep in trial['keysteps']:
                keystep_data.append({
                    'subject_id': subject_id,
                    'expertise_level': expertise_level,
                    'trial_id': trial_id,
                    'scenario': scenario,
                    'keystep_label': keystep['label'],
                    'class_id': keystep['class_id']
                })

# Explicit columns keep the frames groupable even when no rows were collected.
scenario_df = pd.DataFrame(
    scenario_data,
    columns=['subject_id', 'expertise_level', 'trial_id', 'scenario'],
)
keystep_df = pd.DataFrame(
    keystep_data,
    columns=['subject_id', 'expertise_level', 'trial_id', 'scenario',
             'keystep_label', 'class_id'],
)

# Calculate keystep distribution by scenario and expertise level
keystep_distribution = keystep_df.groupby(['scenario', 'expertise_level', 'keystep_label']).size().reset_index(name='count')

# Calculate total keysteps per scenario and expertise level
total_keysteps = keystep_df.groupby(['scenario', 'expertise_level']).size().reset_index(name='total_keysteps')

# Save to CSV files.
# NOTE(review): './scenario_data.csv' is also written by the
# "Get scenario distribution" cell with a different column name
# ('subject_expertise_level'); whichever cell runs last overwrites the file.
scenario_df.to_csv('./scenario_data.csv', index=False)
keystep_distribution.to_csv('./keystep_distribution_by_scenario_and_expertise.csv', index=False)
total_keysteps.to_csv('./total_keysteps_per_scenario_and_expertise.csv', index=False)

# Display the results
print("Keystep Distribution by Scenario and Expertise Level:")
print(keystep_distribution)

print("\nTotal Keysteps per Scenario and Expertise Level:")
print(total_keysteps)

Keystep Distribution by Scenario and Expertise Level:
          scenario expertise_level                keystep_label  count
0   cardiac_arrest             EMT         administer_shock_aed     64
1   cardiac_arrest             EMT             approach_patient     31
2   cardiac_arrest             EMT            attach_defib_pads     67
3   cardiac_arrest             EMT              check_breathing     59
4   cardiac_arrest             EMT                  check_pulse     90
5   cardiac_arrest             EMT         check_responsiveness     64
6   cardiac_arrest             EMT           chest_compressions    245
7   cardiac_arrest             EMT           clear_for_analysis     65
8   cardiac_arrest             EMT              clear_for_shock     61
9   cardiac_arrest             EMT                    no_action     52
10  cardiac_arrest             EMT                  open_airway     13
11  cardiac_arrest             EMT                    place_bvm    159
12  cardiac_arrest     

## Total length of dataset

In [50]:
import json

# Load JSON data from file
with open(output_file, 'r') as f:
    data = json.load(f)

# Running sum of annotated keystep durations (end_t - start_t).
# NOTE(review): this adds up keystep spans only, so time between keysteps is
# not counted and overlapping keysteps would be counted twice — confirm this
# is the intended definition of "dataset length".
total_length = 0

# Iterate through all subjects, trials, and keysteps
for subject in data['subjects']:
    for trial in subject['trials']:
        for keystep in trial['keysteps']:
            total_length += keystep['end_t'] - keystep['start_t']

print("Total length of the entire dataset (in seconds):", total_length)

# Whole hours plus leftover whole minutes. Converting to int first avoids the
# float floor-division results that previously printed as "3.0 hours and
# 59.0 minutes".
total_whole_minutes = int(total_length // 60)
total_length_hours, total_length_minutes = divmod(total_whole_minutes, 60)
print(f"Total length of the entire dataset: {total_length_hours} hours and {total_length_minutes} minutes")


Total length of the entire dataset (in seconds): 14373.003789999979
Total length of the entire dataset: 3.0 hours and 59.0 minutes
