In [1]:
import pandas as pd
import numpy as np
import json

In [2]:
# Function to sample uniformly
def uniform_sampling(group, max_samples=2000, sort_by='seconds'):
    """Down-sample one group to at most ``max_samples`` evenly spaced rows.

    The rows are ordered by ``sort_by`` and then picked at evenly spaced
    positions; the first and last rows are always included when more than one
    row is kept. A group with ``max_samples`` rows or fewer is returned in
    full (sorted).

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single group; must contain the ``sort_by`` column.
    max_samples : int, default 2000
        Upper bound on the number of rows returned.
    sort_by : str, default 'seconds'
        Column used to order the rows before sampling.

    Returns
    -------
    pandas.DataFrame
        The selected rows in ascending ``sort_by`` order, keeping their
        original index labels.

    Raises
    ------
    ValueError
        If ``max_samples`` is negative.
    """
    if max_samples < 0:
        raise ValueError(f"max_samples must be non-negative, got {max_samples}")

    ordered = group.sort_values(by=sort_by)
    n_rows = len(ordered)

    # Never ask for more rows than the group holds.
    n_keep = min(max_samples, n_rows)

    # Evenly spaced positions from the first to the last row, snapped to
    # integer row positions.
    positions = np.round(np.linspace(0, n_rows - 1, n_keep)).astype(int)

    return ordered.iloc[positions]

In [3]:
def np_encoder(object):
    """``default`` hook for ``json.dump`` that converts NumPy values to plain Python.

    Parameters
    ----------
    object : Any
        A value the JSON encoder could not serialise on its own.

    Returns
    -------
    The native Python equivalent: a scalar for NumPy scalars (``np.generic``),
    a (nested) list for ``np.ndarray``.

    Raises
    ------
    TypeError
        For any other type. Without this the function would fall through and
        return ``None``, which ``json`` silently writes as ``null``.
    """
    if isinstance(object, np.generic):
        return object.item()
    if isinstance(object, np.ndarray):
        return object.tolist()
    raise TypeError(
        f"Object of type {type(object).__name__} is not JSON serializable"
    )


In [4]:
# Load JSON data into a pandas DataFrame
with open('ocarina_biomeasures_classified_hemodynamics_seconds.json', 'r') as file:
    data = json.load(file)

ocarina_df = pd.DataFrame(data)

# Group by unique combinations of subject_id and trial_id
ocarina_grouped = ocarina_df.groupby(['subject_id', 'trial_id'])

# Sample each group by iterating over the groups instead of using
# ``groupby.apply``. Iteration always yields the full group (key columns
# included), whereas ``apply`` is deprecated from passing the grouping columns
# and newer pandas releases exclude them -- which, combined with dropping the
# index, would silently lose subject_id / trial_id.
ocarina_group_samples = [uniform_sampling(group) for _, group in ocarina_grouped]
if ocarina_group_samples:
    # ignore_index=True yields the same flat 0..n-1 index the original
    # reset_index(drop=True) produced.
    ocarina_sampled_df = pd.concat(ocarina_group_samples, ignore_index=True)
else:
    # No groups at all: keep an empty frame with the original columns.
    ocarina_sampled_df = ocarina_df.iloc[0:0].copy()

# Normalise column dtypes in a single pass.
ocarina_sampled_df = ocarina_sampled_df.astype({
    'subject_id': int,
    'trial_id': int,
    'timestamp': int,
    'workload_classification': str,
    'workload_confidence': float,
    'attention_classification': str,
    'attention_confidence': float,
    'perception_classification': str,
    'perception_confidence': float,
    'seconds': float,
})

# Replace missing confidences with 0. Assign the result back instead of calling
# ``df[col].fillna(..., inplace=True)``: that form is chained assignment and
# does not modify the frame when pandas Copy-on-Write is active.
confidence_columns = ['workload_confidence', 'attention_confidence', 'perception_confidence']
ocarina_sampled_df[confidence_columns] = ocarina_sampled_df[confidence_columns].fillna(0)

FileNotFoundError: [Errno 2] No such file or directory: 'ocarina_biomeasures_classified_hemodynamics_seconds.json'

In [3]:
# Build one record per (subject_id, trial_id) pair: the two keys at the top
# level and the remaining columns as a list of row dictionaries under "data".
key_columns = ['subject_id', 'trial_id']
final_data = []
for (subject_id, trial_id), group in ocarina_sampled_df.groupby(key_columns):
    trial_rows = group.drop(columns=key_columns).to_dict(orient='records')
    final_data.append({
        "subject_id": subject_id,
        "trial_id": trial_id,
        "data": trial_rows,
    })

# Write the nested structure to disk; np_encoder handles values the JSON
# encoder cannot serialise natively.
with open('FNIRS_sampled.json', 'w') as file:
    json.dump(final_data, file, indent=4, default=np_encoder)

In [54]:
# Load the gaze and IMU JSON exports into DataFrames.
gaze_df = pd.read_json(path_or_buf='hl2_gaze_seconds.json')
imu_df = pd.read_json(path_or_buf='hl2_imu_seconds.json')

In [9]:
gaze_df_grouped = gaze_df.groupby(['subject_id', 'trial_id'])

# Sample each (subject_id, trial_id) group by iterating over the groups rather
# than calling ``groupby.apply``. Iteration always yields the full group with
# its key columns, whereas ``apply`` is deprecated from passing the grouping
# columns and newer pandas releases exclude them -- dropping the index
# afterwards would then lose subject_id / trial_id entirely.
gaze_group_samples = [uniform_sampling(group) for _, group in gaze_df_grouped]
if gaze_group_samples:
    # ignore_index=True gives the same flat 0..n-1 index as reset_index(drop=True).
    gaze_df_sampled = pd.concat(gaze_group_samples, ignore_index=True)
else:
    # No groups at all: keep an empty frame with the original columns.
    gaze_df_sampled = gaze_df.iloc[0:0].copy()

# Drop the hit-point columns (assignment instead of inplace mutation so the
# cell can be re-run safely from gaze_df).
gaze_df_sampled = gaze_df_sampled.drop(columns=['hit_x', 'hit_y', 'hit_z'])

gaze_df_sampled.to_csv("gaze_sampled.csv", index=False)

In [7]:
imu_df_grouped = imu_df.groupby(['subject_id', 'trial_id'])

# Sample each (subject_id, trial_id) group by iterating over the groups rather
# than calling ``groupby.apply``. Iteration always yields the full group with
# its key columns, whereas ``apply`` is deprecated from passing the grouping
# columns and newer pandas releases exclude them -- dropping the index
# afterwards would then lose subject_id / trial_id entirely.
imu_group_samples = [uniform_sampling(group) for _, group in imu_df_grouped]
if imu_group_samples:
    # ignore_index=True gives the same flat 0..n-1 index as reset_index(drop=True).
    imu_df_sampled = pd.concat(imu_group_samples, ignore_index=True)
else:
    # No groups at all: keep an empty frame with the original columns.
    imu_df_sampled = imu_df.iloc[0:0].copy()

# Drop the covariance columns and orientation_w (assignment instead of inplace
# mutation so the cell can be re-run safely from imu_df).
imu_df_sampled = imu_df_sampled.drop(
    columns=[
        'orientation_covariance',
        'angular_velocity_covariance',
        'linear_acceleration_covariance',
        'orientation_w',
    ]
)

imu_df_sampled.to_csv("imu_sampled.csv", index=False)

In [15]:
# Inspect the spread of orientation_z: minimum, maximum and selected percentiles.
quantile_levels = [0, 0.01, 0.05, 0.10, 0.25, 0.5, 0.75, 0.90, 0.95, 0.99, 1]
imu_df_sampled['orientation_z'].quantile(quantile_levels)

0.00   -294.300018
0.01   -285.150024
0.05   -280.950012
0.10   -279.000000
0.25   -276.000000
0.50   -273.150024
0.75   -269.850006
0.90   -266.700012
0.95   -264.750000
0.99   -260.400024
1.00   -237.750015
Name: orientation_z, dtype: float64

In [52]:
# Correct the 'seconds' values of subject 8708, trial 4 in the sampled gaze data.
GAZE_SECONDS_OFFSET = 83979.895
mask = gaze_df_sampled['subject_id'].eq(8708) & gaze_df_sampled['trial_id'].eq(4)

# Shift only the selected rows by the offset, then rewrite the CSV.
# NOTE: this mutates gaze_df_sampled in place, so running the cell again
# subtracts the offset a second time.
gaze_df_sampled.loc[mask, 'seconds'] -= GAZE_SECONDS_OFFSET
gaze_df_sampled.to_csv("gaze_sampled.csv", index=False)

In [56]:
# Correct the 'seconds' values of subject 8708, trial 4 in the sampled IMU data.
# NOTE(review): the offset subtracted here is 83979.675, while the gaze
# correction uses 83979.895 -- confirm the differing IMU value is intentional.
IMU_SECONDS_OFFSET = 83979.675
mask = imu_df_sampled['subject_id'].eq(8708) & imu_df_sampled['trial_id'].eq(4)

# Shift only the selected rows by the offset, then rewrite the CSV.
# NOTE: this mutates imu_df_sampled in place, so running the cell again
# subtracts the offset a second time.
imu_df_sampled.loc[mask, 'seconds'] -= IMU_SECONDS_OFFSET
imu_df_sampled.to_csv("imu_sampled.csv", index=False)