<a href="https://colab.research.google.com/github/JustinBui/CECS-698-Data-Analysis/blob/main/1)_Data_Trimming.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import os
import numpy as np
from datetime import datetime, timedelta
import pytz

# Assessment start times

Timestamps in EmbracePlus event tags are in microseconds (UNIX time, UTC). Our goal is to convert each timestamp to milliseconds and to a human-readable Pacific time (HH:MM:SS AM/PM), so the tags can be validated against the start times written down during our studies and we can confirm that tagging works correctly.

In [None]:
# Load the EmbracePlus event tags, keeping only the tag timestamp and the participant it belongs to
df = pd.read_csv(
    '/content/drive/MyDrive/CECS 698 - Data Analysis/start times.csv'
).loc[:, ['tags', 'participant_id']]

In [None]:
# Convert `tags` from microseconds to milliseconds (floor division drops the sub-millisecond part).
# The cast is an explicit 64-bit integer: plain `int` maps to the platform default integer, which can be
# 32-bit and is too small for epoch microseconds.
# NOTE: this overwrites `tags` in place, so re-running this cell without re-running the load cell
# divides by 1000 a second time.
df['tags'] = df['tags'].astype('int64') // 1000

# Render the millisecond timestamps as a human-readable Pacific time of day (HH:MM:SS AM/PM)
df['tags_pst'] = (
    pd.to_datetime(df['tags'], unit='ms', utc=True)
    .dt.tz_convert('US/Pacific')
    .dt.strftime('%I:%M:%S %p')
)

Reshape the dataframe so that there is one row per participant (each row has an easy and a hard start timestamp).

In [None]:
# Assessment order per participant: Easy to Hard (E-H) or Hard to Easy (H-E)
group = {
    'E-H': [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 24],
    'H-E': [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 25, 26]
}

# Final column order of the per-participant start-time table
START_TIME_COLUMNS = ['participant_id', 'group', 'easy_milliseconds_start', 'easy_pst_dt_start',
                      'hard_milliseconds_start', 'hard_pst_dt_start']


def pair_assessment_starts(tags_df, first_task, second_task):
    """Collapse one group's tag rows into a single row per participant.

    Within each participant, the row with the smallest `tags` value is taken as the start of
    `first_task` and the row with the largest `tags` value as the start of `second_task`.

    Args:
        tags_df: rows of `df` for one group; needs the columns `participant_id`, `group`,
            `tags` and `tags_pst`.
        first_task: 'easy' or 'hard', the assessment this group did first.
        second_task: 'easy' or 'hard', the assessment this group did second.

    Returns:
        DataFrame with the columns in START_TIME_COLUMNS, one row per participant.
    """
    tags_by_participant = tags_df.groupby('participant_id')['tags']

    def _starts_for(row_labels, task):
        return tags_df.loc[row_labels].rename(columns={
            'tags': f'{task}_milliseconds_start',
            'tags_pst': f'{task}_pst_dt_start',
        })

    first = _starts_for(tags_by_participant.idxmin(), first_task)
    # `group` is identical on both sides, so keep only the left-hand copy
    second = _starts_for(tags_by_participant.idxmax(), second_task).drop(columns='group')

    paired = pd.merge(first, second, on='participant_id', how='inner')
    return paired[START_TIME_COLUMNS]


# Label every row with its participant's group. Only participants that are explicitly listed get a label;
# an unlisted participant must not silently fall into 'H-E', because the group decides which tag is
# treated as the easy start and which as the hard start.
df['group'] = None
for label, participant_ids in group.items():
    df.loc[df['participant_id'].isin(participant_ids), 'group'] = label

unlisted = sorted(df.loc[df['group'].isna(), 'participant_id'].unique().tolist())
if unlisted:
    print(f'WARNING: participants {unlisted} are not listed in `group` and are excluded from the start times')

# E-H group did the easy task first; H-E group did the hard task first
EH_df = pair_assessment_starts(df[df['group'] == 'E-H'], first_task='easy', second_task='hard')
HE_df = pair_assessment_starts(df[df['group'] == 'H-E'], first_task='hard', second_task='easy')

# Combining dataframes together
df_start_times = pd.concat([EH_df, HE_df], ignore_index=True)
df_start_times.sort_values('participant_id')

Unnamed: 0,participant_id,group,easy_milliseconds_start,easy_pst_dt_start,hard_milliseconds_start,hard_pst_dt_start
0,4,E-H,1733272841738,04:40:41 PM,1733274157482,05:02:37 PM
1,5,E-H,1733335829002,10:10:29 AM,1733336914776,10:28:34 AM
2,6,E-H,1733369191361,07:26:31 PM,1733370114638,07:41:54 PM
3,7,E-H,1733427101020,11:31:41 AM,1733427840708,11:44:00 AM
4,8,E-H,1733435613872,01:53:33 PM,1733436646782,02:10:46 PM
5,9,E-H,1733442726685,03:52:06 PM,1733443508169,04:05:08 PM
6,10,E-H,1733450534889,06:02:14 PM,1733451588551,06:19:48 PM
7,11,E-H,1733513684093,11:34:44 AM,1733514638755,11:50:38 AM
8,12,E-H,1733520359538,01:25:59 PM,1733521431754,01:43:51 PM
9,13,E-H,1733779736982,01:28:56 PM,1733781073367,01:51:13 PM


During my studies, I also wrote down the time at which each participant started each assessment. After some validation, the written times match the times in the EmbracePlus tags ✅

# Assessment End Times

In Google forms, when a participant submits an assessment, the 'Timestamp' column is already in PST

In [None]:
# Load the Google Forms submissions for both assessments: keep the participant ID and the
# submission timestamp, and skip the first three response rows.
df_assessment_1_end_time = pd.read_csv(
    '/content/drive/MyDrive/CECS 698 - Data Analysis/Google Forms Sheets/Participants Sheet - Easy Assessment.csv'
).loc[:, ['Participant ID ', 'Timestamp']].iloc[3:]
df_assessment_2_end_time = pd.read_csv(
    '/content/drive/MyDrive/CECS 698 - Data Analysis/Google Forms Sheets/Participants Sheet - Hard Assessment.csv'
).loc[:, ['Participant ID ', 'Timestamp']].iloc[3:]


In [None]:
pacific_tz = pytz.timezone('US/Pacific')
UNIX_EPOCH_UTC = pd.Timestamp('1970-01-01', tz='UTC')


def load_assessment_end_times(csv_path, task):
    """Read one Google Forms sheet and derive the end time of `task` for each submission.

    Args:
        csv_path: path of the exported Google Forms sheet.
        task: 'easy' or 'hard'; used as the prefix of the generated columns.

    Returns:
        DataFrame with the columns 'Participant ID ', '<task>_milliseconds_end' (UNIX time in
        milliseconds, UTC) and '<task>_pst_dt_end' (Pacific time of day, HH:MM:SS AM/PM).
    """
    # The first three response rows are skipped (presumably pilot/test submissions - TODO confirm)
    responses = pd.read_csv(csv_path)[['Participant ID ', 'Timestamp']].iloc[3:].copy()

    # Parse the form timestamps and make them timezone aware. Unparseable, ambiguous or
    # nonexistent (DST) local times become NaT.
    submitted_local = (
        pd.to_datetime(responses['Timestamp'], errors='coerce')
        .dt.tz_localize(pacific_tz, ambiguous='NaT', nonexistent='NaT')
    )
    submitted_utc = submitted_local.dt.tz_convert('UTC')

    # UNIX time in milliseconds. Subtracting the epoch does not depend on the column's internal
    # datetime resolution (astype('int64') // 10**6 is only right for nanosecond columns), and a
    # NaT yields NaN instead of a huge negative number.
    responses[f'{task}_milliseconds_end'] = (submitted_utc - UNIX_EPOCH_UTC) // pd.Timedelta(milliseconds=1)

    # Non-military time of day (HH:MM:SS AM/PM) from the localized timestamp
    responses[f'{task}_pst_dt_end'] = submitted_local.dt.strftime('%I:%M:%S %p')

    # Drop unnecessary columns
    return responses.drop(columns='Timestamp')


df_assessment_1_end_time = load_assessment_end_times(
    '/content/drive/MyDrive/CECS 698 - Data Analysis/Google Forms Sheets/Participants Sheet - Easy Assessment.csv',
    'easy',
)
df_assessment_2_end_time = load_assessment_end_times(
    '/content/drive/MyDrive/CECS 698 - Data Analysis/Google Forms Sheets/Participants Sheet - Hard Assessment.csv',
    'hard',
)

# Merging both assessment times together (only participants that submitted both forms are kept)
df_end_times = pd.merge(df_assessment_1_end_time, df_assessment_2_end_time, on='Participant ID ', how='inner')

# Do some last-minute formatting
df_end_times = df_end_times.rename(columns={'Participant ID ': 'participant_id'})
df_end_times['participant_id'] = df_end_times['participant_id'].astype('int64') # Convert from float64 to int64
df_end_times = df_end_times[['participant_id', 'easy_pst_dt_end', 'hard_pst_dt_end', 'easy_milliseconds_end', 'hard_milliseconds_end']] # Rearrange columns

df_end_times.sort_values('participant_id')

Unnamed: 0,participant_id,easy_pst_dt_end,hard_pst_dt_end,easy_milliseconds_end,hard_milliseconds_end
0,4,04:51:56 PM,05:29:16 PM,1733273516000,1733275756000
1,5,10:18:56 AM,10:55:56 AM,1733336336000,1733338556000
2,6,07:33:34 PM,08:00:45 PM,1733369614000,1733371245000
3,7,11:38:15 AM,11:59:47 AM,1733427495000,1733428787000
4,8,02:01:52 PM,02:40:12 PM,1733436112000,1733438412000
5,9,03:58:45 PM,04:30:51 PM,1733443125000,1733445051000
6,10,06:11:34 PM,06:39:17 PM,1733451094000,1733452757000
7,11,11:41:05 AM,12:13:16 PM,1733514065000,1733515996000
8,12,01:35:15 PM,02:05:09 PM,1733520915000,1733522709000
9,13,01:43:02 PM,02:27:11 PM,1733780582000,1733783231000


# Merging start and end times

In [None]:
# Inner join: keeps only the participants present in both the start-time and the end-time tables
df_start_end_times = (
    df_start_times
    .merge(df_end_times, how='inner', on='participant_id')
    .sort_values('participant_id')
)
df_start_end_times

Unnamed: 0,participant_id,group,easy_milliseconds_start,easy_pst_dt_start,hard_milliseconds_start,hard_pst_dt_start,easy_pst_dt_end,hard_pst_dt_end,easy_milliseconds_end,hard_milliseconds_end
0,4,E-H,1733272841738,04:40:41 PM,1733274157482,05:02:37 PM,04:51:56 PM,05:29:16 PM,1733273516000,1733275756000
1,5,E-H,1733335829002,10:10:29 AM,1733336914776,10:28:34 AM,10:18:56 AM,10:55:56 AM,1733336336000,1733338556000
2,6,E-H,1733369191361,07:26:31 PM,1733370114638,07:41:54 PM,07:33:34 PM,08:00:45 PM,1733369614000,1733371245000
3,7,E-H,1733427101020,11:31:41 AM,1733427840708,11:44:00 AM,11:38:15 AM,11:59:47 AM,1733427495000,1733428787000
4,8,E-H,1733435613872,01:53:33 PM,1733436646782,02:10:46 PM,02:01:52 PM,02:40:12 PM,1733436112000,1733438412000
5,9,E-H,1733442726685,03:52:06 PM,1733443508169,04:05:08 PM,03:58:45 PM,04:30:51 PM,1733443125000,1733445051000
6,10,E-H,1733450534889,06:02:14 PM,1733451588551,06:19:48 PM,06:11:34 PM,06:39:17 PM,1733451094000,1733452757000
7,11,E-H,1733513684093,11:34:44 AM,1733514638755,11:50:38 AM,11:41:05 AM,12:13:16 PM,1733514065000,1733515996000
8,12,E-H,1733520359538,01:25:59 PM,1733521431754,01:43:51 PM,01:35:15 PM,02:05:09 PM,1733520915000,1733522709000
9,13,E-H,1733779736982,01:28:56 PM,1733781073367,01:51:13 PM,01:43:02 PM,02:27:11 PM,1733780582000,1733783231000


In [None]:
# df_start_end_times.to_csv(os.path.join('/content/drive/MyDrive/CECS 698 - Data Analysis', 'df_start_end_times.csv'))

The elapsed times for each assessment seem to make sense ✅

#### NOTE: Formatted timestamps are PST, while ms timestamps are UNIX

In [None]:
CECS_698_PATH = '/content/drive/MyDrive/CECS 698 - Data Analysis/'
PROGRAMMING_QUESTIONS_GP_EP_M2V24 = os.path.join(CECS_698_PATH, 'Programming_Questions_GP_EP_m2v24')

class BaseModifier():
    """Shared plumbing for trimming one participant's recordings from one device.

    On construction it resolves the participant's source folder for `device`, makes sure the
    '<device> Trimmed' output folder exists and prints the participant's expected assessment windows.
    """

    def __init__(self, participant, device):
        """
        Args:
            participant: participant ID as it appears in `df_start_end_times['participant_id']`.
            device: name of the device sub-folder inside the participant's folder.
        """
        self.participant = participant
        self.source_data_path = os.path.join(PROGRAMMING_QUESTIONS_GP_EP_M2V24, f'Participant {self.participant}', device)
        self.trimmed_folder_name = f'{device} Trimmed'
        self.mkdir_trimmed_data(self.trimmed_folder_name)
        self.display_participant()

    def display_participant(self):
        """Print the expected easy/hard start and end times of this participant.

        Raises:
            IndexError: if the participant has no row in `df_start_end_times`.
        """
        # Look the participant's row up once instead of re-filtering the frame for every column
        times = df_start_end_times[df_start_end_times['participant_id'] == self.participant].iloc[0]
        easy_start = times['easy_pst_dt_start']
        easy_end = times['easy_pst_dt_end']
        hard_start = times['hard_pst_dt_start']
        hard_end = times['hard_pst_dt_end']

        print(f'Participant {self.participant}:', end=' ')
        print(f'\033[94mEXPECTED [[Easy: {easy_start} - {easy_end} | Hard: {hard_start} - {hard_end}]]\033[0m')

    def mkdir_trimmed_data(self, sub_folder):
        """Create `sub_folder` under CECS_698_PATH for the trimmed data, if it does not exist already."""
        os.makedirs(os.path.join(CECS_698_PATH, sub_folder), exist_ok=True)

    def save_data(self, df_easy, df_hard, data_name):
        """Write the trimmed easy/hard frames to '<data_name>_easy.csv' and '<data_name>_hard.csv'.

        The files go into 'Participant <id>' inside the device's trimmed folder; the folder is
        created when missing.
        """
        new_participant_path = os.path.join(CECS_698_PATH, self.trimmed_folder_name, f'Participant {self.participant}')
        os.makedirs(new_participant_path, exist_ok=True)

        df_easy.to_csv(os.path.join(new_participant_path, f'{data_name}_easy.csv'), index=False)
        df_hard.to_csv(os.path.join(new_participant_path, f'{data_name}_hard.csv'), index=False)

    def read_files(self):
        """List the files in this participant's source folder.

        Returns:
            Sorted list with the full path of every entry in `self.source_data_path`, or an empty
            list when that folder does not exist. Subclasses override this method to actually
            load the files.
        """
        if not os.path.isdir(self.source_data_path):
            return []

        # The folder lives in `source_data_path`; there is no `path` attribute on this class
        return sorted(os.path.join(self.source_data_path, csv) for csv in os.listdir(self.source_data_path))

# Data Trimming: EmbracePlus ⌚

Initially, there was an issue with participant 9: EmbracePlus generated their data across December 5th and December 6th (hence 2 .csv files per biomarker). The 2 cells below concatenate their data, so they only need to be run **ONCE**.

In [None]:
# # Concatentating files
# participant_9_path = '/content/drive/MyDrive/CECS 698 - Data Analysis/Programming_Questions_GP_EP_m2v24/Participant 9/EmbracePlus'

# df_wearing_detection_05 = pd.read_csv(os.path.join(participant_9_path, '1-1-9_2024-12-05_wearing-detection.csv'))
# df_wearing_detection_06 = pd.read_csv(os.path.join(participant_9_path, '1-1-9_2024-12-06_wearing-detection.csv'))
# df_wearing_detection = pd.concat([df_wearing_detection_05, df_wearing_detection_06], ignore_index=True)

# df_temperature_05 = pd.read_csv(os.path.join(participant_9_path, '1-1-9_2024-12-05_temperature.csv'))
# df_temperature_06 = pd.read_csv(os.path.join(participant_9_path, '1-1-9_2024-12-06_temperature.csv'))
# df_temperature = pd.concat([df_temperature_05, df_temperature_06], ignore_index=True)

# df_respiratory_rate_05 = pd.read_csv(os.path.join(participant_9_path, '1-1-9_2024-12-05_respiratory-rate.csv'))
# df_respiratory_rate_06 = pd.read_csv(os.path.join(participant_9_path, '1-1-9_2024-12-06_respiratory-rate.csv'))
# df_respiratory_rate = pd.concat([df_respiratory_rate_05, df_respiratory_rate_06], ignore_index=True)

# df_pulse_rate_05 = pd.read_csv(os.path.join(participant_9_path, '1-1-9_2024-12-05_pulse-rate.csv'))
# df_pulse_rate_06 = pd.read_csv(os.path.join(participant_9_path, '1-1-9_2024-12-06_pulse-rate.csv'))
# df_pulse_rate = pd.concat([df_pulse_rate_05, df_pulse_rate_06], ignore_index=True)

# df_eda_05 = pd.read_csv(os.path.join(participant_9_path, '1-1-9_2024-12-05_eda.csv'))
# df_eda_06 = pd.read_csv(os.path.join(participant_9_path, '1-1-9_2024-12-06_eda.csv'))
# df_eda = pd.concat([df_eda_05, df_eda_06], ignore_index=True)

# df_prv_05 = pd.read_csv(os.path.join(participant_9_path, '1-1-9_2024-12-05_prv.csv'))
# df_prv_06 = pd.read_csv(os.path.join(participant_9_path, '1-1-9_2024-12-06_prv.csv'))
# df_prv = pd.concat([df_prv_05, df_prv_06], ignore_index=True)

In [None]:
# # Clearing all original files from participant 9's EmbracePlus
# old_files = os.listdir(participant_9_path)
# if len(old_files) > 0:
#     for f in old_files:
#         remove = os.path.join(participant_9_path, f)
#         os.remove(remove)

# # Reading all concattenated dataframes to .csv files
# df_wearing_detection.to_csv(os.path.join(participant_9_path, '1-1-9_2024-12-05_wearing-detection.csv'), index=False)
# df_temperature.to_csv(os.path.join(participant_9_path, '1-1-9_2024-12-05_temperature.csv'), index=False)
# df_respiratory_rate.to_csv(os.path.join(participant_9_path, '1-1-9_2024-12-05_respiratory-rate.csv'), index=False)
# df_pulse_rate.to_csv(os.path.join(participant_9_path, '1-1-9_2024-12-05_pulse-rate.csv'), index=False)
# df_eda.to_csv(os.path.join(participant_9_path, '1-1-9_2024-12-05_eda.csv'), index=False)
# df_prv.to_csv(os.path.join(participant_9_path, '1-1-9_2024-12-05_prv.csv'), index=False)

In [None]:
# prv.csv
# eda.csv
# temperature.csv
# pulse-rate.csv
# respiratory-rate.csv
# wearing-detection.csv


class EmbracePlusModifier(BaseModifier):
    """Loads one participant's EmbracePlus biomarker files and trims them to the assessment windows."""

    def __init__(self, participant):
        super().__init__(participant, 'EmbracePlus')
        self.all_data = self.read_files()

    def read_files(self):
        """Read every file in the participant's EmbracePlus folder.

        Returns:
            dict mapping the biomarker name (the part of the file name after the last '_' and
            before the first following '.') to its DataFrame. Empty when the folder does not exist.
        """
        all_data = dict()
        if os.path.isdir(self.source_data_path):
            for csv in os.listdir(self.source_data_path): # Loop through each .csv file of the participant
                biomarker_path = os.path.join(self.source_data_path, csv)
                biomarker_name = csv.split('_')[-1].split('.')[0]
                all_data[biomarker_name] = pd.read_csv(biomarker_path)

        return all_data

    def format_participant(self, df):
        """Replace `participant_full_id` with an integer `participant_id` column (modifies `df` in place).

        Originally, participant_full_id is in the format 1504-1-1-N (where N is the participant ID);
        only N is kept.
        """
        df['participant_full_id'] = df['participant_full_id'].str.split('-').str[-1].astype('int64')
        df.rename(columns={'participant_full_id': 'participant_id'}, inplace=True)
        return df

    def format_timestamps(self, df):
        """Add `timestamp_pst` (Pacific time of day) and drop `timestamp_iso` (modifies `df` in place).

        `timestamp_unix` is read as UNIX time in milliseconds.
        """
        pacific_timezone = pytz.timezone('US/Pacific')
        # Vectorised UNIX -> Pacific conversion, same approach as GazePointModifier.format_timestamps
        df['timestamp_pst'] = (
            pd.to_datetime(df['timestamp_unix'], unit='ms', utc=True)
            .dt.tz_convert(pacific_timezone)
            .dt.strftime('%I:%M:%S %p')
        )
        df.drop('timestamp_iso', axis=1, inplace=True)
        return df

    @staticmethod
    def _between(df, start_ms, end_ms):
        """Rows of `df` whose `timestamp_unix` lies in the inclusive range [start_ms, end_ms]."""
        return df[(df['timestamp_unix'] >= start_ms) & (df['timestamp_unix'] <= end_ms)]

    def trim(self, df):
        """Split `df` into the rows recorded during the easy and during the hard assessment.

        Returns:
            (df_easy_biomarkers, df_hard_biomarkers)

        Raises:
            IndexError: if the participant has no row in `df_start_end_times`.
        """
        times = df_start_end_times[df_start_end_times['participant_id'] == self.participant].iloc[0]

        # The window bounds are UNIX milliseconds, the same unit as `timestamp_unix`
        df_easy_biomarkers = self._between(df, times['easy_milliseconds_start'], times['easy_milliseconds_end'])
        df_hard_biomarkers = self._between(df, times['hard_milliseconds_start'], times['hard_milliseconds_end'])

        # Best-effort report of the trimmed range; an empty window fails on iloc[0] and is
        # reported here instead of aborting the run.
        try:
            print(f"Easy: {df_easy_biomarkers['timestamp_pst'].iloc[0]} - {df_easy_biomarkers['timestamp_pst'].iloc[-1]}", end=' | ')
            print(f"Hard: {df_hard_biomarkers['timestamp_pst'].iloc[0]} - {df_hard_biomarkers['timestamp_pst'].iloc[-1]}")
        except Exception as e:
            print(f"\x1b[31m\"[[ERROR ⚠]]: {e}\"\x1b[0m")

        return df_easy_biomarkers, df_hard_biomarkers

    def modify_all(self, biomarker):
        """Format, trim and save one biomarker of this participant.

        Args:
            biomarker: key of `self.all_data`, e.g. 'prv', 'eda', 'temperature'.

        Returns:
            (df_easy, df_hard): the trimmed frames that were written to disk.

        Raises:
            KeyError: if no file for `biomarker` was loaded.
        """
        print(f'\t Modifying {biomarker}...', end=' ')

        if biomarker not in self.all_data:
            raise KeyError(f"No '{biomarker}' file was loaded for participant {self.participant} from {self.source_data_path}")

        # Work on a copy: the format_* steps rename and drop columns in place, so formatting the
        # cached frame directly would make a second call for the same biomarker fail.
        df = self.all_data[biomarker].copy()

        df = self.format_participant(df)
        df = self.format_timestamps(df)
        df_easy, df_hard = self.trim(df)
        self.save_data(df_easy, df_hard, biomarker)
        return df_easy, df_hard


In [None]:
# Every sub-folder of the study folder; the participant ID is the last word of 'Participant <id>'
participant_folders = [
    entry
    for entry in os.listdir(PROGRAMMING_QUESTIONS_GP_EP_M2V24)
    if os.path.isdir(os.path.join(PROGRAMMING_QUESTIONS_GP_EP_M2V24, entry))
]
all_participants = sorted(
    int(folder.split(' ')[-1]) for folder in participant_folders if 'Participant' in folder
)

# Trim and save every EmbracePlus biomarker, one participant at a time
for i, participant in enumerate(all_participants):
    print(f"{i + 1}) ", end='')
    EP_mod = EmbracePlusModifier(participant)
    df_prv_easy, df_prv_hard = EP_mod.modify_all('prv')
    df_eda_easy, df_eda_hard = EP_mod.modify_all('eda')
    df_temperature_easy, df_temperature_hard = EP_mod.modify_all('temperature')
    df_pulse_rate_easy, df_pulse_rate_hard = EP_mod.modify_all('pulse-rate')
    df_respiratory_rate_easy, df_respiratory_rate_hard = EP_mod.modify_all('respiratory-rate')

1) Participant 4: [94mEXPECTED [[Easy: 04:40:41 PM - 04:51:56 PM | Hard: 05:02:37 PM - 05:29:16 PM]][0m
	 Modifying prv... Easy: 04:41:00 PM - 04:51:00 PM | Hard: 05:03:00 PM - 05:29:00 PM
	 Modifying eda... Easy: 04:41:00 PM - 04:51:00 PM | Hard: 05:03:00 PM - 05:29:00 PM
	 Modifying temperature... Easy: 04:41:00 PM - 04:51:00 PM | Hard: 05:03:00 PM - 05:29:00 PM
	 Modifying pulse-rate... Easy: 04:41:00 PM - 04:51:00 PM | Hard: 05:03:00 PM - 05:29:00 PM
	 Modifying respiratory-rate... Easy: 04:41:00 PM - 04:51:00 PM | Hard: 05:03:00 PM - 05:29:00 PM
2) Participant 5: [94mEXPECTED [[Easy: 10:10:29 AM - 10:18:56 AM | Hard: 10:28:34 AM - 10:55:56 AM]][0m
	 Modifying prv... Easy: 10:11:00 AM - 10:18:00 AM | Hard: 10:29:00 AM - 10:55:00 AM
	 Modifying eda... Easy: 10:11:00 AM - 10:18:00 AM | Hard: 10:29:00 AM - 10:55:00 AM
	 Modifying temperature... Easy: 10:11:00 AM - 10:18:00 AM | Hard: 10:29:00 AM - 10:55:00 AM
	 Modifying pulse-rate... Easy: 10:11:00 AM - 10:18:00 AM | Hard: 10:29:

# Data Trimming: GazePoint 👁

In [None]:
CECS_698_PATH = '/content/drive/MyDrive/CECS 698 - Data Analysis/'
PROGRAMMING_QUESTIONS_GP_EP_M2V24 = os.path.join(CECS_698_PATH, 'Programming_Questions_GP_EP_m2v24')

class GazePointModifier(BaseModifier):
    """Loads one participant's Gazepoint eye-tracking files and trims them to the assessment windows."""

    # Assessment number used in the file names -> task prefix used in `df_start_end_times`
    _ASSESSMENT_TASKS = {1: 'easy', 2: 'hard'}

    def __init__(self, participant):
        super().__init__(participant, 'Gazepoint')
        self.all_data = self.read_files()

    def read_files(self):
        """Read every file in the participant's Gazepoint folder.

        CSV file formats:
            'Participant # Assessment #_all_gaze.csv'
            'Participant # Assessment #_fixations.csv'

        Returns:
            dict keyed by '#_all_gaze' / '#_fixations' (last whitespace-separated token of the
            file name, without extension). Empty when the folder does not exist.
        """
        all_data = dict()

        if os.path.isdir(self.source_data_path):
            for csv in os.listdir(self.source_data_path): # Loop through each .csv file of the participant
                eyetrack_path = os.path.join(self.source_data_path, csv)
                all_data[csv.split()[-1].split('.')[0]] = pd.read_csv(eyetrack_path)

        return all_data

    def format_timestamps(self, df):
        """Add `timestamp_unix` (ms, UTC) and `timestamp_pst` (Pacific time of day) to `df` in place.

        The column at position 3 is expected to be named 'TIME(<YYYY/MM/DD HH:MM:SS.fff>)': the
        header carries the recording start (treated as Pacific local time) and the values are the
        seconds elapsed since that start. The original TIME(...) column is kept.
        """
        time_col = df.columns[3]
        start_time = time_col.replace("TIME(", "").replace(")", "")

        pst = pytz.timezone("US/Pacific")
        start_time = datetime.strptime(start_time, "%Y/%m/%d %H:%M:%S.%f") # Parse string to datetime
        start_time = pst.localize(start_time) # Localize to Pacific time to make it timezone aware
        # timestamp() of an aware datetime is already seconds since the UTC epoch
        base_time_unix_ms = int(start_time.timestamp() * 1000)

        # Elapsed seconds -> whole milliseconds (truncated), offset by the recording start
        elapsed_ms = (df[time_col].astype(float) * 1000).astype('int64')
        df['timestamp_unix'] = base_time_unix_ms + elapsed_ms
        df['timestamp_pst'] = pd.to_datetime(df['timestamp_unix'], unit='ms', utc=True).dt.tz_convert(pst).dt.strftime('%I:%M:%S %p')

        return df

    def trim(self, df, assessment):
        """Keep the rows of `df` recorded during the given assessment.

        Args:
            df: frame with a `timestamp_unix` column in milliseconds.
            assessment: 1 for the easy assessment, 2 for the hard assessment.

        Returns:
            The rows whose `timestamp_unix` lies in the inclusive start/end window of that
            assessment for this participant.

        Raises:
            ValueError: if `assessment` is neither 1 nor 2 (previously the frame came back untrimmed).
            IndexError: if the participant has no row in `df_start_end_times`.
        """
        if assessment not in self._ASSESSMENT_TASKS:
            raise ValueError(f'assessment must be 1 (easy) or 2 (hard), got {assessment!r}')
        task = self._ASSESSMENT_TASKS[assessment]

        times = df_start_end_times[df_start_end_times['participant_id'] == self.participant].iloc[0]
        start_ms = times[f'{task}_milliseconds_start']
        end_ms = times[f'{task}_milliseconds_end']

        return df[(df['timestamp_unix'] >= start_ms) & (df['timestamp_unix'] <= end_ms)]

    def _key_for(self, eye_data, assessment):
        """First key of `self.all_data` that contains both `eye_data` and the assessment number."""
        for key in self.all_data:
            if eye_data in key and str(assessment) in key:
                return key
        raise KeyError(f"No '{eye_data}' file for assessment {assessment} was loaded for participant {self.participant} from {self.source_data_path}")

    def modify_all(self, eye_data):
        """Format, trim and save one kind of eye-tracking data for both assessments.

        Args:
            eye_data: 'all_gaze' or 'fixations'.

        Returns:
            (df_easy, df_hard): the trimmed Assessment 1 and Assessment 2 frames that were saved.

        Raises:
            KeyError: if the file of either assessment was not loaded.
        """
        print(f'\t Modifying {eye_data}...', end=' ')

        # Keys: 1_all_gaze, 1_fixations, 2_all_gaze, 2_fixations.
        # Assessment 1 is the easy assessment and Assessment 2 the hard one.
        df_easy = self.all_data[self._key_for(eye_data, 1)]
        df_hard = self.all_data[self._key_for(eye_data, 2)]

        df_easy = self.trim(self.format_timestamps(df_easy), 1)
        df_hard = self.trim(self.format_timestamps(df_hard), 2)

        # Best-effort report of the trimmed range; an empty window fails on iloc[0] and is
        # reported here instead of aborting the run.
        try:
            print(f"Easy: {df_easy['timestamp_pst'].iloc[0]} - {df_easy['timestamp_pst'].iloc[-1]}", end=' | ')
            print(f"Hard: {df_hard['timestamp_pst'].iloc[0]} - {df_hard['timestamp_pst'].iloc[-1]}")
        except Exception as e:
            print(f"\x1b[31m\"[[ERROR ⚠]]: {e}\"\x1b[0m")

        self.save_data(df_easy, df_hard, eye_data)

        return df_easy, df_hard

In [None]:
# Every sub-folder of the study folder; the participant ID is the last word of 'Participant <id>'
participant_folders = [
    entry
    for entry in os.listdir(PROGRAMMING_QUESTIONS_GP_EP_M2V24)
    if os.path.isdir(os.path.join(PROGRAMMING_QUESTIONS_GP_EP_M2V24, entry))
]
all_participants = sorted(
    int(folder.split(' ')[-1]) for folder in participant_folders if 'Participant' in folder
)

# Trim and save both kinds of Gazepoint data, one participant at a time
for i, participant in enumerate(all_participants):
    print(f"{i + 1}) ", end='')
    GP_mod = GazePointModifier(participant)
    GP_mod.modify_all('all_gaze')
    GP_mod.modify_all('fixations')

1) Participant 4: [94mEXPECTED [[Easy: 04:40:41 PM - 04:51:56 PM | Hard: 05:02:37 PM - 05:29:16 PM]][0m
	 Modifying all_gaze... Easy: 04:40:41 PM - 04:51:55 PM | Hard: 05:02:37 PM - 05:29:15 PM
	 Modifying fixations... Easy: 04:40:41 PM - 04:51:55 PM | Hard: 05:02:37 PM - 05:29:15 PM
2) Participant 5: [94mEXPECTED [[Easy: 10:10:29 AM - 10:18:56 AM | Hard: 10:28:34 AM - 10:55:56 AM]][0m
	 Modifying all_gaze... Easy: 10:10:29 AM - 10:18:55 AM | Hard: 10:28:34 AM - 10:55:56 AM
	 Modifying fixations... Easy: 10:10:29 AM - 10:18:55 AM | Hard: 10:28:35 AM - 10:55:55 AM
3) Participant 6: [94mEXPECTED [[Easy: 07:26:31 PM - 07:33:34 PM | Hard: 07:41:54 PM - 08:00:45 PM]][0m
	 Modifying all_gaze... Easy: 07:26:31 PM - 07:33:34 PM | Hard: 07:41:54 PM - 08:00:44 PM
	 Modifying fixations... Easy: 07:26:32 PM - 07:33:33 PM | Hard: 07:41:55 PM - 08:00:44 PM
4) Participant 7: [94mEXPECTED [[Easy: 11:31:41 AM - 11:38:15 AM | Hard: 11:44:00 AM - 11:59:47 AM]][0m
	 Modifying all_gaze... Easy: 11: