In [None]:
import pandas as pd
import os
import numpy as np
from datetime import datetime, timedelta
import pytz

from sklearn.impute import SimpleImputer

Global variables

In [None]:
# Root folder of the project on Google Drive; every other path is built from it.
CECS_698_PATH = '/content/drive/MyDrive/CECS 698 - Data Analysis/'

# EmbracePlus directories
EMBRACEPLUS_TRIMMED_PATH = os.path.join(CECS_698_PATH, 'EmbracePlus Trimmed')
EMBRACEPLUS_COMBINED_PATH = os.path.join(CECS_698_PATH, 'EmbracePlus Combined')

# Gazepoint directories
GAZEPOINT_TRIMMED_PATH = os.path.join(CECS_698_PATH, 'Gazepoint Trimmed')
GAZEPOINT_TUMBLING_WINDOW_DGM = os.path.join(CECS_698_PATH, 'Gazepoint Tumbling Window DGM')
GAZEPOINT_DGM_TIMESTAMP_LABELED = os.path.join(CECS_698_PATH, 'Gazepoint DGM Timestamp Labeled')

# Output directory for the merged EmbracePlus + Gazepoint files
MERGED_DATA = os.path.join(CECS_698_PATH, 'Merged Data')

# Participant ids 4 through 26 (inclusive)
PARTICIPANTS = list(range(4, 27))

# Participant ids to exclude on specific assessments
EASY_ASSESSMENT_EXCLUDE = [11, 22]
HARD_ASSESSMENT_EXCLUDE = [15, 21]

# EmbracePlus ⌚

## Combining Datasets

In [None]:
# Combine each participant's per-signal EmbracePlus CSVs into one CSV per assessment.
# Output root: '/content/drive/MyDrive/CECS 698 - Data Analysis/EmbracePlus Combined'
os.makedirs(EMBRACEPLUS_COMBINED_PATH, exist_ok=True)

# (file-name stem, columns kept) for every signal, in output column order.
# Only the EDA file contributes 'timestamp_unix'.
# The prv ('prv_rmssd_ms') and respiratory-rate ('respiratory_rate_brpm') signals
# are currently left out; add an entry here to include them.
signal_specs = [
    ('eda', ['timestamp_unix', 'eda_scl_usiemens']),
    ('pulse-rate', ['pulse_rate_bpm']),
    ('temperature', ['temperature_celsius']),
]

if os.path.exists(EMBRACEPLUS_TRIMMED_PATH):
    for p in PARTICIPANTS:
        print(f"Participant {p}")

        source = os.path.join(EMBRACEPLUS_TRIMMED_PATH, f'Participant {p}')
        # One subdirectory per participant inside '/EmbracePlus Combined'
        dest = os.path.join(EMBRACEPLUS_COMBINED_PATH, f'Participant {p}')
        os.makedirs(dest, exist_ok=True)

        # Same pipeline for both assessments; only the file-name suffix differs.
        for assessment in ('easy', 'hard'):
            signal_frames = [
                pd.read_csv(os.path.join(source, f'{stem}_{assessment}.csv'))[columns]
                for stem, columns in signal_specs
            ]

            # axis=1 places the frames side by side, matching rows by position.
            # NOTE(review): assumes row i of every signal file is the same sample - confirm.
            combined = pd.concat(signal_frames, axis=1)
            combined.to_csv(os.path.join(dest, f'EmbracePlus_{assessment}_combined.csv'), index=False)
            print(f"\tSuccessfully saved combined {assessment} data into {dest}")

        # Blank line between participants in the log
        print()

else:
    print(f"{EMBRACEPLUS_TRIMMED_PATH} does not exist.")

Participant 4
	Successfully saved combined easy data into /content/drive/MyDrive/CECS 698 - Data Analysis/EmbracePlus Combined/Participant 4
	Successfully saved combined hard data into /content/drive/MyDrive/CECS 698 - Data Analysis/EmbracePlus Combined/Participant 4

Participant 5
	Successfully saved combined easy data into /content/drive/MyDrive/CECS 698 - Data Analysis/EmbracePlus Combined/Participant 5
	Successfully saved combined hard data into /content/drive/MyDrive/CECS 698 - Data Analysis/EmbracePlus Combined/Participant 5

Participant 6
	Successfully saved combined easy data into /content/drive/MyDrive/CECS 698 - Data Analysis/EmbracePlus Combined/Participant 6
	Successfully saved combined hard data into /content/drive/MyDrive/CECS 698 - Data Analysis/EmbracePlus Combined/Participant 6

Participant 7
	Successfully saved combined easy data into /content/drive/MyDrive/CECS 698 - Data Analysis/EmbracePlus Combined/Participant 7
	Successfully saved combined hard data into /content

## Viewing Null Values

In [None]:
# For every combined EmbracePlus file, list the columns that contain nulls
# together with their null count out of the total row count.
for participant in os.listdir(EMBRACEPLUS_COMBINED_PATH):
    print(f"{participant}:")
    participant_path = os.path.join(EMBRACEPLUS_COMBINED_PATH, participant)

    for csv_name in os.listdir(participant_path):
        print(f"\t{csv_name}", end=" ")
        frame = pd.read_csv(os.path.join(participant_path, csv_name))
        n_rows = len(frame)

        # Null count per column, restricted to columns with at least one null
        null_counts = frame.isnull().sum()
        null_counts = null_counts[null_counts > 0]

        # "<column> (<nulls>/<rows>)" entries, comma separated; empty when there are no nulls
        summary = ", ".join(
            f"{column} ({count}/{n_rows})" for column, count in null_counts.items()
        )

        print(f"Columns with null values: {summary}")

Participant 4:
	EmbracePlus_easy_combined.csv Columns with null values: 
	EmbracePlus_hard_combined.csv Columns with null values: 
Participant 5:
	EmbracePlus_easy_combined.csv Columns with null values: 
	EmbracePlus_hard_combined.csv Columns with null values: 
Participant 6:
	EmbracePlus_easy_combined.csv Columns with null values: 
	EmbracePlus_hard_combined.csv Columns with null values: 
Participant 7:
	EmbracePlus_easy_combined.csv Columns with null values: 
	EmbracePlus_hard_combined.csv Columns with null values: 
Participant 8:
	EmbracePlus_easy_combined.csv Columns with null values: 
	EmbracePlus_hard_combined.csv Columns with null values: 
Participant 9:
	EmbracePlus_easy_combined.csv Columns with null values: 
	EmbracePlus_hard_combined.csv Columns with null values: 
Participant 10:
	EmbracePlus_hard_combined.csv Columns with null values: 
	EmbracePlus_easy_combined.csv Columns with null values: 
Participant 11:
	EmbracePlus_easy_combined.csv Columns with null values: 
	Embrace

So far, participants 20 (Hard) and 23 (Easy) have null values. We will impute these null values with the column median using scikit-learn's `SimpleImputer`.

In [None]:
# Median-impute the nulls in Participant 20's hard-assessment file and overwrite it in place.
# The path is built from the shared constant so it follows any change to the project root.
p20_hard_path = os.path.join(EMBRACEPLUS_COMBINED_PATH, 'Participant 20', 'EmbracePlus_hard_combined.csv')
df = pd.read_csv(p20_hard_path)

imputer = SimpleImputer(strategy='median')
imputed_data = imputer.fit_transform(df)
# The imputed values are wrapped back into a DataFrame with the original column labels
df = pd.DataFrame(imputed_data, columns=df.columns)

df.to_csv(p20_hard_path, index=False)

In [None]:
# Median-impute the nulls in Participant 23's easy-assessment file and overwrite it in place.
# The path is built from the shared constant so it follows any change to the project root.
p23_easy_path = os.path.join(EMBRACEPLUS_COMBINED_PATH, 'Participant 23', 'EmbracePlus_easy_combined.csv')
df = pd.read_csv(p23_easy_path)

imputer = SimpleImputer(strategy='median')
imputed_data = imputer.fit_transform(df)
# The imputed values are wrapped back into a DataFrame with the original column labels
df = pd.DataFrame(imputed_data, columns=df.columns)

df.to_csv(p23_easy_path, index=False)

In [None]:
# Re-run the null report on the combined EmbracePlus files after imputation.
for participant in os.listdir(EMBRACEPLUS_COMBINED_PATH):
    print(f"{participant}:")
    participant_path = os.path.join(EMBRACEPLUS_COMBINED_PATH, participant)

    for csv_name in os.listdir(participant_path):
        print(f"\t{csv_name}", end=" ")
        frame = pd.read_csv(os.path.join(participant_path, csv_name))
        n_rows = len(frame)

        # Null count per column, restricted to columns with at least one null
        null_counts = frame.isnull().sum()
        null_counts = null_counts[null_counts > 0]

        # "<column> (<nulls>/<rows>)" entries, comma separated; empty when there are no nulls
        summary = ", ".join(
            f"{column} ({count}/{n_rows})" for column, count in null_counts.items()
        )

        print(f"Columns with null values: {summary}")

Participant 4:
	EmbracePlus_easy_combined.csv Columns with null values: 
	EmbracePlus_hard_combined.csv Columns with null values: 
Participant 5:
	EmbracePlus_easy_combined.csv Columns with null values: 
	EmbracePlus_hard_combined.csv Columns with null values: 
Participant 6:
	EmbracePlus_easy_combined.csv Columns with null values: 
	EmbracePlus_hard_combined.csv Columns with null values: 
Participant 7:
	EmbracePlus_easy_combined.csv Columns with null values: 
	EmbracePlus_hard_combined.csv Columns with null values: 
Participant 8:
	EmbracePlus_easy_combined.csv Columns with null values: 
	EmbracePlus_hard_combined.csv Columns with null values: 
Participant 9:
	EmbracePlus_easy_combined.csv Columns with null values: 
	EmbracePlus_hard_combined.csv Columns with null values: 
Participant 10:
	EmbracePlus_hard_combined.csv Columns with null values: 
	EmbracePlus_easy_combined.csv Columns with null values: 
Participant 11:
	EmbracePlus_easy_combined.csv Columns with null values: 
	Embrace

# Gazepoint 👁

## Labeling timestamps on all_window_DGMs file

I've taken the trimmed gaze data from the `/Gazepoint Trimmed` directory and manually run all of it through [BEACH Gaze](https://github.com/TheD2Lab/BEACH-Gaze) to get Descriptive Gaze Measures (DGMs) using the Tumbling Window approach (window size: 10 seconds). All of the data generated by BEACH Gaze is stored in the `/Gazepoint Tumbling Window DGM` directory, which contains subdirectories per participant, per assessment.
 - (Ex:  `/Gazepoint Tumbling Window DGM/Participant 10/results/all_gaze_easy` and `/Gazepoint Tumbling Window DGM/Participant 10/results/all_gaze_hard`)

The script below is used to add timestamps to each `all_window_DGMs.csv` file per participant.

⚠ **WARNING: THIS WILL TAKE A WHILE TO RUN. RUN IT ONLY ONCE — SKIP THIS CELL IF THE FILES IT CREATES ALREADY EXIST**

In [None]:
# NOTE(review): `all_participants` is not used in this cell; kept in case another cell relies on it.
all_participants = os.listdir(GAZEPOINT_TUMBLING_WINDOW_DGM)
all_participants = sorted(all_participants, key=lambda x: int(x.replace("Participant ", "")))


# File path created to save modified data
os.makedirs(GAZEPOINT_DGM_TIMESTAMP_LABELED, exist_ok=True)

# -------------------- Labeling DGM files with timestamp_pst and timestamp_unix --------------------
for p in PARTICIPANTS:
    print(f"Participant {p}")

    # Creating subdirectory per participant
    labeled_participant_dir = os.path.join(GAZEPOINT_DGM_TIMESTAMP_LABELED, f'Participant {p}')
    os.makedirs(labeled_participant_dir, exist_ok=True)

    results_dir = os.path.join(GAZEPOINT_TUMBLING_WINDOW_DGM, f'Participant {p}', 'results')
    for all_gaze_dir in os.listdir(results_dir):
        tumbling_dir = os.path.join(results_dir, all_gaze_dir, 'tumbling')
        print(f"\t{tumbling_dir}")

        # Dataframe contains combination of ALL windows together
        all_window_DGMs_df = pd.read_csv(os.path.join(tumbling_dir, 'all_window_DGMs.csv'))

        # Everything in the tumbling directory except the summary file is a window folder
        windows_dirs = os.listdir(tumbling_dir)
        windows_dirs.remove('all_window_DGMs.csv')
        sorted_windows_dirs = sorted(windows_dirs, key=lambda x: int(x.replace("window", ""))) # window1, window2, window3, etc...

        # Getting the last timestamp (UNIX and PST) from each window
        window = []
        timestamp_unix = []
        timestamp_pst = []
        for i, w in enumerate(sorted_windows_dirs):
            window_df = pd.read_csv(os.path.join(tumbling_dir, w, f'window{i + 1}.csv'))
            last_sample = window_df.iloc[-1]

            window.append(w)
            timestamp_unix.append(last_sample['timestamp_unix'])
            timestamp_pst.append(last_sample['timestamp_pst'])

        # Built once, after the loop, and always from this assessment's lists.
        # This avoids rebuilding the frame on every window and prevents reusing a
        # previous assessment's frame when there are no window folders.
        added_columns = pd.DataFrame({
            'window': window,
            'timestamp_unix': timestamp_unix,
            'timestamp_pst': timestamp_pst
        })

        # The side-by-side concat matches rows by position, so a count mismatch
        # produces null cells instead of an error. Report it rather than hide it.
        if len(added_columns) != len(all_window_DGMs_df):
            print(f"\t\tWARNING: {len(added_columns)} window folders but {len(all_window_DGMs_df)} DGM rows; unmatched rows will contain nulls")

        all_window_DGMs_df_mod = pd.concat([all_window_DGMs_df, added_columns], axis=1)

        # Save all_window_DGMs_df_mod into '/Gazepoint DGM Timestamp Labeled'.
        # The folder name itself identifies the assessment; any other folder name is not saved.
        if all_gaze_dir in ('all_gaze_hard', 'all_gaze_easy'):
            all_window_DGMs_df_mod.to_csv(os.path.join(labeled_participant_dir, f'{all_gaze_dir}.csv'), index=False)
            print(f'\t\tSuccessfully saved {all_gaze_dir}.csv', end=' ')

        # First and last labeled window time, printed in blue
        print(f"\033[94m [[{all_window_DGMs_df_mod.iloc[0]['timestamp_pst']} - {all_window_DGMs_df_mod.iloc[-1]['timestamp_pst']}]] \033[0m")

Participant 4
	/content/drive/MyDrive/CECS 698 - Data Analysis/Gazepoint Tumbling Window DGM/Participant 4/results/all_gaze_hard/tumbling
		Successfully saved all_gaze_hard.csv [94m [[05:02:47 PM - 05:29:15 PM]] [0m
	/content/drive/MyDrive/CECS 698 - Data Analysis/Gazepoint Tumbling Window DGM/Participant 4/results/all_gaze_easy/tumbling
		Successfully saved all_gaze_easy.csv [94m [[04:40:51 PM - 04:51:55 PM]] [0m
Participant 5
	/content/drive/MyDrive/CECS 698 - Data Analysis/Gazepoint Tumbling Window DGM/Participant 5/results/all_gaze_hard/tumbling
		Successfully saved all_gaze_hard.csv [94m [[10:28:44 AM - 10:55:56 AM]] [0m
	/content/drive/MyDrive/CECS 698 - Data Analysis/Gazepoint Tumbling Window DGM/Participant 5/results/all_gaze_easy/tumbling
		Successfully saved all_gaze_easy.csv [94m [[10:10:38 AM - 10:18:55 AM]] [0m
Participant 6
	/content/drive/MyDrive/CECS 698 - Data Analysis/Gazepoint Tumbling Window DGM/Participant 6/results/all_gaze_easy/tumbling
		Successfully sav

## Getting rid of irrelevant columns

In [None]:
# Columns removed from every timestamp-labeled Gazepoint file
deleted_columns = [
    'total_number_of_l_mouse_clicks',
    'beginning_timestamp',
    'ending_timestamp',
    'window_duration',
    'initial_seconds_elapsed_since_start',
    'final_seconds_elapsed_since_start',
    'timestamp_pst'
]

for participant_dir in os.listdir(GAZEPOINT_DGM_TIMESTAMP_LABELED):
    participant_path = os.path.join(GAZEPOINT_DGM_TIMESTAMP_LABELED, participant_dir)
    for file in os.listdir(participant_path):
        file_path = os.path.join(participant_path, file)
        df = pd.read_csv(file_path)

        # The files are overwritten in place, so on a second run the columns are
        # already gone. errors='ignore' makes the cell safe to re-run instead of
        # raising KeyError.
        df = df.drop(columns=deleted_columns, errors='ignore')

        # Override current files with new files after dropping columns
        df.to_csv(file_path, index=False)

## Getting rid of null values

In [None]:
# For every timestamp-labeled Gazepoint file, print how many columns contain nulls.
for participant_dir in os.listdir(GAZEPOINT_DGM_TIMESTAMP_LABELED):
    print(f"{participant_dir}:")
    participant_path = os.path.join(GAZEPOINT_DGM_TIMESTAMP_LABELED, participant_dir)

    for file in os.listdir(participant_path):
        print(f"\t{file}", end=" ")
        df = pd.read_csv(os.path.join(participant_path, file))

        # Null count per column, restricted to columns with at least one null
        null_counts = df.isnull().sum()
        columns_with_nulls = null_counts[null_counts > 0]

        print(f"{len(columns_with_nulls)} columns with null values")

Participant 4:
	all_gaze_hard.csv 5 columns with null values
	all_gaze_easy.csv 42 columns with null values
Participant 5:
	all_gaze_hard.csv 0 columns with null values
	all_gaze_easy.csv 0 columns with null values
Participant 6:
	all_gaze_hard.csv 1 columns with null values
	all_gaze_easy.csv 0 columns with null values
Participant 7:
	all_gaze_easy.csv 0 columns with null values
	all_gaze_hard.csv 0 columns with null values
Participant 8:
	all_gaze_hard.csv 0 columns with null values
	all_gaze_easy.csv 1 columns with null values
Participant 9:
	all_gaze_easy.csv 0 columns with null values
	all_gaze_hard.csv 0 columns with null values
Participant 10:
	all_gaze_hard.csv 45 columns with null values
	all_gaze_easy.csv 45 columns with null values
Participant 11:
	all_gaze_easy.csv 42 columns with null values
	all_gaze_hard.csv 42 columns with null values
Participant 12:
	all_gaze_hard.csv 0 columns with null values
	all_gaze_easy.csv 0 columns with null values
Participant 13:
	all_gaze_eas

There are many columns with null values. Looking at the info of all columns, every column (except for `window` and `timestamp_pst`) is `float64` or `int64`, so we can impute the null values with each column's median.

In [None]:
# Inspect dtypes and non-null counts of one labeled file (Participant 10, easy assessment).
# The path is built from the shared constant instead of repeating the absolute path.
ex_df = pd.read_csv(os.path.join(GAZEPOINT_DGM_TIMESTAMP_LABELED, 'Participant 10', 'all_gaze_easy.csv'))
ex_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56 entries, 0 to 55
Data columns (total 63 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   total_number_of_fixations         56 non-null     int64  
 1   sum_of_all_fixation_duration_s    56 non-null     float64
 2   mean_fixation_duration_s          54 non-null     float64
 3   median_fixation_duration_s        54 non-null     float64
 4   stdev_of_fixation_durations_s     54 non-null     float64
 5   min_fixation_duration_s           54 non-null     float64
 6   max_fixation_duration_s           54 non-null     float64
 7   total_number_of_saccades          56 non-null     int64  
 8   sum_of_all_saccade_lengths        56 non-null     float64
 9   mean_saccade_length               54 non-null     float64
 10  median_saccade_length             54 non-null     float64
 11  stdev_of_saccade_lengths          54 non-null     float64
 12  min_saccad

In [None]:
# Getting rid of null values
imputer = SimpleImputer(strategy='median')

for participant_dir in os.listdir(GAZEPOINT_DGM_TIMESTAMP_LABELED):
    for file in os.listdir(os.path.join(GAZEPOINT_DGM_TIMESTAMP_LABELED, participant_dir)):
        df = pd.read_csv(os.path.join(GAZEPOINT_DGM_TIMESTAMP_LABELED, participant_dir, file))

        numeric_cols = df.select_dtypes(include=['number']).columns
        df[numeric_cols] = imputer.fit_transform(df[numeric_cols])

        # Override current files with new files after imputing null values
        df.to_csv(os.path.join(GAZEPOINT_DGM_TIMESTAMP_LABELED, participant_dir, file), index=False)

In [None]:
# Re-check the labeled Gazepoint files after imputation: print how many columns still contain nulls.
for participant_dir in os.listdir(GAZEPOINT_DGM_TIMESTAMP_LABELED):
    print(f"{participant_dir}:")
    participant_path = os.path.join(GAZEPOINT_DGM_TIMESTAMP_LABELED, participant_dir)

    for file in os.listdir(participant_path):
        print(f"\t{file}", end=" ")
        df = pd.read_csv(os.path.join(participant_path, file))

        # Null count per column, restricted to columns with at least one null
        null_counts = df.isnull().sum()
        columns_with_nulls = null_counts[null_counts > 0]

        print(f"{len(columns_with_nulls)} columns with null values")

Participant 4:
	all_gaze_hard.csv 0 columns with null values
	all_gaze_easy.csv 0 columns with null values
Participant 5:
	all_gaze_hard.csv 0 columns with null values
	all_gaze_easy.csv 0 columns with null values
Participant 6:
	all_gaze_hard.csv 0 columns with null values
	all_gaze_easy.csv 0 columns with null values
Participant 7:
	all_gaze_easy.csv 0 columns with null values
	all_gaze_hard.csv 0 columns with null values
Participant 8:
	all_gaze_hard.csv 0 columns with null values
	all_gaze_easy.csv 0 columns with null values
Participant 9:
	all_gaze_easy.csv 0 columns with null values
	all_gaze_hard.csv 0 columns with null values
Participant 10:
	all_gaze_hard.csv 0 columns with null values
	all_gaze_easy.csv 0 columns with null values
Participant 11:
	all_gaze_easy.csv 0 columns with null values
	all_gaze_hard.csv 0 columns with null values
Participant 12:
	all_gaze_hard.csv 0 columns with null values
	all_gaze_easy.csv 0 columns with null values
Participant 13:
	all_gaze_easy.csv

# Merging both EmbracePlus and Gazepoint Data 🛠

NOTE: Some null values remain after merging because the two datasets start at different times, even after interpolation. Specifically, the rows that come from the earlier-starting dataset have no values in the later-starting dataset's columns until that dataset begins, and interpolation cannot fill these leading gaps. This is why we use `dropna()`, as seen below.

In [None]:
# File path created to save merged data
os.makedirs(MERGED_DATA, exist_ok=True)

# Merging data for all participants
# NOTE(review): `pacific_timezone` is not used in this cell; kept in case another cell relies on it.
pacific_timezone = pytz.timezone('US/Pacific')

for p in PARTICIPANTS:
    print(f"Participant {p}")

    participant_dir = os.path.join(MERGED_DATA, f"Participant {p}")
    os.makedirs(participant_dir, exist_ok=True)

    # Same pipeline for both assessments; only the file names differ.
    for assessment in ('easy', 'hard'):
        # -------------------- Merging datasets during this assessment --------------------
        ep_df = pd.read_csv(os.path.join(EMBRACEPLUS_COMBINED_PATH, f'Participant {p}', f'EmbracePlus_{assessment}_combined.csv'))
        gp_df = pd.read_csv(os.path.join(GAZEPOINT_DGM_TIMESTAMP_LABELED, f'Participant {p}', f'all_gaze_{assessment}.csv'))

        # The outer merge keeps every timestamp from either source.
        merged = pd.merge(ep_df, gp_df, on="timestamp_unix", how="outer")
        merged = merged.drop(columns='window') # No longer need this column, as it was used for validation

        # The interpolation below fills gaps by row position, so the time ordering
        # is made explicit here rather than relying on the merge's output order.
        # The stable sort leaves already-ordered rows untouched.
        merged = merged.sort_values('timestamp_unix', kind='stable')

        numeric_cols = merged.select_dtypes(include=['number']).columns
        merged[numeric_cols] = merged[numeric_cols].interpolate()

        # Dropping rows that still contain null values
        # (trims the start of the dataframe where one source has not begun yet)
        merged = merged.dropna().reset_index(drop=True)

        merged.to_csv(os.path.join(participant_dir, f"participant{p}_{assessment}_assessment.csv"), index=False)
        print(f'\t\tSuccessfully merged {assessment} asessment data')

Participant 4
		Successfully merged easy asessment data
		Successfully merged hard asessment data
Participant 5
		Successfully merged easy asessment data
		Successfully merged hard asessment data
Participant 6
		Successfully merged easy asessment data
		Successfully merged hard asessment data
Participant 7
		Successfully merged easy asessment data
		Successfully merged hard asessment data
Participant 8
		Successfully merged easy asessment data
		Successfully merged hard asessment data
Participant 9
		Successfully merged easy asessment data
		Successfully merged hard asessment data
Participant 10
		Successfully merged easy asessment data
		Successfully merged hard asessment data
Participant 11
		Successfully merged easy asessment data
		Successfully merged hard asessment data
Participant 12
		Successfully merged easy asessment data
		Successfully merged hard asessment data
Participant 13
		Successfully merged easy asessment data
		Successfully merged hard asessment data
Participant 14
	

Validating start and end times for all merged data files

In [None]:
# Print the shape and the first/last Pacific-time timestamp of every merged file.
pacific_timezone = pytz.timezone('US/Pacific')

for participant in os.listdir(MERGED_DATA):
    print(participant)
    participant_path = os.path.join(MERGED_DATA, participant)

    for file_name in os.listdir(participant_path):
        print(f"\t{file_name}", end=" ")
        merged_df = pd.read_csv(os.path.join(participant_path, file_name))

        # timestamp_unix is divided by 1000 before conversion (treated as milliseconds);
        # the result is formatted as a 12-hour clock string in Pacific time.
        merged_df['timestamp_pst'] = [
            datetime.fromtimestamp(unix_ms / 1000, pacific_timezone).strftime('%I:%M:%S %p')
            for unix_ms in merged_df['timestamp_unix']
        ]

        start = merged_df['timestamp_pst'].iloc[0]
        end = merged_df['timestamp_pst'].iloc[-1]

        # Shape in green, time range in blue
        print(f"Shape: \033[32m {merged_df.shape} \033[0m | Times: \033[94m [[{start} - {end}]] \033[0m")

Participant 4
	participant4_easy_assessment.csv Shape: [32m (78, 66) [0m | Times: [94m [[04:41:00 PM - 04:51:55 PM]] [0m
	participant4_hard_assessment.csv Shape: [32m (185, 66) [0m | Times: [94m [[05:03:00 PM - 05:29:15 PM]] [0m
Participant 5
	participant5_easy_assessment.csv Shape: [32m (56, 66) [0m | Times: [94m [[10:11:00 AM - 10:18:55 AM]] [0m
	participant5_hard_assessment.csv Shape: [32m (190, 66) [0m | Times: [94m [[10:29:00 AM - 10:55:56 AM]] [0m
Participant 6
	participant6_easy_assessment.csv Shape: [32m (48, 66) [0m | Times: [94m [[07:27:00 PM - 07:33:34 PM]] [0m
	participant6_hard_assessment.csv Shape: [32m (132, 66) [0m | Times: [94m [[07:42:04 PM - 08:00:44 PM]] [0m
Participant 7
	participant7_easy_assessment.csv Shape: [32m (46, 66) [0m | Times: [94m [[11:32:00 AM - 11:38:14 AM]] [0m
	participant7_hard_assessment.csv Shape: [32m (105, 66) [0m | Times: [94m [[11:45:00 AM - 11:59:46 AM]] [0m
Participant 8
	participant8_easy_assessment.csv Shap

Checking for null data after merging:

In [None]:
# For every merged file, list the columns that contain nulls together with
# their null count out of the total row count.
for participant in os.listdir(MERGED_DATA):
    print(participant)
    participant_path = os.path.join(MERGED_DATA, participant)

    for csv_name in os.listdir(participant_path):
        print(f"\t{csv_name}", end=" ")
        frame = pd.read_csv(os.path.join(participant_path, csv_name))
        n_rows = len(frame)

        # Null count per column, restricted to columns with at least one null
        null_counts = frame.isnull().sum()
        null_counts = null_counts[null_counts > 0]

        # "<column> (<nulls>/<rows>)" entries, comma separated; empty when there are no nulls
        summary = ", ".join(
            f"{column} ({count}/{n_rows})" for column, count in null_counts.items()
        )

        print(f"Columns with null values: {summary}")

Participant 4
	participant4_easy_assessment.csv Columns with null values: 
	participant4_hard_assessment.csv Columns with null values: 
Participant 5
	participant5_easy_assessment.csv Columns with null values: 
	participant5_hard_assessment.csv Columns with null values: 
Participant 6
	participant6_easy_assessment.csv Columns with null values: 
	participant6_hard_assessment.csv Columns with null values: 
Participant 7
	participant7_easy_assessment.csv Columns with null values: 
	participant7_hard_assessment.csv Columns with null values: 
Participant 8
	participant8_easy_assessment.csv Columns with null values: 
	participant8_hard_assessment.csv Columns with null values: 
Participant 9
	participant9_easy_assessment.csv Columns with null values: 
	participant9_hard_assessment.csv Columns with null values: 
Participant 10
	participant10_easy_assessment.csv Columns with null values: 
	participant10_hard_assessment.csv Columns with null values: 
Participant 11
	participant11_easy_assessment