In [1]:
#timestamp matched data by prev code
!unzip '/content/drive/MyDrive/Shovons_data (2).zip'

Archive:  /content/drive/MyDrive/Shovons_data (2).zip
   creating: Shovons_data/Beach/
   creating: Shovons_data/Beach/Processed_EDA/
  inflating: Shovons_data/Beach/Processed_EDA/1.csv  
  inflating: Shovons_data/Beach/Processed_EDA/10.csv  
  inflating: Shovons_data/Beach/Processed_EDA/11.csv  
  inflating: Shovons_data/Beach/Processed_EDA/12.csv  
  inflating: Shovons_data/Beach/Processed_EDA/13.csv  
  inflating: Shovons_data/Beach/Processed_EDA/14.csv  
  inflating: Shovons_data/Beach/Processed_EDA/15.csv  
  inflating: Shovons_data/Beach/Processed_EDA/16.csv  
  inflating: Shovons_data/Beach/Processed_EDA/17.csv  
  inflating: Shovons_data/Beach/Processed_EDA/18.csv  
  inflating: Shovons_data/Beach/Processed_EDA/19.csv  
  inflating: Shovons_data/Beach/Processed_EDA/2.csv  
  inflating: Shovons_data/Beach/Processed_EDA/20.csv  
  inflating: Shovons_data/Beach/Processed_EDA/21.csv  
  inflating: Shovons_data/Beach/Processed_EDA/22.csv  
  inflating: Shovons_data/Beach/Processed_E

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import glob
import os
from scipy.signal import resample


In [6]:

def resample_EDA(df, original_freq, new_freq):
    # Ensure 'datetime_texas_reconverted' is in datetime format and set it as the index
    if not pd.api.types.is_datetime64_any_dtype(df['datetime_texas_reconverted']):
        df['datetime_texas_reconverted'] = pd.to_datetime(df['datetime_texas_reconverted'])
    df = df.set_index('datetime_texas_reconverted').sort_index()

    # Calculate the resampling factor and the number of samples for the new frequency
    resampling_factor = new_freq / original_freq
    n_samples = int(len(df) * resampling_factor)

    # Resample the EDA data using scipy's resample function
    resampled_eda = resample(df['EDA'].values, n_samples)

    # Downsample or upsample FMS without interpolation
    if resampling_factor > 1:
        # Upsample: repeat FMS values to match new frequency
        resampled_fms = df['FMS'].values.repeat(int(resampling_factor))[:n_samples]
    else:
        # Downsample: select every nth value based on resampling factor
        resampled_fms = df['FMS'].values[::int(1 / resampling_factor)]

    # Handle case where length of resampled FMS is less than n_samples due to rounding
    if len(resampled_fms) < n_samples:
        resampled_fms = pd.Series(resampled_fms).reindex(range(n_samples), method='ffill').values

    # Assign the mode of the participant column
    participant = df['participant'].mode()[0]

    # Create a new datetime index for the resampled data
    resampled_time_index = pd.date_range(start=df.index.min(),
                                         end=df.index.max(),
                                         periods=n_samples)

    # Create a new DataFrame with the resampled data
    df_resampled = pd.DataFrame({'EDA': resampled_eda, 'FMS': resampled_fms, 'participant': participant}, index=resampled_time_index)

    return df_resampled


In [7]:
Simulations = ['Walk', 'Room', 'Roller', 'Sea', 'Beach']
original_freq_EDA = 4
new_freq = 1
all_eda=[]
for sim in Simulations:
  EDA_files=glob.glob('/content/Shovons_data/{}/Processed_EDA/*'.format(sim))
  for f in EDA_files:
    try:
      EDA=pd.read_csv(f)
      resampled_EDA=resample_EDA(EDA,original_freq_EDA,new_freq)
      resampled_EDA['simulation']=sim
      resampled_EDA['datetime']=resampled_EDA.index
      resampled_EDA=resampled_EDA.reset_index(drop=True)
      all_eda.append(resampled_EDA)
    except:pass


all_EDA_df = pd.concat(all_eda, ignore_index=True)

all_EDA_df

Unnamed: 0,EDA,FMS,participant,simulation,datetime
0,4.404642,0.0,18,Walk,2021-03-15 12:19:55
1,3.587514,0.0,18,Walk,2021-03-15 12:19:56
2,3.808593,0.0,18,Walk,2021-03-15 12:19:57
3,3.652929,0.0,18,Walk,2021-03-15 12:19:58
4,3.754818,0.0,18,Walk,2021-03-15 12:19:59
...,...,...,...,...,...
44019,4.508173,0.0,2,Beach,2021-02-03 15:54:19
44020,4.322579,0.0,2,Beach,2021-02-03 15:54:20
44021,4.505687,0.0,2,Beach,2021-02-03 15:54:21
44022,4.522895,0.0,2,Beach,2021-02-03 15:54:22


In [8]:
all_EDA_df.FMS.unique()

array([ 0. ,  1. ,  2. ,  3. ,  6. ,  4. ,  8. ,  5. ,  7. ,  2.5, 10. ])

In [None]:
all_EDA_df.participant.unique()

array([18,  4, 13, 16, 14,  3,  8, 19,  9, 21, 17, 15, 10, 11,  1, 20,  5,
        7, 12,  6, 22, 23,  2, 24])

In [9]:
import pandas as pd
from scipy.signal import resample
import glob

def resample_HR(df, original_freq, new_freq):
    """
    Resample the HR data from the original frequency to the new frequency while keeping FMS and participant columns.

    Parameters:
    df (DataFrame): The input DataFrame containing the HR data with a datetime index.
    original_freq (int): The original frequency in Hz (e.g., 4 for 4 Hz).
    new_freq (int): The new desired frequency in Hz (e.g., 1 for 1 Hz).

    Returns:
    DataFrame: A new DataFrame containing the resampled HR data along with FMS and participant columns.
    """
    # Ensure 'datetime_texas_reconverted' is in datetime format and set it as index
    if not pd.api.types.is_datetime64_any_dtype(df['datetime_texas_reconverted']):
        df['datetime_texas_reconverted'] = pd.to_datetime(df['datetime_texas_reconverted'])
    df = df.set_index('datetime_texas_reconverted').sort_index()

    # Calculate the resampling factor and the number of samples for the new frequency
    resampling_factor = new_freq / original_freq
    n_samples = int(len(df) * resampling_factor)

    # Resample the HR data using scipy's resample function
    resampled_hr = resample(df['HR'].values, n_samples)

    # Downsample or upsample FMS without interpolation
    if resampling_factor > 1:
        # Upsample: repeat FMS values to match the new frequency
        resampled_fms = df['FMS'].values.repeat(int(resampling_factor))[:n_samples]
    else:
        # Downsample: select every nth value based on the resampling factor
        resampled_fms = df['FMS'].values[::int(1 / resampling_factor)]

    # Handle case where length of resampled FMS is less than n_samples due to rounding
    if len(resampled_fms) < n_samples:
        resampled_fms = pd.Series(resampled_fms).reindex(range(n_samples), method='ffill').values

    # Retain participant number as the mode or first value (assuming it doesn't change frequently)
    participant = df['participant'].mode()[0]

    # Create a new datetime index for the resampled data
    resampled_time_index = pd.date_range(start=df.index.min(),
                                         end=df.index.max(),
                                         periods=n_samples)

    # Create a new DataFrame with the resampled data
    df_resampled = pd.DataFrame({'HR': resampled_hr, 'FMS': resampled_fms, 'participant': participant}, index=resampled_time_index)

    return df_resampled

# List of simulations
Simulations = ['Walk', 'Room', 'Roller', 'Sea', 'Beach']
original_freq_HR = 1
new_freq = 1
all_hr = []

for sim in Simulations:
    # Get all HR files for the current simulation
    HR_files = glob.glob('/content/Shovons_data/{}/Processed_HR/*'.format(sim))
    for f in HR_files:
        try:
            HR = pd.read_csv(f)
            # Resample HR data
            resampled_HR = resample_HR(HR, original_freq_HR, new_freq)
            # Add simulation and datetime information
            resampled_HR['simulation'] = sim
            resampled_HR['datetime'] = resampled_HR.index
            resampled_HR = resampled_HR.reset_index(drop=True)
            # Add resampled data to the list
            all_hr.append(resampled_HR)
        except Exception as e:
            print(f"Error processing file {f}: {e}")
            pass

# Combine all resampled HR DataFrames into a single DataFrame
all_HR_df = pd.concat(all_hr, ignore_index=True)

print(all_HR_df)


Error processing file /content/Shovons_data/Walk/Processed_HR/25.csv: invalid number of data points (0) specified
Error processing file /content/Shovons_data/Walk/Processed_HR/24.csv: invalid number of data points (0) specified
Error processing file /content/Shovons_data/Room/Processed_HR/16.csv: invalid number of data points (0) specified
Error processing file /content/Shovons_data/Room/Processed_HR/25.csv: invalid number of data points (0) specified
Error processing file /content/Shovons_data/Room/Processed_HR/24.csv: invalid number of data points (0) specified
Error processing file /content/Shovons_data/Room/Processed_HR/7.csv: invalid number of data points (0) specified
Error processing file /content/Shovons_data/Roller/Processed_HR/10.csv: invalid number of data points (0) specified
Error processing file /content/Shovons_data/Roller/Processed_HR/25.csv: invalid number of data points (0) specified
Error processing file /content/Shovons_data/Roller/Processed_HR/24.csv: invalid numbe

In [14]:
import pandas as pd
from scipy.signal import resample

def resample_eye_tracking(df, original_freq, new_freq):
    """
    Resample the eye-tracking data from the original frequency to the new frequency while keeping FMS and participant columns.

    Parameters:
    df (DataFrame): The input DataFrame containing the eye-tracking data with a datetime index.
    original_freq (int): The original frequency in Hz (e.g., 4 for 4 Hz).
    new_freq (int): The new desired frequency in Hz (e.g., 1 for 1 Hz).

    Returns:
    DataFrame: A new DataFrame containing the resampled eye-tracking data along with FMS and participant columns.
    """
    # Ensure 'datetime' is in datetime format and set it as index
    if not pd.api.types.is_datetime64_any_dtype(df['datetime']):
        df['datetime'] = pd.to_datetime(df['datetime'])
    df = df.set_index('datetime').sort_index()

    # Convert boolean columns to integers (0/1)
    bool_columns = df.select_dtypes(include=['bool']).columns
    for col in bool_columns:
        df[col] = df[col].astype(int)

    # Convert any non-numeric columns to numeric, forcing errors to NaN and then filling with 0
    df = df.apply(pd.to_numeric, errors='coerce').fillna(0)

    # Calculate the resampling factor and the number of samples for the new frequency
    resampling_factor = new_freq / original_freq
    n_samples = int(len(df) * resampling_factor)

    # List of columns to be resampled
    eye_tracking_columns = ['ConvergenceValid', 'Convergence_distance', 'Left_Eye_Openness',
                            'Right_Eye_Openness', 'Left_Eye_Closed', 'Right_Eye_Closed',
                            'LeftPupilDiameter', 'RightPupilDiameter', 'LeftPupilPosInSensorX',
                            'LeftPupilPosInSensorY', 'RightPupilPosInSensorX', 'RightPupilPosInSensorY',
                            'LocalGazeValid', 'GazeOriginLclSpc_X', 'GazeOriginLclSpc_Y',
                            'GazeOriginLclSpc_Z', 'GazeDirectionLclSpc_X', 'GazeDirectionLclSpc_Y',
                            'GazeDirectionLclSpc_Z', 'WorldGazeValid', 'GazeOriginWrldSpc_X',
                            'GazeOriginWrldSpc_Y', 'GazeOriginWrldSpc_Z', 'GazeDirectionWrldSpc_X',
                            'GazeDirectionWrldSpc_Y', 'GazeDirectionWrldSpc_Z', 'NrmLeftEyeOriginX',
                            'NrmLeftEyeOriginY', 'NrmLeftEyeOriginZ', 'NrmRightEyeOriginX',
                            'NrmRightEyeOriginY', 'NrmRightEyeOriginZ', 'NrmSRLeftEyeGazeDirX',
                            'NrmSRLeftEyeGazeDirY', 'NrmSRLeftEyeGazeDirZ', 'NrmSRRightEyeGazeDirX',
                            'NrmSRRightEyeGazeDirY', 'NrmSRRightEyeGazeDirZ']

    # Resample the eye-tracking data columns
    resampled_data = {}
    for col in eye_tracking_columns:
        if col in df.columns:
            resampled_data[col] = resample(df[col].values, n_samples)

    # Resample FMS using simple interpolation
    if 'FMS' in df.columns:
        resampled_data['FMS'] = resample(df['FMS'].values, n_samples)

    # Retain participant number as the mode or first value
    participant = df['participant'].mode()[0] if 'participant' in df.columns else None

    # Create a new datetime index for the resampled data
    resampled_time_index = pd.date_range(start=df.index.min(), end=df.index.max(), periods=n_samples)

    # Create a new DataFrame with the resampled data
    df_resampled = pd.DataFrame(resampled_data, index=resampled_time_index)
    if participant is not None:
        df_resampled['participant'] = participant

    return df_resampled




In [15]:
import pandas as pd
import glob
from scipy.signal import resample

# List of simulations
Simulations = ['Walk', 'Room', 'Roller', 'Sea', 'Beach']
original_freq_eye_tracking = 25  # Replace with actual original frequency of eye-tracking data
new_freq = 1
all_eye_tracking = []

for sim in Simulations:
    # Get all eye-tracking files for the current simulation
    eye_tracking_files = glob.glob('/content/Shovons_data/{}/Processed_Eye_Data/*'.format(sim))
    for f in eye_tracking_files:
        try:
            # Read eye-tracking file
            eye_tracking_data = pd.read_csv(f)
            # Resample eye-tracking data
            resampled_eye_tracking = resample_eye_tracking(eye_tracking_data, original_freq_eye_tracking, new_freq)
            # Add simulation and datetime information
            resampled_eye_tracking['simulation'] = sim
            resampled_eye_tracking['datetime'] = resampled_eye_tracking.index
            resampled_eye_tracking = resampled_eye_tracking.reset_index(drop=True)
            # Add resampled data to the list
            all_eye_tracking.append(resampled_eye_tracking)
        except Exception as e:
            print(f"Error processing file {f}: {e}")
            pass

# # Combine all resampled eye-tracking DataFrames into a single DataFrame
all_eye_tracking_df = pd.concat(all_eye_tracking, ignore_index=True)

all_eye_tracking_df


Unnamed: 0,ConvergenceValid,Convergence_distance,Left_Eye_Openness,Right_Eye_Openness,Left_Eye_Closed,Right_Eye_Closed,LeftPupilDiameter,RightPupilDiameter,LeftPupilPosInSensorX,LeftPupilPosInSensorY,...,NrmSRLeftEyeGazeDirX,NrmSRLeftEyeGazeDirY,NrmSRLeftEyeGazeDirZ,NrmSRRightEyeGazeDirX,NrmSRRightEyeGazeDirY,NrmSRRightEyeGazeDirZ,FMS,participant,simulation,datetime
0,0.0,1.785024,0.999987,0.979293,0.0,0.0,3.624418,3.609259,0.592792,0.643425,...,0.023150,0.015225,1.015387,0.027961,0.035547,0.990978,0.0,14.0,Walk,2021-03-11 04:50:21.127000000
1,0.0,2.789748,0.876244,0.999467,0.0,0.0,3.084810,3.277388,0.585758,0.631885,...,0.032730,-0.188671,0.938829,0.055192,-0.193261,0.964752,0.0,14.0,Walk,2021-03-11 04:50:22.079832199
2,0.0,1.254099,0.749887,0.925947,0.0,0.0,3.752556,3.833147,0.620758,0.750059,...,-0.032861,-0.449547,0.909447,0.000874,-0.433422,0.888901,0.0,14.0,Walk,2021-03-11 04:50:23.032664399
3,0.0,2.905156,0.875379,0.959759,0.0,0.0,3.513752,3.844097,0.576725,0.607670,...,0.058332,-0.066038,0.963669,0.073355,-0.070336,0.990794,0.0,14.0,Walk,2021-03-11 04:50:23.985496598
4,0.0,1.328554,0.605836,0.565532,0.0,0.0,3.788993,3.736045,0.624640,0.850274,...,-0.067822,-0.535419,0.882186,-0.031532,-0.516275,0.822130,0.0,14.0,Walk,2021-03-11 04:50:24.938328798
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52260,0.0,1.796729,1.002228,0.929566,0.0,0.0,3.764875,3.851752,0.575092,0.512146,...,-0.040617,-0.043211,1.007307,0.000806,-0.049062,1.014705,0.0,21.0,Beach,2021-03-19 04:58:45.012555555
52261,0.0,2.259568,0.980277,1.013608,0.0,0.0,3.659034,3.704881,0.546072,0.523638,...,0.055826,-0.065064,0.993041,0.052835,-0.043429,0.980186,0.0,21.0,Beach,2021-03-19 04:58:46.118166666
52262,0.0,1.988414,1.016704,0.995685,0.0,0.0,3.904801,3.948334,0.567938,0.517975,...,-0.013130,-0.043508,0.990765,0.010779,-0.048522,1.011565,0.0,21.0,Beach,2021-03-19 04:58:47.223777777
52263,0.0,2.003546,0.913074,0.752881,0.0,0.0,3.677037,3.804631,0.563355,0.537656,...,-0.004083,-0.116024,0.951773,0.027435,-0.119746,0.974311,0.0,21.0,Beach,2021-03-19 04:58:48.329388888


In [16]:
def convert_datetime_format(df, datetime_column):
    """
    Convert datetime strings with nanoseconds to standard datetime format, and convert 12-hour time to 24-hour format.

    Parameters:
    df (DataFrame): The DataFrame with the datetime column.
    datetime_column (str): Name of the column with datetime strings.

    Returns:
    DataFrame: DataFrame with the datetime column in the standard 24-hour format.
    """
    # Convert the datetime column to pandas datetime object with 12-hour format
    df[datetime_column] = pd.to_datetime(df[datetime_column], format='%Y-%m-%d %I:%M:%S.%f%p', errors='coerce')

    # Format the datetime object to the desired 24-hour format
    df[datetime_column] = df[datetime_column].dt.strftime('%Y-%m-%d %H:%M:%S')

    return df

def sort_dataframe(df):

    required_columns = ['datetime', 'simulation', 'participant']
    for col in required_columns:
        if col not in df.columns:
            raise ValueError(f"Missing required column: {col}")


    sorted_df = df.sort_values(by=['datetime', 'simulation', 'participant'])

    return sorted_df
def convert_datetime_to_12h_format(df, datetime_column):


    if not pd.api.types.is_datetime64_any_dtype(df[datetime_column]):
        df[datetime_column] = pd.to_datetime(df[datetime_column])


    df[datetime_column] = df[datetime_column].dt.strftime('%Y-%m-%d %I:%M:%S')

    return df

all_EDA_df = convert_datetime_to_12h_format(all_EDA_df, 'datetime')
all_HR_df = convert_datetime_to_12h_format(all_HR_df, 'datetime')

all_EDA_df['datetime'] = pd.to_datetime(all_EDA_df['datetime'])
all_HR_df['datetime'] = pd.to_datetime(all_HR_df['datetime'])
all_eye_tracking_df = convert_datetime_format(all_eye_tracking_df, 'datetime')
all_eye_tracking_df['datetime'] = pd.to_datetime(all_eye_tracking_df['datetime'])


all_HR_sorted = sort_dataframe(all_HR_df)
all_EDA_sorted = sort_dataframe(all_EDA_df)
all_eye_tracking_sorted = sort_dataframe(all_eye_tracking_df)


merged_df = all_HR_sorted.merge(all_EDA_sorted, on=['datetime', 'simulation', 'participant'], how='inner')
merged_df = merged_df.merge(all_eye_tracking_sorted, on=['datetime', 'simulation', 'participant'], how='inner')


merged_df = merged_df.reset_index(drop=True)

merged_df


Unnamed: 0,HR,FMS_x,participant,simulation,datetime,EDA,FMS_y,ConvergenceValid,Convergence_distance,Left_Eye_Openness,...,NrmRightEyeOriginX,NrmRightEyeOriginY,NrmRightEyeOriginZ,NrmSRLeftEyeGazeDirX,NrmSRLeftEyeGazeDirY,NrmSRLeftEyeGazeDirZ,NrmSRRightEyeGazeDirX,NrmSRRightEyeGazeDirY,NrmSRRightEyeGazeDirZ,FMS
0,87.17,0.0,3,Room,2021-01-31 03:56:05,0.489113,0.0,0.0,1.627729,0.824085,...,-26.162909,0.332944,-21.201295,0.061963,-0.174169,0.928406,0.080992,-0.176389,0.926115,0.0
1,87.08,0.0,3,Room,2021-01-31 03:56:06,0.500523,0.0,0.0,1.401985,0.888882,...,-27.426396,1.313084,-23.283278,-0.048401,-0.001604,0.880168,-0.017797,-0.020057,0.879527,0.0
2,87.00,0.0,3,Room,2021-01-31 03:56:07,0.489022,0.0,0.0,1.216078,0.948594,...,-26.285001,1.056972,-22.401003,0.056996,-0.046641,0.940760,0.087532,-0.059682,0.940018,0.0
3,86.92,0.0,3,Room,2021-01-31 03:56:08,0.505631,0.0,0.0,1.797394,0.844059,...,-27.315629,1.796468,-23.063976,-0.067061,0.082149,0.814352,-0.059380,0.071786,0.810093,0.0
4,86.82,0.0,3,Room,2021-01-31 03:56:09,0.504639,0.0,0.0,1.915843,1.054958,...,-27.330424,1.484339,-21.848875,-0.060468,0.100985,1.041450,-0.077686,0.105758,1.043533,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46980,75.90,0.0,11,Room,2021-04-06 08:18:21,0.463588,0.0,0.0,1.552775,0.936591,...,-36.678450,-1.242440,-30.126566,-0.125735,-0.163732,0.875975,-0.120102,-0.196168,0.904884,0.0
46981,75.90,0.0,11,Room,2021-04-06 08:18:21,0.463588,0.0,0.0,1.352621,1.007385,...,-37.015291,-1.086570,-29.754313,-0.154191,-0.046181,1.006536,-0.135931,-0.063572,0.998483,0.0
46982,75.90,0.0,11,Room,2021-04-06 08:18:22,0.463283,0.0,0.0,1.578042,0.990420,...,-36.500736,-0.491942,-29.835181,-0.074450,0.013121,0.979229,-0.041406,-0.016397,0.987175,0.0
46983,75.90,0.0,11,Room,2021-04-06 08:18:23,0.465536,0.0,0.0,1.106370,1.003148,...,-36.441868,-0.977157,-30.436562,-0.097231,-0.187593,0.990074,-0.027457,-0.191489,0.987408,0.0


In [20]:
merged_df.to_csv('/content/drive/MyDrive/Save/merged_data_all_participants.csv', index=False)
