In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from datetime import datetime,date
from scipy.signal import resample
import numpy as np
import glob
import re
import os


In [3]:
HR_files=glob.glob(r"C:\Users\purna\OneDrive\Desktop\Cybersickness_Shovon\PhysiologicalData\Physiological Data\Empatic E4 Raw Data\*\HR.csv")
EDA_files=glob.glob(r"C:\Users\purna\OneDrive\Desktop\Cybersickness_Shovon\PhysiologicalData\Physiological Data\Empatic E4 Raw Data\*\EDA.csv")

In [4]:
def extract_participant_number(file_path):
    # Extract the folder name from the file path
    folder_name = os.path.basename(os.path.dirname(file_path))
    
    # Use regex to find the participant number in the folder name
    match = re.search(r'\d+', folder_name)
    if match:
        return match.group()
    return None

# Extract participant numbers from HR and EDA files
HR_participant_numbers = [extract_participant_number(path) for path in HR_files]
EDA_participant_numbers = [extract_participant_number(path) for path in EDA_files]

# Print results
print("HR Files Participant Numbers:", HR_participant_numbers)
print("EDA Files Participant Numbers:", EDA_participant_numbers)

HR Files Participant Numbers: ['1', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '2', '20', '21', '22', '23', '24', '24', '25', '25', '26', '27', '3', '4', '5', '6', '7', '8', '9']
EDA Files Participant Numbers: ['1', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '2', '20', '21', '22', '23', '24', '24', '25', '25', '26', '27', '3', '4', '5', '6', '7', '8', '9']


process HR data and convert utc to texas time

In [5]:
import pandas as pd
from datetime import datetime, timedelta
import pytz

def process_hr_data(df):
    # Define timezone information
    utc_zone = pytz.utc
    texas_zone = pytz.timezone('America/Chicago')  # Texas time zone

    start_timestamp = df.iloc[0, 0]  # Starting timestamp
    sampling_frequency = df.iloc[1, 0]  # Sampling frequency

    # Convert the starting timestamp to a datetime object
    start_time = datetime.utcfromtimestamp(start_timestamp)
    start_time = utc_zone.localize(start_time)  # Localize to UTC

    # Calculate the number of samples
    num_samples = len(df) - 2  # Subtract 2 for the first two rows

    # Create a new DataFrame excluding the first two rows and resetting the index
    df = df.iloc[2:].reset_index(drop=True)
    df.columns = ['HR', 'participant']  # Rename columns

    # Generate datetime values based on sampling frequency
    datetimes = []
    for i in range(num_samples):
        # Calculate the timestamp for each row
        current_time = start_time + timedelta(seconds=(1/sampling_frequency) * i)
        datetimes.append(current_time)

    # Add the datetime column to the DataFrame
    df['datetime_utc'] = pd.Series(datetimes)

    # Convert UTC times to Texas time
    df['datetime_texas'] = df['datetime_utc'].apply(lambda x: x.astimezone(texas_zone))

    # Format the Texas datetime as a string
    df['datetime_texas_str'] = df['datetime_texas'].apply(lambda x: x.strftime('%Y-%m-%d %H:%M:%S'))

    # Convert the string back to datetime
    df['datetime_texas_reconverted'] = pd.to_datetime(df['datetime_texas_str'], format='%Y-%m-%d %H:%M:%S')

    return df


# i = 0
# HR_participant1 = pd.read_csv(HR_files[i], header=None)
# HR_participant1['participant'] = HR_participant_numbers[i]
# processed_df = process_hr_data(HR_participant1)
# processed_df


gather all HR data FMS for this participant

In [6]:
import pandas as pd
import numpy as np

def preprocess_fms_data(file_path, participant_num):
    # Load the CSV file into a DataFrame
    all_fms = pd.read_csv(file_path)
    
    # Ensure 'Participants Code' is treated as numeric
    all_fms['Participants Code'] = pd.to_numeric(all_fms['Participants Code'], errors='coerce')

    # Find the start index of the desired participant number
    start_index = all_fms[all_fms['Participants Code'] == participant_num].index.min()

    # Check if participant number is found
    if pd.isna(start_index):
        raise ValueError(f'Participant number {participant_num} not found in the file.')

    # Determine the end index for the 14 rows after the start
    end_index = start_index + 13

    # Slice the DataFrame from start_index to end_index
    all_fms = all_fms.iloc[start_index:end_index].reset_index(drop=True)

    # Convert 'Time ' column to datetime
    all_fms['DateTime'] = pd.to_datetime(all_fms['Time '], format='%Y.%m.%d %H:%M:%S:%f', errors='coerce')

    # Extract Date and Time into separate columns
    all_fms['Date'] = all_fms['DateTime'].dt.date
    all_fms['Time'] = all_fms['DateTime'].dt.strftime('%H:%M:%S.%f')  # 24-hour format

    # Drop the original 'Time ' column
    all_fms.drop(['Time '], axis=1, inplace=True)

    # Combine Date and Time into a single datetime column
    all_fms['DateTime'] = pd.to_datetime(all_fms['Date'].astype(str) + ' ' + all_fms['Time'])

    # Drop the intermediate Date and Time columns
    all_fms.drop(['Date', 'Time'], axis=1, inplace=True)

    return all_fms

# Example usage:
file_path = r"C:\Users\purna\OneDrive\Desktop\Cybersickness_Shovon\Room\Anonymous_FMS_All.csv"
processed_fms = preprocess_fms_data(file_path, 1)
processed_fms


Unnamed: 0,Participants Code,Frame,FMS,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,DateTime
0,1.0,1998.0,0.0,,,,,2021-02-03 11:43:58.621
1,,2966.0,0.0,,,,,2021-02-03 11:44:30.629
2,,3959.0,0.0,,,,,2021-02-03 11:45:02.668
3,,4947.0,0.0,,,,,2021-02-03 11:45:34.665
4,,5961.0,0.0,,,,,2021-02-03 11:46:06.681
5,,7022.0,0.0,,,,,2021-02-03 11:46:38.669
6,,8071.0,0.0,,,,,2021-02-03 11:47:10.691
7,,9037.0,0.0,,,,,2021-02-03 11:47:42.694
8,,9999.0,0.0,,,,,2021-02-03 11:48:14.716
9,,10990.0,0.0,,,,,2021-02-03 11:48:46.732


In [7]:
import pandas as pd

def match_with_fms(HR, fms_p1):
    # Ensure 'datetime_texas_reconverted' in HR is a datetime object
    HR['datetime_texas_reconverted'] = pd.to_datetime(HR['datetime_texas_reconverted'])

    # Ensure 'DateTime' in fms_p1 is a datetime object
    fms_p1['DateTime'] = pd.to_datetime(fms_p1['DateTime'])

    # Initialize 'FMS' column in HR DataFrame
    HR['FMS'] = None

    # Iterate over the FMS data and assign FMS values to HR data within time range
    for i in range(fms_p1.shape[0] - 1):  # Use -1 to avoid out-of-bounds error
        start = fms_p1['DateTime'].iloc[i]
        end = fms_p1['DateTime'].iloc[i + 1]

        # Match HR data within the time range
        mask = (HR['datetime_texas_reconverted'] >= (start - pd.Timedelta(seconds=31))) & \
               (HR['datetime_texas_reconverted'] <= (start + pd.Timedelta(seconds=31)))
        
        # Assign FMS value based on mask
        HR.loc[mask, 'FMS'] = fms_p1['FMS'].iloc[i]

    # Filter the HR DataFrame to return only rows where 'FMS' is not None
    HR_filtered = HR[HR['FMS'].notna()]

    return HR_filtered, fms_p1



In [13]:
import os
import pandas as pd



Simulations = ['Walk', 'Room', 'Roller', 'Sea', 'Beach']
HR_participant_numbers = [extract_participant_number(path) for path in HR_files]
EDA_participant_numbers = [extract_participant_number(path) for path in EDA_files]

# Base path for saving data
base_save_path = r"D:\Save Lab\Shovons_data"

for simulation in Simulations:
    file_path = r"C:\Users\purna\OneDrive\Desktop\Cybersickness_Shovon\{}\Anonymous_FMS_All.csv".format(simulation)
    
    for participant in HR_participant_numbers:  # Loop through a subset of participants (just the first in this case)
        try:
            # Load and process HR data for the participant
            hr_file_path = r"C:\Users\purna\OneDrive\Desktop\Cybersickness_Shovon\PhysiologicalData\Physiological Data\Empatic E4 Raw Data\{}\HR.csv".format(participant)
            HR_participant1 = pd.read_csv(hr_file_path, header=None)
            HR_participant1['participant'] = participant
            processed_HR = process_hr_data(HR_participant1)

            # Load and preprocess FMS data
            df = preprocess_fms_data(file_path, int(participant))
            updated_HR, fms_p1 = match_with_fms(processed_HR, df)
            

            # Filter to keep only rows where 'FMS' is not None
            updated_HR_filtered = updated_HR[updated_HR['FMS'].notna()]
            print(updated_HR_filtered.shape)
            print(np.unique(updated_HR_filtered.FMS))
            # Create the directory if it doesn't exist
            save_dir = os.path.join(base_save_path, simulation, "Processed_HR")
            os.makedirs(save_dir, exist_ok=True)

            # Save the updated HR DataFrame to a CSV file
            save_file_path = os.path.join(save_dir, f"{participant}.csv")
            updated_HR_filtered.to_csv(save_file_path, index=False)

            print(f"Saved updated HR data for participant {participant} in {simulation}")

        except ValueError as ve:
            print(f"ValueError: {ve}")
        except Exception as e:
            print(f"An error occurred: {e}")


(414, 7)
[1.0 2.0]
Saved updated HR data for participant 1 in Walk
(414, 7)
[0.0 1.0]
Saved updated HR data for participant 10 in Walk
(414, 7)
[0.0 1.0]
Saved updated HR data for participant 11 in Walk
(414, 7)
[0.0 2.0]
Saved updated HR data for participant 12 in Walk
(414, 7)
[1.0 2.0]
Saved updated HR data for participant 13 in Walk
(414, 7)
[0.0]
Saved updated HR data for participant 14 in Walk
(414, 7)
[0.0]
Saved updated HR data for participant 15 in Walk
(414, 7)
[0.0]
Saved updated HR data for participant 16 in Walk
(414, 7)
[0.0]
Saved updated HR data for participant 17 in Walk
(318, 7)
[0.0]
Saved updated HR data for participant 18 in Walk
(414, 7)
[0.0]
Saved updated HR data for participant 19 in Walk
(414, 7)
[0.0 1.0]
Saved updated HR data for participant 2 in Walk
(414, 7)
[0.0]
Saved updated HR data for participant 20 in Walk
(414, 7)
[0.0 1.0 2.0]
Saved updated HR data for participant 21 in Walk
(414, 7)
[0.0 1.0 2.0 3.0]
Saved updated HR data for participant 22 in Wal

For the 24_0,24_1 files


In [11]:
import os
import pandas as pd



Simulations = ['Walk', 'Room', 'Roller', 'Sea', 'Beach']
HR_participant_numbers = [extract_participant_number(path) for path in HR_files]
# EDA_participant_numbers = [extract_participant_number(path) for path in EDA_files]

# Base path for saving data
base_save_path = r"D:\Save Lab\Shovons_data"

for simulation in Simulations:
    file_path = r"C:\Users\purna\OneDrive\Desktop\Cybersickness_Shovon\{}\Anonymous_FMS_All.csv".format(simulation)
    
    for participant in HR_participant_numbers:  # Loop through a subset of participants (just the first in this case)
        try:
            # Load and process HR data for the participant
            hr_file_path = r"C:\Users\purna\OneDrive\Desktop\Cybersickness_Shovon\PhysiologicalData\Physiological Data\Empatic E4 Raw Data\{}_{}\HR.csv".format(participant,0)
            HR_participant1 = pd.read_csv(hr_file_path, header=None)
            HR_participant1['participant'] = participant
            processed_HR = process_hr_data(HR_participant1)

            # Load and preprocess FMS data
            df = preprocess_fms_data(file_path, int(participant))
            updated_HR, fms_p1 = match_with_fms(processed_HR, df)

            # Filter to keep only rows where 'FMS' is not None
            updated_HR_filtered = updated_HR[updated_HR['FMS'].notna()]

            # Create the directory if it doesn't exist
            save_dir = os.path.join(base_save_path, simulation, "Processed_HR")
            os.makedirs(save_dir, exist_ok=True)

            # Save the updated HR DataFrame to a CSV file
            save_file_path = os.path.join(save_dir, f"{participant}.csv")
            updated_HR_filtered.to_csv(save_file_path, index=False)

            print(f"Saved updated HR data for participant {participant} in {simulation}")

        except ValueError as ve:
            print(f"ValueError: {ve}")
        except Exception as e:
            print(f"An error occurred: {e}")


An error occurred: [Errno 2] No such file or directory: 'C:\\Users\\purna\\OneDrive\\Desktop\\Cybersickness_Shovon\\PhysiologicalData\\Physiological Data\\Empatic E4 Raw Data\\1_0\\HR.csv'
An error occurred: [Errno 2] No such file or directory: 'C:\\Users\\purna\\OneDrive\\Desktop\\Cybersickness_Shovon\\PhysiologicalData\\Physiological Data\\Empatic E4 Raw Data\\10_0\\HR.csv'
An error occurred: [Errno 2] No such file or directory: 'C:\\Users\\purna\\OneDrive\\Desktop\\Cybersickness_Shovon\\PhysiologicalData\\Physiological Data\\Empatic E4 Raw Data\\11_0\\HR.csv'
An error occurred: [Errno 2] No such file or directory: 'C:\\Users\\purna\\OneDrive\\Desktop\\Cybersickness_Shovon\\PhysiologicalData\\Physiological Data\\Empatic E4 Raw Data\\12_0\\HR.csv'
An error occurred: [Errno 2] No such file or directory: 'C:\\Users\\purna\\OneDrive\\Desktop\\Cybersickness_Shovon\\PhysiologicalData\\Physiological Data\\Empatic E4 Raw Data\\13_0\\HR.csv'
An error occurred: [Errno 2] No such file or direct

For EDA

In [16]:
import pandas as pd
from datetime import datetime, timedelta
import pytz

def process_EDA_data(df):
    # Define timezone information
    utc_zone = pytz.utc
    texas_zone = pytz.timezone('America/Chicago')  # Texas time zone

    start_timestamp = df.iloc[0, 0]  # Starting timestamp
    sampling_frequency = df.iloc[1, 0]  # Sampling frequency

    # Convert the starting timestamp to a datetime object
    start_time = datetime.utcfromtimestamp(start_timestamp)
    start_time = utc_zone.localize(start_time)  # Localize to UTC

    # Calculate the number of samples
    num_samples = len(df) - 2  # Subtract 2 for the first two rows

    # Create a new DataFrame excluding the first two rows and resetting the index
    df = df.iloc[2:].reset_index(drop=True)
    df.columns = ['EDA', 'participant']  # Rename columns

    # Generate datetime values based on sampling frequency
    datetimes = []
    for i in range(num_samples):
        # Calculate the timestamp for each row
        current_time = start_time + timedelta(seconds=(1/sampling_frequency) * i)
        datetimes.append(current_time)

    # Add the datetime column to the DataFrame
    df['datetime_utc'] = pd.Series(datetimes)

    # Convert UTC times to Texas time
    df['datetime_texas'] = df['datetime_utc'].apply(lambda x: x.astimezone(texas_zone))

    # Format the Texas datetime as a string
    df['datetime_texas_str'] = df['datetime_texas'].apply(lambda x: x.strftime('%Y-%m-%d %H:%M:%S'))

    # Convert the string back to datetime
    df['datetime_texas_reconverted'] = pd.to_datetime(df['datetime_texas_str'], format='%Y-%m-%d %H:%M:%S')

    return df


i = 0
EDA_participant1 = pd.read_csv(EDA_files[i], header=None)
EDA_participant1['participant'] = HR_participant_numbers[i]
processed_df = process_EDA_data(EDA_participant1 )
processed_df


Unnamed: 0,EDA,participant,datetime_utc,datetime_texas,datetime_texas_str,datetime_texas_reconverted
0,0.000000,1,2021-02-03 17:12:37+00:00,2021-02-03 11:12:37-06:00,2021-02-03 11:12:37,2021-02-03 11:12:37
1,0.830302,1,2021-02-03 17:12:37.250000+00:00,2021-02-03 11:12:37.250000-06:00,2021-02-03 11:12:37,2021-02-03 11:12:37
2,1.177810,1,2021-02-03 17:12:37.500000+00:00,2021-02-03 11:12:37.500000-06:00,2021-02-03 11:12:37,2021-02-03 11:12:37
3,1.616094,1,2021-02-03 17:12:37.750000+00:00,2021-02-03 11:12:37.750000-06:00,2021-02-03 11:12:37,2021-02-03 11:12:37
4,2.094130,1,2021-02-03 17:12:38+00:00,2021-02-03 11:12:38-06:00,2021-02-03 11:12:38,2021-02-03 11:12:38
...,...,...,...,...,...,...
22423,6.679711,1,2021-02-03 18:46:02.750000+00:00,2021-02-03 12:46:02.750000-06:00,2021-02-03 12:46:02,2021-02-03 12:46:02
22424,6.668179,1,2021-02-03 18:46:03+00:00,2021-02-03 12:46:03-06:00,2021-02-03 12:46:03,2021-02-03 12:46:03
22425,6.654085,1,2021-02-03 18:46:03.250000+00:00,2021-02-03 12:46:03.250000-06:00,2021-02-03 12:46:03,2021-02-03 12:46:03
22426,6.655366,1,2021-02-03 18:46:03.500000+00:00,2021-02-03 12:46:03.500000-06:00,2021-02-03 12:46:03,2021-02-03 12:46:03


In [18]:
#do the same thing for EDA
import os
import pandas as pd



Simulations = ['Walk', 'Room', 'Roller', 'Sea', 'Beach']
HR_participant_numbers = [extract_participant_number(path) for path in HR_files]
EDA_participant_numbers = [extract_participant_number(path) for path in EDA_files]

# Base path for saving data
base_save_path = r"D:\Save Lab\Shovons_data"

for simulation in Simulations:
    file_path = r"C:\Users\purna\OneDrive\Desktop\Cybersickness_Shovon\{}\Anonymous_FMS_All.csv".format(simulation)
    
    for participant in EDA_participant_numbers:  # Loop through a subset of participants (just the first in this case)
        try:
            # Load and process HR data for the participant
            hr_file_path = r"C:\Users\purna\OneDrive\Desktop\Cybersickness_Shovon\PhysiologicalData\Physiological Data\Empatic E4 Raw Data\{}\EDA.csv".format(participant)
            HR_participant1 = pd.read_csv(hr_file_path, header=None)
            HR_participant1['participant'] = participant
            processed_HR = process_EDA_data(HR_participant1)

            # Load and preprocess FMS data
            df = preprocess_fms_data(file_path, int(participant))
            updated_HR, fms_p1 = match_with_fms(processed_HR, df)
            

            # Filter to keep only rows where 'FMS' is not None
            updated_HR_filtered = updated_HR[updated_HR['FMS'].notna()]
            print(updated_HR_filtered.shape)
            print(np.unique(updated_HR_filtered.FMS))
            # Create the directory if it doesn't exist
            save_dir = os.path.join(base_save_path, simulation, "Processed_EDA")
            os.makedirs(save_dir, exist_ok=True)

            # Save the updated HR DataFrame to a CSV file
            save_file_path = os.path.join(save_dir, f"{participant}.csv")
            updated_HR_filtered.to_csv(save_file_path, index=False)

            print(f"Saved updated EDA data for participant {participant} in {simulation}")

        except ValueError as ve:
            print(f"ValueError: {ve}")
        except Exception as e:
            print(f"An error occurred: {e}")


(1656, 7)
[1.0 2.0]
Saved updated HR data for participant 1 in Walk
(1656, 7)
[0.0 1.0]
Saved updated HR data for participant 10 in Walk
(1656, 7)
[0.0 1.0]
Saved updated HR data for participant 11 in Walk
(1656, 7)
[0.0 2.0]
Saved updated HR data for participant 12 in Walk
(1656, 7)
[1.0 2.0]
Saved updated HR data for participant 13 in Walk
(1656, 7)
[0.0]
Saved updated HR data for participant 14 in Walk
(1656, 7)
[0.0]
Saved updated HR data for participant 15 in Walk
(1656, 7)
[0.0]
Saved updated HR data for participant 16 in Walk
(1656, 7)
[0.0]
Saved updated HR data for participant 17 in Walk
(1272, 7)
[0.0]
Saved updated HR data for participant 18 in Walk
(1656, 7)
[0.0]
Saved updated HR data for participant 19 in Walk
(1656, 7)
[0.0 1.0]
Saved updated HR data for participant 2 in Walk
(1656, 7)
[0.0]
Saved updated HR data for participant 20 in Walk
(1656, 7)
[0.0 1.0 2.0]
Saved updated HR data for participant 21 in Walk
(1656, 7)
[0.0 1.0 2.0 3.0]
Saved updated HR data for partic

In [20]:
#do the same thing for EDA
import os
import pandas as pd



Simulations = ['Walk', 'Room', 'Roller', 'Sea', 'Beach']
HR_participant_numbers = [extract_participant_number(path) for path in HR_files]
EDA_participant_numbers = [extract_participant_number(path) for path in EDA_files]

# Base path for saving data
base_save_path = r"D:\Save Lab\Shovons_data"

for simulation in Simulations:
    file_path = r"C:\Users\purna\OneDrive\Desktop\Cybersickness_Shovon\{}\Anonymous_FMS_All.csv".format(simulation)
    
    for participant in EDA_participant_numbers:  # Loop through a subset of participants (just the first in this case)
        try:
            # Load and process HR data for the participant
            hr_file_path = r"C:\Users\purna\OneDrive\Desktop\Cybersickness_Shovon\PhysiologicalData\Physiological Data\Empatic E4 Raw Data\{}_1\EDA.csv".format(participant)
            HR_participant1 = pd.read_csv(hr_file_path, header=None)
            HR_participant1['participant'] = participant
            processed_HR = process_EDA_data(HR_participant1)

            # Load and preprocess FMS data
            df = preprocess_fms_data(file_path, int(participant))
            updated_HR, fms_p1 = match_with_fms(processed_HR, df)
            

            # Filter to keep only rows where 'FMS' is not None
            updated_HR_filtered = updated_HR[updated_HR['FMS'].notna()]
            print(updated_HR_filtered.shape)
            print(np.unique(updated_HR_filtered.FMS))
            # Create the directory if it doesn't exist
            save_dir = os.path.join(base_save_path, simulation, "Processed_EDA")
            os.makedirs(save_dir, exist_ok=True)

            # Save the updated HR DataFrame to a CSV file
            save_file_path = os.path.join(save_dir, f"{participant}.csv")
            updated_HR_filtered.to_csv(save_file_path, index=False)

            print(f"Saved updated EDA data for participant {participant} in {simulation}")

        except ValueError as ve:
            print(f"ValueError: {ve}")
        except Exception as e:
            print(f"An error occurred: {e}")


An error occurred: [Errno 2] No such file or directory: 'C:\\Users\\purna\\OneDrive\\Desktop\\Cybersickness_Shovon\\PhysiologicalData\\Physiological Data\\Empatic E4 Raw Data\\1_1\\EDA.csv'
An error occurred: [Errno 2] No such file or directory: 'C:\\Users\\purna\\OneDrive\\Desktop\\Cybersickness_Shovon\\PhysiologicalData\\Physiological Data\\Empatic E4 Raw Data\\10_1\\EDA.csv'
An error occurred: [Errno 2] No such file or directory: 'C:\\Users\\purna\\OneDrive\\Desktop\\Cybersickness_Shovon\\PhysiologicalData\\Physiological Data\\Empatic E4 Raw Data\\11_1\\EDA.csv'
An error occurred: [Errno 2] No such file or directory: 'C:\\Users\\purna\\OneDrive\\Desktop\\Cybersickness_Shovon\\PhysiologicalData\\Physiological Data\\Empatic E4 Raw Data\\12_1\\EDA.csv'
An error occurred: [Errno 2] No such file or directory: 'C:\\Users\\purna\\OneDrive\\Desktop\\Cybersickness_Shovon\\PhysiologicalData\\Physiological Data\\Empatic E4 Raw Data\\13_1\\EDA.csv'
An error occurred: [Errno 2] No such file or d