Date Started: 2021-08-23

# Pre-Processing for Virage Experiment
## Copied from MainClassifier


In [8]:
import sys
sys.path.insert(0, '../../IDEaSv2')

In [9]:
import pandas as pd
import numpy as np
import os
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
import neurokit2 as nk
from scipy.stats import skew, kurtosis, iqr

from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import classification_report
from feat_functions.main_utils import *

In [10]:
def ecg_sub_func(df, ecg_sample_rt=512):
    """Align ECG rows onto a regular timestamp grid so gaps show up as NaN rows.

    A grid of expected timestamps is generated starting at the first timestamp
    of ``df`` and spaced ``1000 / ecg_sample_rt`` apart, never going past the
    last timestamp. Grid values and recorded timestamps are both rounded to one
    decimal place and the two frames are joined on that rounded value, so a
    grid point with no recorded row carries NaN in every signal column.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``'Timestamp'`` column convertible to float and at least one
        row. The frame passed in is NOT modified.
    ecg_sample_rt : int or float, default 512
        Sampling rate; the grid spacing is ``1000 / ecg_sample_rt`` in the unit
        of ``'Timestamp'`` (presumably Hz and milliseconds -- confirm against
        the recording device).

    Returns
    -------
    pandas.DataFrame
        New frame with a fresh 0..n-1 index: a ``'timestamp'`` column with the
        rounded grid values followed by the other columns of ``df`` (the
        original ``'Timestamp'`` column is dropped).

    Notes
    -----
    ``pd.concat(axis=1)`` performs an outer join, so a recorded row whose
    rounded timestamp is not on the grid is kept as an extra row with NaN in
    ``'timestamp'``.
    """
    step = 1000 / ecg_sample_rt

    # reset_index(drop=True) returns a new frame, so every assignment below
    # touches the private copy and never the caller's data.
    samples = df.reset_index(drop=True)
    # converting the timestamps to float to make the data timestamps consistent
    samples['Timestamp'] = samples['Timestamp'].astype('float')

    first_ts = samples.loc[0, 'Timestamp']
    last_ts = samples.loc[samples.index[-1], 'Timestamp']

    # every timestamp that should exist if no datapoint had been lost
    n_expected = int((last_ts - first_ts) / step) + 1
    grid = pd.DataFrame({'timestamp': [first_ts + k * step for k in range(n_expected)]})

    # one-decimal rounding on both sides makes the float timestamps comparable
    grid['timestamp'] = grid['timestamp'].round(decimals=1)
    grid.index = grid['timestamp']

    samples['Timestamp'] = samples['Timestamp'].round(decimals=1)
    samples.index = samples['Timestamp']

    # index-aligned join: grid rows without a recording are filled with NaN
    aligned = pd.concat([grid, samples], axis=1)
    return aligned.drop(columns=['Timestamp']).reset_index(drop=True)

def eda_sub_func(df, eda_sample_rt=128):
    """Align EDA rows onto a regular timestamp grid so gaps show up as NaN rows.

    A grid of expected timestamps is generated starting at the first timestamp
    of ``df`` and spaced ``1000 / eda_sample_rt`` apart, never going past the
    last timestamp. Grid values and recorded timestamps are both rounded to one
    decimal place and the two frames are joined on that rounded value, so a
    grid point with no recorded row carries NaN in every signal column.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``'Timestamp'`` column convertible to float and at least one
        row. The frame passed in is NOT modified.
    eda_sample_rt : int or float, default 128
        Sampling rate; the grid spacing is ``1000 / eda_sample_rt`` in the unit
        of ``'Timestamp'`` (presumably Hz and milliseconds -- confirm against
        the recording device).

    Returns
    -------
    pandas.DataFrame
        New frame with a fresh 0..n-1 index: a ``'timestamp'`` column with the
        rounded grid values followed by the other columns of ``df`` (the
        original ``'Timestamp'`` column is dropped).

    Notes
    -----
    ``pd.concat(axis=1)`` performs an outer join, so a recorded row whose
    rounded timestamp is not on the grid is kept as an extra row with NaN in
    ``'timestamp'``.
    """
    step = 1000 / eda_sample_rt

    # reset_index(drop=True) returns a new frame, so every assignment below
    # touches the private copy and never the caller's data.
    samples = df.reset_index(drop=True)
    # converting the timestamps to float to make the data timestamps consistent
    samples['Timestamp'] = samples['Timestamp'].astype('float')

    first_ts = samples.loc[0, 'Timestamp']
    last_ts = samples.loc[samples.index[-1], 'Timestamp']

    # every timestamp that should exist if no datapoint had been lost
    n_expected = int((last_ts - first_ts) / step) + 1
    grid = pd.DataFrame({'timestamp': [first_ts + k * step for k in range(n_expected)]})

    # one-decimal rounding on both sides makes the float timestamps comparable
    grid['timestamp'] = grid['timestamp'].round(decimals=1)
    grid.index = grid['timestamp']

    samples['Timestamp'] = samples['Timestamp'].round(decimals=1)
    samples.index = samples['Timestamp']

    # index-aligned join: grid rows without a recording are filled with NaN
    aligned = pd.concat([grid, samples], axis=1)
    return aligned.drop(columns=['Timestamp']).reset_index(drop=True)

## Baseline Feature Extraction

## Imputation and cleaning are disabled here: the raw signal is exported as-is

In [11]:
# Export the raw baseline ECG channels of every subject.
# The grid-alignment / imputation / cleaning chain (ecg_sub_func -> impute_ecg
# -> ecg_cleaner) is intentionally not applied in this cell: rows are written
# as recorded, minus the rows that contain a NaN in any of the read columns.
main_path = r"X:\IDEaS\Driving Simulator\Signals_cp"
save_pth = r'X:\RealTimeSegment\Driving Simulator\Raw\ECG_EDA_baseline'
ecg_sample_rt = 512  # only needed by the disabled ecg_sub_func step
subjects_id = os.listdir(main_path)
exp_id = ['baseline']

rd_cols = ['Timestamp', 'ECG LL-RA CAL',
           'ECG LA-RA CAL', 'ECG Vx-RL CAL']

for sub_id in subjects_id:
    subject_path = os.path.join(main_path, sub_id)
    print(sub_id)

    for xid in exp_id:
        read_path = os.path.join(subject_path, '{}.csv'.format(xid))

        # Only the read is guarded: a recording missing on disk is skipped,
        # while a failure to create the output folder or to write the CSV is
        # no longer swallowed.
        try:
            df = pd.read_csv(read_path, skipinitialspace=True, usecols=rd_cols)
        except FileNotFoundError:
            continue

        df = df.dropna()  # removing all the nan rows

        # Putting a check if the signal data is not present in the csv then skip that subject
        if len(df) == 0:
            print('Subject {} does not have signal data for session: {}'.format(sub_id, xid))
            continue

        csv_path = os.path.join(save_pth, '{}'.format(sub_id))
        mk_dirs(csv_path)
        df.to_csv(os.path.join(csv_path, 'ecg_{}.csv'.format(xid)), index=False)

1030
1105
1106
1241
1271
1314
1323
1337
1372
1417
1434
1544
1547
1595
1629
1716
1717
1744
1868
1892
1953


In [12]:
# Export the raw baseline EDA (GSR conductance) channel of every subject.
# The grid-alignment / imputation / cleaning / decomposition chain
# (eda_sub_func -> impute_eda -> eda_cleaner -> eda_decom) is intentionally not
# applied in this cell: rows are written as recorded, minus the rows that
# contain a NaN in any of the read columns.
main_path = r"X:\IDEaS\Driving Simulator\Signals_cp"
save_pth = r'X:\RealTimeSegment\Driving Simulator\Raw\ECG_EDA_baseline'
eda_sample_rt = 128  # only needed by the disabled eda_sub_func step
subjects_id = os.listdir(main_path)
exp_id = ['baseline']

rd_cols = ['Timestamp', 'GSR Conductance CAL']

for sub_id in subjects_id:
    subject_path = os.path.join(main_path, sub_id)

    for xid in exp_id:
        read_path = os.path.join(subject_path, '{}.csv'.format(xid))

        # Only the read is guarded: a recording missing on disk is skipped,
        # while a failure to create the output folder or to write the CSV is
        # no longer swallowed.
        try:
            df = pd.read_csv(read_path, skipinitialspace=True, usecols=rd_cols)
        except FileNotFoundError:
            continue

        df = df.dropna()  # removing all the nan rows

        # Putting a check if the signal data is not present in the csv then skip that subject
        if len(df) == 0:
            print('Subject {} does not have signal data for session: {}'.format(sub_id, xid))
            continue

        csv_path = os.path.join(save_pth, '{}'.format(sub_id))
        mk_dirs(csv_path)
        df.to_csv(os.path.join(csv_path, 'eda_{}.csv'.format(xid)), index=False)

## Not imputing missing datapoints in ECG signal.

### Part 1

In [14]:
# Export the raw ECG channels of every driving level for every subject.
# The grid-alignment / imputation / cleaning chain (ecg_sub_func -> impute_ecg
# -> ecg_cleaner) is intentionally not applied in this cell: rows are written
# as recorded, minus the rows that contain a NaN in any of the read columns.
main_path = r"X:\IDEaS\Driving Simulator\Signals_cp"
ecg_sample_rt = 512  # only needed by the disabled ecg_sub_func step
subjects_id = os.listdir(main_path)

rd_cols = ['Timestamp', 'ECG LL-RA CAL',
           'ECG LA-RA CAL', 'ECG Vx-RL CAL']

for sub_id in subjects_id:
    subject_path = os.path.join(main_path, sub_id)

    print(sub_id)

    # Stray files next to the subject folders have no level recordings.
    if not os.path.isdir(subject_path):
        continue

    # List the level files of THIS subject. Taking the list from the first
    # subject only would silently skip, for everyone, any level that the first
    # subject happens not to have.
    dirlist = os.listdir(subject_path)
    exp_id = [x for x in dirlist if 'level_' in x]  # op: ['level_1.csv', 'level_2.csv', ...]

    for xid in exp_id:
        read_path = os.path.join(subject_path, '{}'.format(xid))

        # Only the reads are guarded: a recording missing on disk is skipped,
        # while a failure to create the output folder or to write the CSV is
        # no longer swallowed.
        try:
            # Header-only read (nrows=0): enough to tell the two file layouts
            # apart without parsing the whole recording twice.
            first_col = pd.read_csv(read_path, dtype='object', nrows=0).columns[0]
            if first_col == '#INFO':
                # files starting with an '#INFO' block: the signal table
                # begins after the first 32 rows
                df = pd.read_csv(read_path, skiprows=32, skipinitialspace=True, usecols=rd_cols)
            else:
                df = pd.read_csv(read_path, usecols=rd_cols)
        except FileNotFoundError:
            continue

        df = df.dropna()  # removing all the nan rows

        # Putting a check if the signal data is not present in the csv then skip that subject
        if len(df) == 0:
            print('Subject {} does not have signal data for session: {}'.format(sub_id, xid))
            continue

        csv_path = r'X:\RealTimeSegment\Driving Simulator\Raw\ECG_EDA\{}'.format(sub_id)

        mk_dirs(csv_path)

        # xid already ends in '.csv' here, so the output is e.g. 'ecg_level_1.csv'
        df.to_csv(os.path.join(csv_path, 'ecg_{}'.format(xid)), index=False)

1030
1105
1106
1241
1271
1314
1323


  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


1337
1372
1417
1434
1544
1547
1595
1629
1716
1717
Subject 1717 does not have signal data for session: level_9.csv
1744
1868
1892
1953


## Not Imputing missing values in EDA signal. 

### Processing Step 1

In [15]:
# Export the raw EDA (GSR conductance) channel of every driving level for
# every subject. The grid-alignment / imputation / cleaning / decomposition
# chain (eda_sub_func -> impute_eda -> eda_cleaner -> eda_decom) is
# intentionally not applied in this cell: rows are written as recorded, minus
# the rows that contain a NaN in any of the read columns.
main_path = r"X:\IDEaS\Driving Simulator\Signals_cp"
eda_sample_rt = 128  # only needed by the disabled eda_sub_func step
subjects_id = os.listdir(main_path)


rd_cols = ['Timestamp', 'GSR Conductance CAL']

for sub_id in subjects_id:

    # level files are listed per subject, so each subject is processed with
    # exactly the levels present in its own folder
    dirlist = os.listdir(os.path.join(main_path, sub_id))
    exp_id = [x for x in dirlist if 'level_' in x]  # op: ['level_1.csv', 'level_2.csv', ...]
    exp_id = [x.replace('.csv', '') for x in exp_id]

    subject_path = os.path.join(main_path, sub_id)
    print(sub_id)

    for xid in exp_id:
        read_path = os.path.join(subject_path, '{}.csv'.format(xid))

        # Only the reads are guarded: a recording missing on disk is skipped,
        # while a failure to create the output folder or to write the CSV is
        # no longer swallowed.
        try:
            # Header-only read (nrows=0): enough to tell the two file layouts
            # apart without parsing the whole recording twice.
            first_col = pd.read_csv(read_path, dtype='object', nrows=0).columns[0]
            if first_col == '#INFO':
                # files starting with an '#INFO' block: the signal table
                # begins after the first 32 rows
                df = pd.read_csv(read_path, skiprows=32, skipinitialspace=True, usecols=rd_cols)
            else:
                df = pd.read_csv(read_path, usecols=rd_cols)
        except FileNotFoundError:
            continue

        df = df.dropna()  # removing all the nan rows

        # Putting a check if the signal data is not present in the csv then skip that subject
        if len(df) == 0:
            print('Subject {} does not have signal data for session: {}'.format(sub_id, xid))
            continue

        csv_path = r'X:\RealTimeSegment\Driving Simulator\Raw\ECG_EDA\{}'.format(sub_id)

        mk_dirs(csv_path)
        df.to_csv(os.path.join(csv_path, 'eda_{}.csv'.format(xid)), index=False)

1030
1105
1106
1241
1271
1314
1323


  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Subject 1323 does not have signal data for session: level_6
1337
1372
1417
1434
1544
1547
1595
1629
1716
1717
1744
1868
1892
1953


## Preparing Labels

In [15]:
# Not run in this file

# Preparing the labels: one sheet per subject is read from the workbook, the
# placeholder answers are turned into NaN, every level column is passed through
# fill_multi, remaining NaNs are back-filled then forward-filled, and the
# result is written to <labels_dir>/<subject>.csv.
main_path = r"X:\IDEaS\Driving Simulator\Data\Interpolated_ECG_EDA"
subjects_id = os.listdir(main_path)
label_path = r"X:\IDEaS\Driving Simulator\driving_sim_cog_cp.xlsx"
labels_dir = r"X:\IDEaS\Driving Simulator\Data\New_Labels"
mk_dirs(labels_dir)
read_cols = ['time', 'level_1', 'level_2', 'level_3', 'level_4',
 'level_5', 'level_6', 'level_7', 'level_8', 'level_9']
level_cols = read_cols[1:]  # every column except 'time'

for subs in subjects_id:
    print(subs)
    sub_labels = pd.read_excel(label_path, sheet_name = str(subs), skiprows=1, names=read_cols)
    # placeholder answers count as missing values
    sub_labels = sub_labels.replace(['na', 'no response', 'no rep', np.nan, 'red'], np.nan)

    # Start from THIS subject's sheet. Without this seed, a KeyError on the
    # first level left df_labels undefined (first subject) or still holding the
    # previous subject's labels, which were then saved under the wrong id.
    df_labels = sub_labels
    for level in level_cols:
        try:
            df_labels = fill_multi(df_labels, level)
        except KeyError:
            # As before, the first failing level ends the filling for this
            # subject; the failure is now reported instead of being hidden.
            print('Subject {}: fill_multi raised KeyError on {}; remaining levels left as read'.format(subs, level))
            break

    # NOTE(review): the `method` keyword of DataFrame.replace is deprecated in
    # recent pandas releases; `.bfill()` / `.ffill()` look like the intended
    # replacement -- confirm the output is unchanged before switching.
    df_labels = df_labels.replace(to_replace = [np.nan], method='bfill')
    df_labels = df_labels.replace(to_replace = [np.nan], method='ffill')

    df_labels.to_csv(os.path.join(labels_dir, '{}.csv'.format(subs)), index = False)

1030
1105
1106
1241
1271
1314
1323
1337
1372
1417
1434
1544
1547
1595
1629
1716
1717
1744
1868
1892
1953


In [19]:
# Scratch check: length of the row slice 0:1280 of `df`, i.e. of whatever
# frame the most recently executed cell left bound to that name.
len(df[0:1280])

1280