Date Changed: 2021-10-21

## Import Packages

In [1]:
import sys
# sys.path.insert(0, '../../IDEaSv2')

In [2]:
import pandas as pd
import numpy as np
import os
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
import neurokit2 as nk
from scipy.stats import skew, kurtosis, iqr

from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import classification_report
from feat_functions.main_utils import *

In [3]:
from biosppy import tools as st

### Imputing EEG

In [4]:
def clean_eeg(signal=None, sampling_rate=256.):
    """Band-limit an EEG recording with two cascaded Butterworth filters.

    The input is first high-pass filtered (order 8, cutoff 4) and the result
    is then low-pass filtered (order 16, cutoff 40). Both stages run along
    axis 0 with ``check_phase=True``.

    Parameters
    ----------
    signal : array-like
        Samples to filter; filtering is applied along axis 0.
        NOTE(review): the visible caller passes a 2-D ``(n_samples, 4)`` array.
    sampling_rate : float
        Sampling frequency handed to the filter design (256 by default).

    Returns
    -------
    The filtered signal as returned by ``st._filter_signal``.
    """
    # (band, order, cutoff) of each stage, in the order they are applied.
    filter_stages = (
        ('highpass', 8, 4),
        ('lowpass', 16, 40),
    )

    filtered = signal
    for band, order, cutoff in filter_stages:
        b, a = st.get_filter(ftype='butter',
                             band=band,
                             order=order,
                             frequency=cutoff,
                             sampling_rate=sampling_rate)
        filtered, _ = st._filter_signal(b, a, signal=filtered, check_phase=True, axis=0)

    return filtered

In [11]:
# Clean the raw EEG recording of every subject/experiment and save one CSV per
# recording. Rows containing missing values are dropped, then the four EEG
# channels are filtered with clean_eeg().
main_path = r"X:\IDEaS\Full\August_31_2022\MatBII"

eeg_sample_rt = 256
subjects_id = os.listdir(main_path)
exp_id = ['exp_0', 'exp_1', 'exp_2', 'exp_3']
rd_cols = ['Timestamp', 'TP9', 'AF7', 'AF8', 'TP10']
eeg_channels = ['TP9', 'AF7', 'AF8', 'TP10']

for sub_id in range(len(subjects_id)):

    subject_path = os.path.join(main_path, subjects_id[sub_id])

    print(subjects_id[sub_id])

    for xid in exp_id:
        try:
            # FIX: xid takes the values in exp_id ('exp_0' ... 'exp_3'); the previous
            # check compared it against 'eeg_exp_1' etc., so this skip could never fire.
            if subjects_id[sub_id] == '1544' and (xid in ['exp_1', 'exp_3', 'exp_2']):
                # for exp_1 ECG recording was stopped after 2 mins :(
                # Shimmer ECG sensor was not configured for ECG; hence no ECG was recorded.
                continue

            read_path = os.path.join(subject_path, 'eeg_{}.csv'.format(xid))

            # Only the header row is needed to tell the two export layouts apart,
            # so the file is not parsed in full twice.
            header = pd.read_csv(read_path, nrows=0)
            if header.columns[0] == '#INFO':
                # Files starting with an '#INFO' block carry 32 preamble lines before the table.
                df = pd.read_csv(read_path, skiprows = 32, skipinitialspace=True, usecols=rd_cols)
            else:
                df = pd.read_csv(read_path, usecols=rd_cols)

            # Drop every row with a missing value and renumber the rows.
            # (A previously disabled experiment rebuilt the full timestamp grid at
            # eeg_sample_rt and rejected recordings with more than 1% missing samples;
            # it is not part of the pipeline.)
            # The old `fillna(0)` that followed was a no-op after dropna() and is removed.
            df_eeg_new = df.dropna().reset_index(drop=True)

            ## cleaning eeg signal with Butterworth filter
            df_eeg_new_1 = clean_eeg(df_eeg_new[eeg_channels].values, sampling_rate=eeg_sample_rt)
            df_eeg_cleaned = pd.DataFrame(df_eeg_new_1, columns=['ch1', 'ch2', 'ch3', 'ch4'])
            df_eeg_cleaned['timestamp'] = df_eeg_new['Timestamp'].copy()

            csv_path = r'X:\Thesis\matb2\ECG_EDA\{}'.format(subjects_id[sub_id])

            # mk_dirs is not defined in this notebook; presumably it comes from the
            # `feat_functions.main_utils` star import and creates the folder - TODO confirm.
            mk_dirs(csv_path)
            df_eeg_cleaned.to_csv(os.path.join(csv_path, 'eeg_{}.csv'.format(xid)), index=False)

        except FileNotFoundError:
            # exp_3 for subject 1674 was not recorded :(
            print('error!')
            continue

1026
1105
1106
1175
1194
error!
error!
1337
1390
1400
1419
1517
1544
1624
1629
1674
error!
1688
1717
1765
1818
1892
1929
1933
1953
1981


In [12]:
# Inspect the cleaned EEG frame left in memory by the loop above
# (i.e. the last recording that was processed successfully).
df_eeg_cleaned

Unnamed: 0,ch1,ch2,ch3,ch4,timestamp
0,135.758575,2.036628,3.553292,22.682076,0.000000
1,65.113795,16.610720,15.704583,60.390435,0.003906
2,2.170126,28.137136,19.565526,92.876265,0.007812
3,-51.303565,34.603756,12.196587,120.033523,0.011719
4,-102.162765,35.771479,-2.295773,147.177143,0.015625
...,...,...,...,...,...
149047,27.577313,-23.311723,2.611338,-27.806060,582.214844
149048,19.888519,-21.132511,3.696378,-29.788446,582.218750
149049,12.197064,-16.676618,3.670895,-30.827665,582.222656
149050,4.813177,-9.991979,2.457262,-31.128539,582.226562


# Processing 

In [13]:
def windowSegments(signal:pd.DataFrame, fs:float, window_size_sec:int, signal_col:str='ecg_'):
    """
    Tag every row of ``signal`` with the non-overlapping window it belongs to.

    The frame is cut into consecutive windows of ``fs * window_size_sec`` rows.
    The previous index is moved into a column via ``reset_index(drop=False)``,
    and the ``'index'`` column is then overwritten with the window id:
    10 for the first window, 20 for the second, and so on. Rows after the last
    complete window are discarded.

    Changes from the earlier implementation:
    * every window now holds exactly ``window_size`` rows (the label-inclusive
      ``.loc`` slice used before left one extra row on the last window);
    * the caller's frame is no longer modified in place;
    * a float ``fs`` is accepted, and a non-positive window raises instead of
      looping forever.

    Parameters
    ----------
    signal : pd.DataFrame
        Row-wise samples; left untouched.
    fs : float
        Sampling rate in rows per second.
    window_size_sec : int
        Window length in seconds.
    signal_col : str
        Unused; kept so existing calls keep working.

    Returns
    -------
    pd.DataFrame
        A new frame with ``n_windows * window_size`` rows and the window id in
        column ``'index'``.

    Raises
    ------
    ValueError
        If ``fs * window_size_sec`` is not a positive number of samples.
    """
    window_size = int(round(fs * window_size_sec))
    if window_size <= 0:
        raise ValueError('fs * window_size_sec must be a positive number of samples')

    # reset_index returns a new frame, so the caller's data stays as it was.
    labelled = signal.reset_index(drop=False)

    n_windows = len(labelled) // window_size
    n_rows = n_windows * window_size
    labelled = labelled.iloc[:n_rows].copy()

    # Window ids advance in steps of 10 regardless of window_size_sec.
    # NOTE(review): the caller renames this column to 'time' and merges on it,
    # so the step presumably mirrors the label spacing - confirm before changing it.
    labelled['index'] = (np.arange(n_rows, dtype=np.int64) // window_size + 1) * 10

    return labelled

In [14]:
def labelMean(signal:pd.DataFrame, window_size:int, group_size:int=6):
    """
    Add a ``'meanLabel'`` column holding the rounded block-wise mean of ``'label'``.

    Rows are grouped positionally into consecutive blocks of ``group_size`` rows.
    Every row receives ``np.round`` of the mean ``'label'`` of its block. The
    column is created from the reset index, so it is the first column of the result.

    Changes from the earlier implementation:
    * all rows are processed. The old loop was hard-coded to ``range(0, 54, 6)``,
      so rows past position 54 kept their old index value as "meanLabel" and
      row 54 inherited the mean of the preceding block;
    * a trailing block shorter than ``group_size`` gets the mean of the rows it has;
    * the caller's frame is no longer modified in place.

    Parameters
    ----------
    signal : pd.DataFrame
        Must contain a numeric ``'label'`` column; left untouched.
    window_size : int
        Unused; kept so existing calls keep working.
    group_size : int
        Number of consecutive rows averaged together (default 6, the value
        that was hard-coded before).

    Returns
    -------
    pd.DataFrame
        A new frame with the extra ``'meanLabel'`` column.

    Raises
    ------
    ValueError
        If ``group_size`` is not positive.
    """
    if group_size <= 0:
        raise ValueError('group_size must be a positive integer')

    labelled = signal.reset_index(drop=False)

    # Positional block number of every row: 0,0,...,1,1,...
    block_id = np.arange(len(labelled)) // group_size
    block_mean = labelled['label'].groupby(block_id).transform('mean')

    labelled['index'] = np.round(block_mean)
    return labelled.rename(columns={'index': 'meanLabel'})

In [21]:
# Merge the ECG, EDA and EEG CSVs of each experiment into a single frame, attach
# the labels, write one CSV per experiment and one combined CSV per subject.
# Standardisation is done per subject (over all experiments), not per experiment.

readPath = r'X:/Thesis/matb2/ECG_EDA'
listDirs = os.listdir(readPath)
exp_id = ['exp_0', 'exp_1', 'exp_2', 'exp_3']
labelPath = r'X:\IDEaS_2\MatBII\Data\New_Labels_2'
count = 0
for subs in listDirs:

    subPath = os.path.join(readPath, subs)
    subDirs = os.listdir(subPath)
    print(subs)

    dfLabel = pd.read_csv(os.path.join(labelPath, f'{subs}.csv'))

    dfCombine = pd.DataFrame()
    for xid in exp_id:
        # label file columns belonging to this experiment
        labelColumns = ['time', f'{xid}', f'com_{xid}']
        ecgPath = os.path.join(subPath, f'ecg_{xid}.csv')
        edaPath = os.path.join(subPath, f'eda_{xid}.csv')
        eegPath = os.path.join(subPath, f'eeg_{xid}.csv')

        try:
            dfEcg = pd.read_csv(ecgPath)
            dfEda = pd.read_csv(edaPath)
            dfEeg = pd.read_csv(eegPath)

            # Place the three modalities side by side and keep only rows
            # for which every column has a value.
            df = pd.concat([dfEcg, dfEda, dfEeg], axis=1)
            df.dropna(inplace=True)

            # Number the 10 s windows; that number becomes the merge key 'time'.
            df = windowSegments(df, fs=256, window_size_sec=10).rename(columns={'index':'time'})

            expLabelDF = dfLabel[labelColumns].copy()
            expLabelDF.columns = ['time', 'label', 'complexity']
            expLabelDF = labelMean(expLabelDF, 10)

            df = df.merge(expLabelDF, on='time')
            df['exp'] = xid
            dfCombine = pd.concat([dfCombine, df], ignore_index=True)

            # per-experiment output (not standardised)
            csv_path = r'X:\Thesis\matb2\ECG_EDA_EEG_Combined\{}'.format(subs)
            mk_dirs(csv_path)
            df.to_csv(os.path.join(csv_path, '{}.csv'.format(xid)), index=False)
        except FileNotFoundError as e:
            print('File is not present. Skipping to next!')
            continue

    if not dfCombine.empty:
        # Standardise every signal column over all experiments of the subject.
        for column in ['ECG LL-RA', 'GSR Conductance CAL', 'EDA_Tonic', 'EDA_Phasic',
                       'ch1', 'ch2', 'ch3', 'ch4']:
            dfCombine[column] = nk.standardize(dfCombine[column])

        # csv_path was set by the last experiment of this subject that loaded successfully.
        dfCombine.to_csv(os.path.join(csv_path, '{}.csv'.format(subs)), index=False)

1026
File is not present. Skipping to next!
File is not present. Skipping to next!
File is not present. Skipping to next!
File is not present. Skipping to next!
1105
1106
1175
1194
1337
1390
1400
1419
1517
File is not present. Skipping to next!
1544
File is not present. Skipping to next!
File is not present. Skipping to next!
File is not present. Skipping to next!
1624
File is not present. Skipping to next!
1629
File is not present. Skipping to next!
File is not present. Skipping to next!
File is not present. Skipping to next!
File is not present. Skipping to next!
1674
File is not present. Skipping to next!
1688
1717
File is not present. Skipping to next!
File is not present. Skipping to next!
1765
File is not present. Skipping to next!
1818
1892
1929
1933
File is not present. Skipping to next!
File is not present. Skipping to next!
1936
File is not present. Skipping to next!
File is not present. Skipping to next!
File is not present. Skipping to next!
File is not present. Skipping to

In [22]:
def make_window_for_ECGEDA(signal:np.ndarray, fs:float, overlap:int, window_size_sec:int) -> np.ndarray:
    """Cut a 2-D signal into fixed-length, optionally overlapping windows.

    Changes from the earlier implementation: the windows are gathered in a list
    and joined once (the array used to be re-copied by ``np.append`` for every
    window), a float ``fs`` is accepted, and an overlap that leaves no forward
    step raises instead of looping forever.

    Parameters
    ----------
    signal : np.ndarray
        Array of shape ``(n_samples, n_channels)``.
    fs : float
        Sampling rate in samples per second.
    overlap : int
        Overlap between consecutive windows as a percentage of the window length.
    window_size_sec : int
        Window length in seconds.

    Returns
    -------
    np.ndarray
        Array of shape ``(n_windows, window_size, n_channels)``. Samples after
        the last complete window are dropped. The dtype is the promotion of
        the default integer dtype with ``signal.dtype`` (as before); with no
        complete window an empty integer array is returned.

    Raises
    ------
    ValueError
        If the window length is not positive, or if the overlap is so large
        that the window would not advance.
    """
    window_size = int(round(fs * window_size_sec))
    if window_size <= 0:
        raise ValueError('fs * window_size_sec must be a positive number of samples')

    overlap = int(window_size * (overlap / 100))
    step = window_size - overlap
    if step <= 0:
        raise ValueError('overlap must be below 100 percent so that the window advances')

    n_channels = signal.shape[1]

    # The integer zero block is kept as a seed purely so that the result dtype
    # (and the empty result) match the previous np.append based behaviour.
    segments = [np.zeros((1, window_size, n_channels), dtype = int)]
    for start in range(0, len(signal) - window_size + 1, step):
        window = signal[start:start + window_size]
        segments.append(window.reshape(1, window_size, n_channels))

    return np.concatenate(segments, axis=0)[1:]

In [23]:
def make_windows_1min(df:pd.DataFrame, fs:float, overlap:int, window_size_sec:int):
    """Split a recording into one-minute chunks and window each chunk per modality.

    ``df`` is walked in consecutive, non-overlapping chunks of ``fs * 60`` rows.
    For every chunk the ECG column (``'ECG LL-RA'``), the three EDA columns and
    the four EEG columns (``'ch1'``..``'ch4'``) are windowed separately with
    ``make_window_for_ECGEDA``. Each window gets the chunk label, which is the
    mean of ``'meanLabel'`` over the chunk. Rows after the last complete minute
    are ignored.

    The per-chunk arrays are now collected in lists and joined once at the end.
    Previously every chunk triggered a full re-copy through ``np.append``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing the signal columns named above and ``'meanLabel'``.
    fs : float
        Sampling rate in rows per second (expected to be integer-valued, since
        it is used to build array shapes).
    overlap : int
        Window overlap in percent, forwarded to ``make_window_for_ECGEDA``.
    window_size_sec : int
        Window length in seconds, forwarded to ``make_window_for_ECGEDA``.

    Returns
    -------
    tuple
        ``(ecg, eda, eeg, labels)``: three arrays shaped
        ``(n_windows, fs * window_size_sec, n_channels)`` with 1, 3 and 4
        channels, plus a list with one label per window.

    Raises
    ------
    ValueError
        If ``fs * 60`` is not positive (the chunk loop could not advance).
    """
    numSec = 60 #seconds
    windowSize = fs * numSec
    winSizeArr = fs * window_size_sec
    if windowSize <= 0:
        raise ValueError('fs must be positive')

    # Integer zero seeds keep result dtype / empty-result shape identical to the
    # former np.append based accumulation; they are sliced off on return.
    ecgParts = [np.zeros((1, winSizeArr, 1), dtype = int)]
    edaParts = [np.zeros((1, winSizeArr, 3), dtype = int)]
    eegParts = [np.zeros((1, winSizeArr, 4), dtype = int)]

    labelSegments = []

    start = 0
    while start + windowSize <= len(df):
        # positional slice: the frame may carry a non-zero-based index
        dfOnemin = df.iloc[start:start + windowSize]
        start += windowSize

        ecgOne = np.expand_dims(dfOnemin['ECG LL-RA'].values, axis=1)
        edaOne = dfOnemin[['GSR Conductance CAL', 'EDA_Tonic', 'EDA_Phasic']].values
        eegOne = dfOnemin[['ch1', 'ch2', 'ch3', 'ch4']].values

        labelOne = dfOnemin['meanLabel'].mean()

        ecgArr = make_window_for_ECGEDA(ecgOne, fs, overlap, window_size_sec)
        edaArr = make_window_for_ECGEDA(edaOne, fs, overlap, window_size_sec)
        eegArr = make_window_for_ECGEDA(eegOne, fs, overlap, window_size_sec)

        ecgParts.append(ecgArr)
        edaParts.append(edaArr)
        eegParts.append(eegArr)

        # one label per window produced from this minute
        labelSegments.extend([labelOne] * ecgArr.shape[0])

    ecgSegments = np.concatenate(ecgParts, axis=0)
    edaSegments = np.concatenate(edaParts, axis=0)
    eegSegments = np.concatenate(eegParts, axis=0)
    return ecgSegments[1:], edaSegments[1:], eegSegments[1:], labelSegments

In [24]:
# Build the windowed samples of every subject from its combined CSV.
# Result layout: <modality>Samples[subject] is a list with one entry per
# experiment, in the order the groupby over the 'exp' column yields them.
mainPath = r'X:\Thesis\matb2\ECG_EDA_EEG_Combined'
listDir = os.listdir(mainPath)
samplingRate=256
numSec = 60 #seconds
overlapValue = 60
windowSegLength = 10
ecgSamples = {}
edaSamples = {}
eegSamples = {}

labelSamples = {}
for subs in listDir:
    csvPath = os.path.join(mainPath, f'{subs}', f'{subs}.csv')
    try:
        dfMain = pd.read_csv(csvPath)
    except FileNotFoundError as e:
        # FIX: a missing CSV raises FileNotFoundError. The previous handler
        # caught FileExistsError, which read_csv does not raise for a missing
        # file, so one subject without a combined CSV aborted the run.
        print('File Not found!')
        continue

    dfMain = dfMain.drop(columns=['timestamp'])

    ecgSegs = []
    edaSegs = []
    eegSegs = []

    labelSegs = []
    # consider each experiment separately
    for grp, df in dfMain.groupby(by='exp'):
        # selecting 1 min of session and creating overlapping samples from makewindow function
        ecgSegments, edaSegments, eegSegments, labelSegments = make_windows_1min(df, samplingRate, overlapValue, windowSegLength)
        ecgSegs.append(ecgSegments)
        edaSegs.append(edaSegments)
        eegSegs.append(eegSegments)
        labelSegs.append(labelSegments)

    ecgSamples[subs] = ecgSegs
    edaSamples[subs] = edaSegs
    eegSamples[subs] = eegSegs

    labelSamples[subs] = labelSegs

In [25]:
# Persist the four sample dictionaries as pickle files in one output folder.
path_pickle = r'X:\Thesis\matb2\Processed_Data_1'

mk_dirs(path_pickle)

# file name -> object to store, written in this order
for pickle_name, payload in (
    ('cola_ecg.pickle', ecgSamples),
    ('cola_eda.pickle', edaSamples),
    ('cola_eeg.pickle', eegSamples),
    ('cola_labels.pickle', labelSamples),
):
    with open(os.path.join(path_pickle, pickle_name), 'wb') as handle:
        pickle.dump(payload, handle, protocol = pickle.HIGHEST_PROTOCOL)

In [28]:
# Sanity check for subject 1105, first experiment: (windows, samples per window, EEG channels)
eegSamples['1105'][0].shape

(117, 2560, 4)

In [29]:
# Sanity check for subject 1105, first experiment: (windows, samples per window, ECG channels)
ecgSamples['1105'][0].shape

(117, 2560, 1)

In [30]:
# Sanity check for subject 1105, first experiment: (windows, samples per window, EDA channels)
edaSamples['1105'][0].shape

(117, 2560, 3)