## First stage of preprocessing

* Signals from the EDF files are resampled to 250 Hz
* Signals are split into non-overlapping windows of 10 seconds
* Each window is stored in its own CSV file (one column per channel) under `../Data/eeg_data_resampled`
* A cumulative summary of all these CSV files, along with their labels, is stored in the `resampled.csv` file

In [1]:
import numpy as np
import pandas as pd
import os
import mne
import scipy.signal as sp
import json

In [2]:
# Window length in seconds and the sampling rate (Hz) every recording is resampled to.
SECONDS = 10
RESAMPLING_FREQUENCY = 250

# Running count of saved windows per label; updated by handleRow.
SZ_COUNT = {}

# Load the recording spreadsheet, skipping its first data row and keeping
# only the first 15 columns (the ones handleRow indexes by position).
dataset = pd.read_csv('../Data/dataset.csv').iloc[1:, :15]

dataset.shape

(6107, 15)

In [3]:
def log(string):
    """Append ``string`` to ``preprocess_1.log`` in the current working directory.

    The text is written verbatim: no newline or timestamp is added, so callers
    supply their own line terminators.

    Args:
        string: Text to append to the log file.
    """
    # The context manager closes the handle even if write() raises, and the
    # local name no longer shadows this function.
    with open('preprocess_1.log', 'a') as log_file:
        log_file.write(string)

In [4]:
def handleRow(row: int, output_csv: str) -> int:
    """Resample, window and save the EDF recording referenced by ``dataset`` row ``row``.

    Reads the module-level ``dataset``, ``RESAMPLING_FREQUENCY``, ``SECONDS`` and
    ``SZ_COUNT``; ``SZ_COUNT`` is updated in place. Columns of ``dataset`` are
    addressed by position: 2 (used as the patient id), 3 (must be non-NaN for the
    row to be processed), 11 (file path), 12 / 13 (stored as a seizure
    ``[start, end]`` pair and multiplied by the sampling rate, i.e. treated as
    seconds) and 14 (seizure type, ``'BCKG'`` when NaN).

    Starting at ``row`` the function walks forward over every following row
    whose column 2 is NaN or equals the starting row's patient id. For each row
    visited it appends that row's seizure interval (if any) and then loads the
    EDF file named in column 11 of ``row``, resamples the included EEG channels,
    slices them into ``SECONDS``-long windows and writes one CSV per window
    under ``../Data/eeg_data_resampled/`` plus one summary line in
    ``output_csv``. At most 10000 windows per label are written.

    Args:
        row: Positional index into ``dataset`` of the row to start from.
        output_csv: Path of an existing summary CSV; it is read, rows of four
            values (patient, window path, seizure flag, label) are appended,
            and the file is rewritten after every saved window.

    Returns:
        ``row + 1`` when column 3 of ``row`` is NaN (nothing is processed),
        otherwise the index of the first row the walk did not consume.

    NOTE(review): every ``dataset.iloc[i][k]`` below indexes a row Series with
    an integer key; newer pandas releases deprecate treating such keys as
    positions -- confirm against the pandas version in use.
    """
    if str(dataset.iloc[row][3]) != str(np.nan) :
        PATIENT = str(int(dataset.iloc[row][2]))
        # The label comes from the starting row only and is applied to every
        # window flagged as seizure below, whichever interval triggered it.
        SEIZURE_TYPE = dataset.iloc[row][14] if str(dataset.iloc[row][14]) != str(np.nan) else 'BCKG'
        # These accumulators are created once per call and are NOT reset between
        # iterations of the while loop below.
        SEIZURES = []
        CHANNELS = []
        SEIZURE_OCCUR = []
        SEIZURE_TYPES = []
        DATAPOINTS = []
        current_row = row
        # Walk over rows that have no patient id or the same patient id.
        # NOTE(review): everything down to the CSV writing is inside this loop,
        # so the same file is loaded and appended to the accumulators once per
        # visited row -- confirm whether the processing was meant to run once,
        # after all intervals are collected.
        while(current_row < int(dataset.shape[0]) and (str(dataset.iloc[current_row][2]) == str(np.nan) or str(int(dataset.iloc[current_row][2])) == PATIENT )):
            if str(dataset.iloc[current_row][12]) != 'nan':
                SEIZURES.append([float(dataset.iloc[current_row][12]),float(dataset.iloc[current_row][13])]) 
            current_row += 1
            # The path is always taken from the starting row, not current_row.
            # Its first character is dropped and its last three characters are
            # replaced by 'edf' after prefixing the corpus directory.
            filepath = dataset.iloc[row][11]
            filepath = (os.getcwd() +'/../Data/dataset/the-tuh-eeg-seizure-corpus-tusz-v152/edf/'+ filepath[1:])[:-3]+'edf'
            filepath = os.path.abspath(filepath)

            data = mne.io.read_raw_edf(filepath, preload = True)
            channels = data.ch_names
            INCLUDED_CHANNELS = ['EEG FP1','EEG FP2','EEG F3','EEG F4','EEG C3','EEG C4','EEG P3','EEG P4','EEG O1','EEG O2','EEG F7','EEG F8','EEG T3','EEG T4','EEG T5','EEG T6','EEG FZ','EEG CZ','EEG PZ']
            DATA = {}
            # TO_JSON is never read again in this function.
            TO_JSON = {PATIENT: []}
            # Pre-allocate one slot per window, defaulting to "no seizure".
            # NOTE(review): the count uses the sample count BEFORE resampling;
            # when the file's rate differs from RESAMPLING_FREQUENCY it will not
            # match the number of windows produced by the channel loop below.
            for i in range(len(data.get_data()[0])//(RESAMPLING_FREQUENCY*SECONDS)):
                DATAPOINTS.append([])
                SEIZURE_OCCUR.append(0)
                SEIZURE_TYPES.append('BCKG')
            for chan in range(len(channels)):

                # Keep a channel when the part of its name before '-' is listed.
                if channels[chan].split('-')[0] in INCLUDED_CHANNELS:
                    CHANNELS.append(channels[chan])
                    # Resample at 250 hz
                    # get_data() is called again for every included channel.
                    DATA[chan] = data.get_data()[chan]
                    secs = len(DATA[chan])/float(data.info['sfreq'])
                    samps = secs*RESAMPLING_FREQUENCY
                    DATA[chan] = sp.resample(DATA[chan], int(samps))
                    start = 0
                    indx = 0
                    # Cut the channel into consecutive full-length windows; a
                    # trailing partial window is discarded.
                    while start < len(DATA[chan]):
                        if len(DATA[chan]) - start < RESAMPLING_FREQUENCY*SECONDS: 
                            break 
                        else :
                            end = start + RESAMPLING_FREQUENCY*SECONDS
                            seizure = 0
                            # A window is flagged when a seizure's start sample
                            # or end sample lies in [start, end).
                            # NOTE(review): a window lying entirely inside a
                            # longer seizure matches neither test.
                            for sz in SEIZURES:
                                if (sz[0]*RESAMPLING_FREQUENCY>=start and sz[0]*RESAMPLING_FREQUENCY<end) or (sz[1]*RESAMPLING_FREQUENCY>=start and sz[1]*RESAMPLING_FREQUENCY<end):
                                    seizure = 1
                            if seizure == 1 :
                                # NOTE(review): XOR toggles the flag once per
                                # channel (and per repeat of the outer loop), so
                                # the final value depends on how many times this
                                # runs -- plain assignment may have been intended.
                                SEIZURE_OCCUR[indx] ^= 1
                                SEIZURE_TYPES[indx] = SEIZURE_TYPE
                            DATAPOINTS[indx].append(DATA[chan][start:end])

                            start += RESAMPLING_FREQUENCY*SECONDS
                            indx += 1
            # The summary file is re-read here and rewritten after each saved window.
            df = pd.read_csv(output_csv)
            # temp is never read again in this function.
            temp = None
            # From here on `data` is the window index, not the MNE object.
            for data in range(len(DATAPOINTS)) :
                log(str("Window "+str(data)+" of signal "+str(current_row-1)+"\n"))
                dfi = pd.DataFrame()
                for ch in range(len(CHANNELS)):
                    # NOTE(review): the emptiness check looks at window 0, not
                    # at the current window `data`.
                    if len(DATAPOINTS[0][ch]):
                        dfi[CHANNELS[ch].split('-')[0]] = DATAPOINTS[data][ch]
                # Save the window unless 10000 windows of its label already
                # exist. The two branches differ only in how SZ_COUNT is updated.
                # The file name depends only on patient id and window index, so
                # a later write for the same pair overwrites the earlier file.
                if SEIZURE_TYPES[data] in SZ_COUNT and SZ_COUNT[SEIZURE_TYPES[data]]<10000:
                    SZ_COUNT[SEIZURE_TYPES[data]] += 1
                    df.loc[len(df.index)] = [PATIENT, '../Data/eeg_data_resampled/'+str(int(PATIENT))+'_'+str(data)+'.csv', SEIZURE_OCCUR[data], SEIZURE_TYPES[data]]
                    dfi.to_csv('../Data/eeg_data_resampled/'+str(int(PATIENT))+'_'+str(data)+'.csv', index=False)
                    df.to_csv(output_csv, index=False)    
                elif SEIZURE_TYPES[data] not in SZ_COUNT:
                    SZ_COUNT[SEIZURE_TYPES[data]] = 1
                    df.loc[len(df.index)] = [PATIENT, '../Data/eeg_data_resampled/'+str(int(PATIENT))+'_'+str(data)+'.csv', SEIZURE_OCCUR[data], SEIZURE_TYPES[data]]
                    dfi.to_csv('../Data/eeg_data_resampled/'+str(int(PATIENT))+'_'+str(data)+'.csv', index=False)
                    df.to_csv(output_csv, index=False)        
        return current_row
    return row+1

In [5]:
# Handling train dataset
# Walk the training spreadsheet; handleRow returns the index of the next row
# to process, so rows it consumed are skipped.
row = 0
#row = int(input("Enter starting row: "))
while row < int(dataset.shape[0]):
    rowcopy = row
    try:
        skip = handleRow(row, '../Data/resampled.csv')
    except Exception as exc:
        # A bad recording must not stop the whole run, but only ordinary
        # errors are swallowed: KeyboardInterrupt / SystemExit still abort,
        # and the reason is recorded instead of being silently dropped.
        log("=========Ignored row "+ str(rowcopy)+"\n")
        log("    reason: " + type(exc).__name__ + ": " + str(exc) + "\n")
        row = rowcopy+1
    else:
        row = skip
        log("==========Processed row "+ str(row-1)+"\n")

Extracting EDF parameters from E:\EEG_PROJECT\the-tuh-eeg-seizure-corpus-tusz-v152\edf\train\02_tcp_le\014\00001402\s003_2008_07_21\00001402_s003_t001.edf...
EDF file detected
Setting channel info structure...
Creating raw.info structure...
Reading 0 ... 315999  =      0.000 ...  1263.996 secs...
Extracting EDF parameters from E:\EEG_PROJECT\the-tuh-eeg-seizure-corpus-tusz-v152\edf\train\02_tcp_le\014\00001479\s001_2004_06_14\00001479_s001_t001.edf...
EDF file detected
Setting channel info structure...
Creating raw.info structure...
Reading 0 ... 299749  =      0.000 ...  1198.996 secs...
Extracting EDF parameters from E:\EEG_PROJECT\the-tuh-eeg-seizure-corpus-tusz-v152\edf\train\02_tcp_le\014\00001413\s001_2004_07_13\00001413_s001_t000.edf...
EDF file detected
Setting channel info structure...
Creating raw.info structure...
Reading 0 ... 301499  =      0.000 ...  1205.996 secs...
Extracting EDF parameters from E:\EEG_PROJECT\the-tuh-eeg-seizure-corpus-tusz-v152\edf\train\02_tcp_le\014

In [6]:
# Handling Dev dataset: windows are written to the same eeg_data_resampled
# directory, with their summary stored separately in resampled_test.csv.
# NOTE(review): unlike the training spreadsheet, this one is not sliced with
# .iloc[1:, :15] after loading -- confirm that is intentional.
dataset = pd.read_csv('../Data/dev_dataset.csv')
row = 0
# Restart the per-label window counter so the 10000 cap applies to this split.
SZ_COUNT = dict()
#row = int(input("Enter starting row: "))
while row < int(dataset.shape[0]):
    rowcopy = row
    try:
        # Summary path uses '../Data/' like every other path in the notebook
        # (it previously pointed at './Data/').
        skip = handleRow(row, '../Data/resampled_test.csv')
    except Exception as exc:
        # A bad recording must not stop the whole run, but only ordinary
        # errors are swallowed: KeyboardInterrupt / SystemExit still abort,
        # and the reason is recorded instead of being silently dropped.
        log("=========Ignored row "+ str(rowcopy)+"\n")
        log("    reason: " + type(exc).__name__ + ": " + str(exc) + "\n")
        row = rowcopy+1
    else:
        row = skip
        log("=========Processed row "+ str(row-1)+"\n")


Extracting EDF parameters from E:\EEG_PROJECT\the-tuh-eeg-seizure-corpus-tusz-v152\edf\dev\01_tcp_ar\002\00000258\s002_2003_07_21\00000258_s002_t000.edf...
EDF file detected
Setting channel info structure...
Creating raw.info structure...
Reading 0 ... 7999  =      0.000 ...    19.997 secs...
Extracting EDF parameters from E:\EEG_PROJECT\the-tuh-eeg-seizure-corpus-tusz-v152\edf\dev\01_tcp_ar\002\00000258\s003_2003_07_22\00000258_s003_t000.edf...
EDF file detected
Setting channel info structure...
Creating raw.info structure...
Reading 0 ... 93599  =      0.000 ...   233.998 secs...
Extracting EDF parameters from E:\EEG_PROJECT\the-tuh-eeg-seizure-corpus-tusz-v152\edf\dev\01_tcp_ar\006\00000629\s003_2003_07_23\00000629_s003_t000.edf...
EDF file detected
Setting channel info structure...
Creating raw.info structure...
Reading 0 ... 73999  =      0.000 ...   184.998 secs...
Extracting EDF parameters from E:\EEG_PROJECT\the-tuh-eeg-seizure-corpus-tusz-v152\edf\dev\01_tcp_ar\006\00000629\s0

In [7]:
# df = df.drop_duplicates(keep=False)


In [8]:
# Tally how many saved windows carry each label in the training summary, then
# keep a backup copy of the summary file.
# The directory is spelled '../Data' exactly as it is when the file is written,
# so the cell also works on case-sensitive filesystems.
resampled = pd.read_csv('../Data/resampled.csv')
seizures = {}
# The label is the fourth value of every summary row; selecting the column by
# position once avoids building a Series per row and indexing it with an
# integer key.
for label in resampled.iloc[:, 3]:
    seizures[label] = seizures.get(label, 0) + 1
for label, count in seizures.items():
    print(label, str(count))
resampled.to_csv('../Data/resampled_backup.csv', index=False)

BCKG 10000
FNSZ 463
ABSZ 49
CPSZ 137
TCSZ 16
GNSZ 204
MYSZ 4
SPSZ 10
TNSZ 4


In [9]:
# Tally how many saved windows carry each label in the dev/test summary, then
# keep a backup copy of the summary file.
# The directory is spelled '../Data' exactly as it is elsewhere in the
# notebook, so the cell also works on case-sensitive filesystems.
resampled = pd.read_csv('../Data/resampled_test.csv')
seizures = {}
# The label is the fourth value of every summary row; selecting the column by
# position once avoids building a Series per row and indexing it with an
# integer key.
for label in resampled.iloc[:, 3]:
    seizures[label] = seizures.get(label, 0) + 1
for label, count in seizures.items():
    print(label, str(count))
resampled.to_csv('../Data/resampled_backup_test.csv', index=False)

BCKG 10000
CPSZ 45
GNSZ 57
FNSZ 124
MYSZ 2
TNSZ 4
ABSZ 32
TCSZ 4
