## Second stage of preprocessing

* Extraction of data from the individual records and saving it as NumPy arrays
* Feature extraction based on the power spectral density (PSD) of all sensors

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the training index table; judging by the columns used later, each row
# names one per-window CSV file ('Filepath') and its label ('Seizure Type').
resampled = pd.read_csv('../Data/resampled.csv')
# Tally of rows per seizure type, built by the counting cell below.
seizures = {}
# Display the table as the cell output.
resampled

Unnamed: 0,Patient,Filepath,Seizure,Seizure Type
0,1402,../../eeg_data_resampled/1402_0.csv,0,BCKG
1,1402,../../eeg_data_resampled/1402_1.csv,0,BCKG
2,1402,../../eeg_data_resampled/1402_2.csv,0,BCKG
3,1402,../../eeg_data_resampled/1402_3.csv,0,BCKG
4,1402,../../eeg_data_resampled/1402_4.csv,0,BCKG
...,...,...,...,...
10882,5426,../../eeg_data_resampled/5426_21.csv,1,FNSZ
10883,5426,../../eeg_data_resampled/5426_12.csv,1,FNSZ
10884,5426,../../eeg_data_resampled/5426_20.csv,1,FNSZ
10885,2380,../../eeg_data_resampled/2380_4.csv,1,GNSZ


In [3]:

# Count how many rows of the training index carry each seizure type.
# The tally is rebuilt from scratch so that re-running this cell does not double
# the counts, and the column is read by name rather than via `row[3]`: an integer
# key on a label-indexed row is a deprecated positional fallback in recent pandas
# and would silently pick another column if the CSV layout changed.
seizures = {}
for seizure_type in resampled['Seizure Type']:
    seizures[seizure_type] = seizures.get(seizure_type, 0) + 1
# Print one "<type> <count>" line per seizure type, in order of first appearance.
for seizure_type, count in seizures.items():
    print(seizure_type, count)

BCKG 10000
FNSZ 463
ABSZ 49
CPSZ 137
TCSZ 16
GNSZ 204
MYSZ 4
SPSZ 10
TNSZ 4


In [4]:
# Load the test index table; it is used below with the same 'Filepath' and
# 'Seizure Type' columns as the training index.
resampled_test = pd.read_csv('../Data/resampled_test.csv')
# Tally of rows per seizure type for the test split, built by the counting cell below.
seizures_test = {}
# Display the table as the cell output.
resampled_test

Unnamed: 0,Patient,Filepath,Seizure,Seizure Type
0,258,../../eeg_data_resampled/258_0.csv,0,BCKG
1,258,../../eeg_data_resampled/258_1.csv,0,BCKG
2,258,../../eeg_data_resampled/258_0.csv,0,BCKG
3,258,../../eeg_data_resampled/258_1.csv,0,BCKG
4,258,../../eeg_data_resampled/258_2.csv,0,BCKG
...,...,...,...,...
10263,2297,../../eeg_data_resampled/2297_23.csv,1,CPSZ
10264,6546,../../eeg_data_resampled/6546_29.csv,1,TCSZ
10265,6546,../../eeg_data_resampled/6546_36.csv,1,TCSZ
10266,6546,../../eeg_data_resampled/6546_29.csv,1,FNSZ


In [5]:

# Count how many rows of the test index carry each seizure type.
# The tally is rebuilt from scratch so that re-running this cell does not double
# the counts, and the column is read by name rather than via `row[3]`: an integer
# key on a label-indexed row is a deprecated positional fallback in recent pandas
# and would silently pick another column if the CSV layout changed.
seizures_test = {}
for seizure_type in resampled_test['Seizure Type']:
    seizures_test[seizure_type] = seizures_test.get(seizure_type, 0) + 1
# Print one "<type> <count>" line per seizure type, in order of first appearance.
for seizure_type, count in seizures_test.items():
    print(seizure_type, count)

BCKG 10000
CPSZ 45
GNSZ 57
FNSZ 124
MYSZ 2
TNSZ 4
ABSZ 32
TCSZ 4


In [6]:
# Summary statistics of the training index (count, mean, std, quartiles, min/max).
resampled.describe()

Unnamed: 0,Patient,Seizure
count,10887.0,10887.0
mean,3657.601727,0.060531
std,2505.595924,0.238479
min,2.0,0.0
25%,1357.0,0.0
50%,4208.0,0.0
75%,5672.0,0.0
max,13145.0,1.0


In [7]:
from collections import OrderedDict

# Collapse a multi-sensor PSD into six brain-rhythm band totals per sensor.
def get_brain_waves_power(psd_welch, freqs):
    """Sum the PSD of every sensor over six fixed frequency bands.

    Parameters
    ----------
    psd_welch : 2-D array
        One row per sensor; column j holds the value at frequency ``freqs[j]``.
    freqs : 1-D numpy array
        Frequency of each PSD column, in the same unit as the band edges below.

    Returns
    -------
    numpy.ndarray of shape ``(psd_welch.shape[0], 6)``
        Columns are ordered delta, theta, alpha, lower_beta, higher_beta, gamma.

    Notes
    -----
    * The first band (delta) has no lower bound: it takes every frequency up to
      and including its upper edge.
    * All other bands include both edges, so a frequency that falls exactly on
      a shared edge (e.g. 13.0) contributes to both neighbouring bands.
    * A band that matches no frequency yields 0 for every sensor.
    """
    # (name, lower edge, upper edge) -- the order defines the output columns.
    bands = (
        ("delta", 1.0, 4.0),
        ("theta", 4.0, 7.5),
        ("alpha", 7.5, 13.0),
        ("lower_beta", 13.0, 16.0),
        ("higher_beta", 16.0, 30.0),
        ("gamma", 30.0, 40.0),
    )

    band_powers = np.zeros((psd_welch.shape[0], len(bands)))
    for column, (_name, low, high) in enumerate(bands):
        in_band = freqs <= high
        if column > 0:
            # Only the bands after the first one are bounded from below.
            in_band &= freqs >= low
        band_powers[:, column] = psd_welch[:, in_band].sum(axis=1)
    return band_powers

In [8]:
from asyncio.subprocess import SubprocessStreamProtocol
from mne.time_frequency import psd_array_welch

# Build the training feature matrix: one flattened (19 x 6) band-power vector per
# row of the training index, with the matching 'Seizure Type' label.
feature_matrix = []
y = []
# Columns are read by name instead of `resampled.iloc[i][1]`: an integer key on a
# label-indexed row is a deprecated positional fallback in recent pandas, and
# building a full row Series per window is needlessly slow.
for filepath, seizure_type in zip(resampled['Filepath'], resampled['Seizure Type']):

    # NOTE(review): the recorded run of this cell stopped with FileNotFoundError on
    # this '../'-prefixed path -- confirm the prefix matches the directory the
    # notebook is executed from.
    innerdata = pd.read_csv('../' + filepath)
    window_data = innerdata.to_numpy()

    # PSD NODE FEATURES - Welch PSD computed on the transposed window, so each
    # CSV column becomes one row of psd_welch.
    psd_welch, freqs = psd_array_welch(window_data.transpose(), sfreq=250, fmin=0.5, fmax=50.0,
                                       n_per_seg=100, average='mean', verbose=False)
    # Convert power to dB scale.
    # NOTE(review): the conversion happens before the per-band sum, so each feature
    # is a sum of dB values rather than the dB of the summed power -- confirm intended.
    psd_welch = 10 * np.log10(psd_welch)
    band_powers = get_brain_waves_power(psd_welch, freqs)
    # Guard against windows whose channel count differs from the expected 19.
    assert band_powers.shape == (19, 6)

    # flatten all features and append them with the label of the same row
    feature_matrix.append(band_powers.flatten())
    y.append(seizure_type)

# save the features and labels as numpy array to disk
np.save("../Data/saved_numpy_arrays/X_psd_epilepsy_corpus.npy", np.array(feature_matrix))
np.save("../Data/saved_numpy_arrays/y_epilepsy_corpus.npy", np.array(y))

print ("\nALL ARRAYS SAVED TO DISK!...\n")

FileNotFoundError: [Errno 2] No such file or directory: '../../../eeg_data_resampled/1402_0.csv'

In [None]:
# Build the test feature matrix: one flattened (19 x 6) band-power vector per row
# of the test index, with the matching 'Seizure Type' label.
feature_matrix_test = []
# NOTE: `y` is rebound here, replacing the training labels collected by the
# training-feature cell (which has already written them to disk).
y = []
# Columns are read by name instead of `resampled_test.iloc[i][1]`: an integer key on
# a label-indexed row is a deprecated positional fallback in recent pandas, and
# building a full row Series per window is needlessly slow.
for filepath, seizure_type in zip(resampled_test['Filepath'], resampled_test['Seizure Type']):
    # NOTE(review): the training-feature cell failed with FileNotFoundError on the
    # same '../'-prefixed path scheme -- confirm the prefix matches the directory
    # the notebook is executed from.
    innerdata = pd.read_csv('../' + filepath)
    window_data = innerdata.to_numpy()

    # PSD NODE FEATURES - Welch PSD computed on the transposed window, so each
    # CSV column becomes one row of psd_welch.
    psd_welch, freqs = psd_array_welch(window_data.transpose(), sfreq=250, fmin=0.5, fmax=50.0,
                                       n_per_seg=100, average='mean', verbose=False)
    # Convert power to dB scale.
    # NOTE(review): the conversion happens before the per-band sum, so each feature
    # is a sum of dB values rather than the dB of the summed power -- confirm intended.
    psd_welch = 10 * np.log10(psd_welch)
    band_powers = get_brain_waves_power(psd_welch, freqs)
    # Guard against windows whose channel count differs from the expected 19.
    assert band_powers.shape == (19, 6)
    # flatten all features and append them with the label of the same row
    feature_matrix_test.append(band_powers.flatten())
    y.append(seizure_type)

# save the test features and labels as numpy arrays to disk
np.save("../Data/saved_numpy_arrays/X_test_psd_epilepsy_corpus.npy", np.array(feature_matrix_test))
np.save("../Data/saved_numpy_arrays/y_test_epilepsy_corpus.npy", np.array(y))

print ("\nALL ARRAYS SAVED TO DISK!...\n")