In [1]:
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import LSTM, Dense
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import os
import time

In [2]:
# Step 1: Data Preprocessing
# Locate the project data and list the training EEG recordings.
# NOTE(review): project_path is an absolute path on one machine; anyone else
# running this notebook has to edit it.
project_path = '/Users/sohamdas/Desktop/EECS 545/Project'
data_path = project_path + '/data'
train_eeg_path = data_path + '/train_eegs'
# os.listdir returns entries in arbitrary order; sorting makes every later
# positional use of this list (e.g. taking the first 20 files) reproducible.
eeg_file_list = sorted(os.listdir(train_eeg_path))
print('Total no. of train EEG files',len(eeg_file_list))

Total no. of train EEG files 17300


In [3]:
# Same record-length scan over the whole dataset (kept commented out; the next cell runs it on a 20-file subset)
#record_times = []
#for i in range(len(eeg_file_list)):
#    filename = eeg_file_list[i]
#    df = pd.read_parquet(train_eeg_path + '/' + filename)
#    record_times.append(df.shape[0])
#print(min(record_times), max(record_times))

In [4]:
# Scan only the first 20 listed recordings to size the model inputs.
short_file_list = eeg_file_list[:20]

# Row count of every recording in the subset.
record_times = []
for filename in short_file_list:
    df = pd.read_parquet(f'{train_eeg_path}/{filename}')
    record_times.append(len(df))

min_time = min(record_times)       # shortest recording (in rows) within the subset
n_graphs = df.shape[1]             # column count of the last file read above
n_samples = len(short_file_list)
n_samples

20

In [5]:
# pre-processing
# Load train.csv: it carries the eeg_id and the per-class vote columns that
# the following cells turn into the Y array.
train_csv_path = f'{data_path}/train.csv'
data = pd.read_csv(train_csv_path)
data.head()

Unnamed: 0,eeg_id,eeg_sub_id,eeg_label_offset_seconds,spectrogram_id,spectrogram_sub_id,spectrogram_label_offset_seconds,label_id,patient_id,expert_consensus,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote
0,1628180742,0,0.0,353733,0,0.0,127492639,42516,Seizure,3,0,0,0,0,0
1,1628180742,1,6.0,353733,1,6.0,3887563113,42516,Seizure,3,0,0,0,0,0
2,1628180742,2,8.0,353733,2,8.0,1142670488,42516,Seizure,3,0,0,0,0,0
3,1628180742,3,18.0,353733,3,18.0,2718991173,42516,Seizure,3,0,0,0,0,0
4,1628180742,4,24.0,353733,4,24.0,3080632009,42516,Seizure,3,0,0,0,0,0


In [6]:
# Count how many labelled eeg_ids have a matching parquet recording on disk.
unique_eeg_id = np.unique(data['eeg_id'])

# A set gives O(1) membership tests; checking against the list rescanned
# every file name for every id.
eeg_file_set = set(eeg_file_list)

# NOTE: despite its name, this list holds positions into unique_eeg_id,
# not the ids themselves.
common_eeg_id = [
    position
    for position, eeg_id in enumerate(unique_eeg_id)
    if str(eeg_id) + '.parquet' in eeg_file_set
]
print(len(common_eeg_id))
print(len(eeg_file_list))

17089
17300


In [7]:
# Column names used to build the labels.
unique_eeg_id = np.unique(data['eeg_id'])
# One vote-count column per class; their order fixes the class order of Y.
vote_vector = ['seizure_vote','lpd_vote','gpd_vote','lrda_vote','grda_vote','other_vote']
# Built from vote_vector so the two lists cannot drift apart. The previous
# literal had a misplaced quote that fused 'seizure_vote' and 'lpd_vote'
# into the single bogus name 'seizure_vote,lpd_vote'.
var_of_interest = ['eeg_id'] + vote_vector
n_disease = len(vote_vector)

In [8]:
def convert_to_one_hot(y_votes):
    """Mark the top-voted class of every row with a 1 and all others with 0.

    Parameters
    ----------
    y_votes : numpy.ndarray
        2-D array indexed as (sample, class) holding vote counts or shares.

    Returns
    -------
    numpy.ndarray
        Array of zeros with the same two dimensions as ``y_votes`` and a
        single 1 per row at the position ``np.argmax`` selects for that row.
    """
    # Winning class index per row.
    top_class = np.argmax(y_votes, axis=1)

    n_rows, n_cols = y_votes.shape[0], y_votes.shape[1]
    encoded = np.zeros((n_rows, n_cols))

    # Pair each row number with its winning column and set that cell.
    row_numbers = np.arange(n_rows)
    encoded[row_numbers, top_class] = 1

    return encoded

In [9]:
# Number of recordings to load for this experiment (overrides the value set
# during the 20-file scan); use len(unique_eeg_id) to load every labelled id.
n_samples = 100

In [10]:
%%time
# Build the model inputs:
#   X - first `min_time` rows of each usable recording, one sample per eeg_id
#   Y - share of the votes each class received for that eeg_id
available_files = set(eeg_file_list)  # O(1) membership instead of rescanning the list

# Votes from the first train.csv row of every eeg_id, looked up once instead
# of building a full-length boolean mask for each id inside the loop.
first_votes = data.drop_duplicates(subset='eeg_id').set_index('eeg_id')[vote_vector]

# Collect samples in lists and stack once at the end; np.append inside the
# loop re-copies the whole array on every iteration.
eeg_windows = []
vote_shares = []
for ID in unique_eeg_id:
    if len(eeg_windows) >= n_samples:
        break
    filename = str(ID) + '.parquet'
    if filename not in available_files:
        continue
    df = pd.read_parquet(train_eeg_path + '/' + filename)
    if df.shape[0] < min_time:
        # Shorter than the common window, so it could not be stacked with the rest.
        continue
    df_cropped = df.iloc[:min_time].to_numpy(dtype=float)
    vote_count = first_votes.loc[ID].to_numpy(dtype=float)
    vote_total = vote_count.sum()
    # Skip recordings with missing signal values, missing votes, or no votes
    # at all (a zero total would turn the shares into NaN).
    if np.isnan(df_cropped).any() or np.isnan(vote_count).any() or vote_total == 0:
        continue
    eeg_windows.append(df_cropped)
    vote_shares.append(vote_count / vote_total)

count = len(eeg_windows)
if count == 0 or count < n_samples:
    # The id list ran out before enough usable recordings were found.
    raise ValueError(
        'Only %d usable recordings found; %d were requested.' % (count, n_samples)
    )

X = np.stack(eeg_windows)   # (samples, min_time, columns)
Y = np.stack(vote_shares)   # (samples, n_disease)

# Initialize and apply MinMaxScaler for reshaped dataset: flatten samples and
# time into rows so every column is scaled to [0, 1] over all of its values.
# NOTE(review): the scaler is fitted on every sample before the train/test
# split further down, so test-set ranges influence the training inputs.
X_reshaped = X.reshape(-1, X.shape[-1])

scaler = MinMaxScaler(feature_range=(0, 1))
X_scaled = scaler.fit_transform(X_reshaped)

# Reshape back to 3D (n_samples, n_time_steps, n_features)
X_scaled_3d = X_scaled.reshape(X.shape)
X = X_scaled_3d

CPU times: user 848 ms, sys: 942 ms, total: 1.79 s
Wall time: 1.83 s


In [11]:
# Report the array shapes, cast both arrays to float32, then print how many
# NaN entries each one contains.
print(X.shape, Y.shape, sep='\n')
X, Y = X.astype('float32'), Y.astype('float32')
print(np.isnan(X).sum())
print(np.isnan(Y).sum())

(100, 10000, 20)
(100, 6)
0
0


In [12]:
# Largest value of each feature column, taken over all samples and time steps.
X.max(axis=(0, 1))

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1.], dtype=float32)

In [13]:
# One-hot encoding: replace the vote shares in Y with a hard label that has a
# 1 at the top-voted class of each sample and 0 elsewhere.
Y = y_one_hot = convert_to_one_hot(Y)

print("One-hot encoded Y-values:\n", y_one_hot)

One-hot encoded Y-values:
 [[0. 0. 0. 0. 0. 1.]
 [0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1.]
 [0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1.]
 [1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 1.]
 [1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0. 0.]
 [0. 1. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 1.]
 [0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0.]
 [0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1. 0.]
 [1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0.]
 [0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 1.]
 [0. 0. 1. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 1.]
 [1. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 [1. 0.

In [14]:
# Splitting the data: hold out 20% of the samples for testing.
# A fixed random_state makes the shuffle, and so the train/test membership,
# repeatable across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [15]:
# Step 2: Building the LSTM Model
# Two stacked 64-unit LSTM layers followed by a 6-way softmax output. The
# first LSTM returns its full sequence so the second one can consume it.
n_time_steps, n_features = X_train.shape[1], X_train.shape[2]
model = Sequential([
    LSTM(64, return_sequences=True, input_shape=(n_time_steps, n_features)),
    LSTM(64),
    Dense(6, activation='softmax'),
])

# Compile the model
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [16]:
%%time
# Step 3: Training the Model
# 10 epochs, with validation_split=0.2 applied to the training arrays.
model.fit(
    X_train,
    y_train,
    epochs=10,  # original epoch=10
    validation_split=0.2,
)

# Step 4: Making Predictions
# Class probabilities for every held-out sample.
predictions = model.predict(X_test)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 2min 6s, sys: 7.45 s, total: 2min 14s
Wall time: 2min 10s


In [17]:
# Per-class maximum and minimum predicted probability across the test samples;
# a narrow gap means the model outputs almost the same vector for every sample.
(predictions.max(axis=0), predictions.min(axis=0))

(array([0.25198907, 0.14489606, 0.03383675, 0.04627585, 0.10941901,
        0.41447362], dtype=float32),
 array([0.2515143 , 0.14473267, 0.03370412, 0.0461162 , 0.10929359,
        0.41371346], dtype=float32))

In [18]:
# Display the predicted class probabilities for the test samples.
predictions

array([[0.25192586, 0.14486282, 0.03380812, 0.0462142 , 0.10937374,
        0.41381517],
       [0.25188294, 0.14484444, 0.03378657, 0.04618847, 0.10933685,
        0.41396067],
       [0.25189573, 0.14489606, 0.03380404, 0.04623157, 0.10937873,
        0.41379395],
       [0.2515143 , 0.14473267, 0.03378981, 0.04625406, 0.10941901,
        0.4142902 ],
       [0.2516178 , 0.14479472, 0.03370412, 0.0461162 , 0.10929359,
        0.41447362],
       [0.25177652, 0.14488521, 0.03383675, 0.04627585, 0.10940071,
        0.4138249 ],
       [0.25186563, 0.1448487 , 0.03380152, 0.0462072 , 0.10936482,
        0.41391212],
       [0.25190297, 0.14489146, 0.03382515, 0.04624064, 0.10939583,
        0.41374397],
       [0.25193143, 0.14488941, 0.03382585, 0.04624074, 0.10939914,
        0.41371346],
       [0.25188902, 0.1448654 , 0.0338122 , 0.04620683, 0.10939348,
        0.4138331 ],
       [0.2518895 , 0.14482866, 0.03378292, 0.04618658, 0.1093457 ,
        0.41396663],
       [0.25191474, 0

In [19]:
# Display the one-hot test labels for comparison with the predictions above.
y_test

array([[0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 1.]])