In [1]:
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import LSTM, Dense
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import os
import time

In [2]:
# Step 1: Data Preprocessing
project_path = '/Users/sohamdas/Desktop/EECS 545/Project'
data_path = project_path + '/data'
train_eeg_path = data_path + '/train_eegs'
eeg_file_list = os.listdir(train_eeg_path)
print('Total no. of train EEG files',len(eeg_file_list))

Total no. of train EEG files 17300


In [3]:
# for the whole dataset
#record_times = []
#for i in range(len(eeg_file_list)):
#    filename = eeg_file_list[i]
#    df = pd.read_parquet(train_eeg_path + '/' + filename)
#    record_times.append(df.shape[0])
#print(min(record_times), max(record_times))

In [4]:
# just a subset of 20 samples
short_file_list = eeg_file_list[:20]

record_times = []
for i in range(len(short_file_list)):
    filename = short_file_list[i]
    df = pd.read_parquet(train_eeg_path + '/' + filename)
    record_times.append(df.shape[0])

min_time = min(record_times)
n_graphs = df.shape[1]
n_samples = len(short_file_list)
n_samples

20

In [5]:
# pre-processing
# making the X_train in a particular shape
X_train = np.empty((0, min_time, n_graphs), int)
for i in range(n_samples):
    filename = short_file_list[i]
    df = pd.read_parquet(train_eeg_path + '/' + filename)
    df_balanced = df[:min_time].to_numpy()
    X_train = np.append(X_train, np.array([df_balanced]), axis=0)
    scaler = MinMaxScaler(feature_range=(0, 1))
    df_normalized = scaler.fit_transform(df)
X_train.shape

(20, 10000, 20)

In [6]:
# pre-processing
# making the Y_train in a particular shape
data = pd.read_csv(data_path + '/' + 'train.csv')
data.head()

Unnamed: 0,eeg_id,eeg_sub_id,eeg_label_offset_seconds,spectrogram_id,spectrogram_sub_id,spectrogram_label_offset_seconds,label_id,patient_id,expert_consensus,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote
0,1628180742,0,0.0,353733,0,0.0,127492639,42516,Seizure,3,0,0,0,0,0
1,1628180742,1,6.0,353733,1,6.0,3887563113,42516,Seizure,3,0,0,0,0,0
2,1628180742,2,8.0,353733,2,8.0,1142670488,42516,Seizure,3,0,0,0,0,0
3,1628180742,3,18.0,353733,3,18.0,2718991173,42516,Seizure,3,0,0,0,0,0
4,1628180742,4,24.0,353733,4,24.0,3080632009,42516,Seizure,3,0,0,0,0,0


In [7]:
unique_eeg_id = np.unique(data['eeg_id'])
common_eeg_id = []
for i in range(len(unique_eeg_id)):
    ID = unique_eeg_id[i]
    if str(ID) + '.parquet' in eeg_file_list:
        common_eeg_id.append(i)
print(len(common_eeg_id))
print(len(eeg_file_list))

17089
17300


In [8]:
unique_eeg_id = np.unique(data['eeg_id'])
var_of_interest = ['eeg_id','seizure_vote,''lpd_vote','gpd_vote','lrda_vote','grda_vote','other_vote']
vote_vector = ['seizure_vote','lpd_vote','gpd_vote','lrda_vote','grda_vote','other_vote']
n_disease = len(vote_vector)

In [40]:
n_samples = 50  #len(unique_eeg_id)

In [41]:
%%time
X = np.empty((0, min_time, n_graphs), float)
Y = np.empty((0, n_disease), int)
count = 0
i = 0
while count < n_samples:
    ID = unique_eeg_id[i]
    if str(ID) + '.parquet' in eeg_file_list:
        filename = str(ID) + '.parquet'
        df = pd.read_parquet(train_eeg_path + '/' + filename)
        df_cropped = df[:min_time].to_numpy()
        row_loc = data.index[data['eeg_id'] == ID][0]
        vote_count = data.loc[row_loc,vote_vector]
        vote_share = vote_count / sum(vote_count)
        if np.sum(np.isnan(df_cropped.astype(float)), axis=(0,1)) == 0 and np.sum(np.isnan(vote_count.astype(float)), axis=0) == 0:
            X = np.append(X, np.array([df_cropped]), axis=0)
            Y = np.append(Y, np.array([vote_share]),axis=0)
            count = count + 1
    i = i + 1

# Initialize and apply MinMaxScaler for reshaped dataset
X_reshaped = X.reshape(-1, df.shape[-1])

scaler = MinMaxScaler(feature_range=(0, 1))
X_scaled = scaler.fit_transform(X_reshaped)

# Reshape back to 3D (n_samples, n_time_steps, n_features)
X_scaled_3d = X_scaled.reshape(X.shape)
X = X_scaled_3d

CPU times: user 313 ms, sys: 348 ms, total: 661 ms
Wall time: 712 ms


In [42]:
print(X.shape)
print(Y.shape)
X = X.astype('float32')
Y = Y.astype('float32')
print(np.sum(np.isnan(X), axis=(0,1,2)))
print(np.sum(np.isnan(Y), axis=(0,1)))

(50, 10000, 20)
(50, 6)
0
0


In [43]:
np.max(X, axis = (0,1))

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1.], dtype=float32)

In [44]:
# Splitting the data
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

In [45]:
# Step 2: Building the LSTM Model
model = Sequential()
model.add(LSTM(64, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(LSTM(64))
model.add(Dense(6, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [46]:
%%time
# Step 3: Training the Model
model.fit(X_train, y_train, epochs=10, validation_split=0.2)  # original epoch=10

# Step 4: Making Predictions
predictions = model.predict(X_test)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 54.2 s, sys: 3.3 s, total: 57.5 s
Wall time: 55 s


In [47]:
(np.max(predictions, axis=0), np.min(predictions, axis=0))

(array([0.20860668, 0.0683241 , 0.05895022, 0.03700878, 0.20823939,
        0.4231102 ], dtype=float32),
 array([0.20790426, 0.06779182, 0.05840839, 0.03590621, 0.20687918,
        0.41907737], dtype=float32))

In [48]:
np.mean(y_train, axis=0)

array([0.17858973, 0.08638611, 0.06625   , 0.09086996, 0.18305944,
       0.39484474], dtype=float32)

In [49]:
model.weights

[<tf.Variable 'lstm_4/lstm_cell/kernel:0' shape=(20, 256) dtype=float32, numpy=
 array([[-0.07854074, -0.06218591,  0.0506952 , ...,  0.08428239,
         -0.11601783,  0.05399835],
        [-0.00113668, -0.14251389, -0.13603806, ...,  0.04259643,
          0.03461111,  0.04321364],
        [-0.10886644, -0.01317177, -0.1389679 , ..., -0.10029317,
         -0.05780034,  0.1522397 ],
        ...,
        [-0.01595629,  0.10611767,  0.01880218, ..., -0.04942065,
          0.030395  , -0.12319996],
        [-0.03267483, -0.09957429, -0.02752098, ..., -0.10108507,
         -0.02910259,  0.0346928 ],
        [-0.0947166 ,  0.03049864,  0.07776181, ...,  0.03731306,
          0.04280503,  0.07893627]], dtype=float32)>,
 <tf.Variable 'lstm_4/lstm_cell/recurrent_kernel:0' shape=(64, 256) dtype=float32, numpy=
 array([[-0.0087424 ,  0.09358511, -0.02565363, ..., -0.00478373,
         -0.08173485, -0.05154434],
        [ 0.07620607, -0.01868488,  0.03912338, ...,  0.08164537,
          0.0331189

In [None]:
Y = np.empty((0, n_disease), int)
for i in range(n_samples):
    ID = unique_eeg_id[i]
    if str(ID) + '.parquet' in eeg_file_list:
        row_loc = train_data.index[train_data['eeg_id'] == ID][0]
        vote_count = train_data.loc[row_loc,vote_vector]
        if np.sum(np.isnan(vote_count.astype(float)), axis=0) > 0:
            raise ValueError("NaN value encountered")
        vote_share = vote_count / sum(vote_count)
        Y = np.append(Y, np.array([vote_share]),axis=0)