In [1]:
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import LSTM, Dense
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import os

In [2]:
# Step 1: Data Preprocessing
# The project root can be overridden with the EEG_PROJECT_PATH environment
# variable so the notebook is not tied to one machine; the original location
# stays the default when the variable is unset.
project_path = os.environ.get('EEG_PROJECT_PATH', '/Users/sohamdas/Desktop/EECS 545/Project')
data_path = os.path.join(project_path, 'data')
train_eeg_path = os.path.join(data_path, 'train_eegs')
# os.listdir returns entries in arbitrary order; sorting makes slices of this
# list (e.g. "the first 20 files") identical on every run.
eeg_file_list = sorted(os.listdir(train_eeg_path))
print('Total no. of train EEG files',len(eeg_file_list))

Total no. of train EEG files 17300


In [3]:
# for the whole dataset
#record_times = []
#for i in range(len(eeg_file_list)):
#    filename = eeg_file_list[i]
#    df = pd.read_parquet(train_eeg_path + '/' + filename)
#    record_times.append(df.shape[0])
#print(min(record_times), max(record_times))

In [4]:
# Work on a small subset: the first 20 entries of the file listing.
short_file_list = eeg_file_list[:20]

# Collect the row count of every sampled recording.
record_times = []
for filename in short_file_list:
    df = pd.read_parquet(train_eeg_path + '/' + filename)
    record_times.append(len(df.index))

# Shortest recording in the subset; later cells truncate every file to it.
min_time = min(record_times)
# Column count, taken from the last file read in the loop above.
n_graphs = df.shape[1]
n_samples = len(short_file_list)
n_samples

20

In [5]:
# pre-processing
# Build X_train with shape (n_samples, min_time, n_graphs): the first min_time
# rows of each sampled recording, stacked along a new leading axis.
eeg_windows = []
for filename in short_file_list[:n_samples]:
    df = pd.read_parquet(train_eeg_path + '/' + filename)
    eeg_windows.append(df[:min_time].to_numpy())

# Stack once instead of np.append inside the loop (which re-copies the whole
# array on every iteration). The per-file MinMaxScaler fit was dropped because
# its result was never used.
if eeg_windows:
    X_train = np.stack(eeg_windows).astype(float)
else:
    X_train = np.empty((0, min_time, n_graphs), float)
X_train.shape

(20, 10000, 20)

In [6]:
# pre-processing
# Load the label table (train.csv, which holds the per-class vote columns)
# and preview its first rows.
train_csv_path = data_path + '/' + 'train.csv'
train_data = pd.read_csv(train_csv_path)
train_data.head()

Unnamed: 0,eeg_id,eeg_sub_id,eeg_label_offset_seconds,spectrogram_id,spectrogram_sub_id,spectrogram_label_offset_seconds,label_id,patient_id,expert_consensus,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote
0,1628180742,0,0.0,353733,0,0.0,127492639,42516,Seizure,3,0,0,0,0,0
1,1628180742,1,6.0,353733,1,6.0,3887563113,42516,Seizure,3,0,0,0,0,0
2,1628180742,2,8.0,353733,2,8.0,1142670488,42516,Seizure,3,0,0,0,0,0
3,1628180742,3,18.0,353733,3,18.0,2718991173,42516,Seizure,3,0,0,0,0,0
4,1628180742,4,24.0,353733,4,24.0,3080632009,42516,Seizure,3,0,0,0,0,0


In [7]:
# Count how many EEG ids from the label table have a parquet file on disk.
unique_eeg_id = np.unique(train_data['eeg_id'])
# A set gives O(1) membership checks; scanning the file list for every id
# costs O(len(ids) * len(files)).
available_files = set(eeg_file_list)
# NOTE(review): despite its name, this list holds positions into
# unique_eeg_id (as in the original code), not the ids themselves.
common_eeg_id = [
    position
    for position, eeg_id in enumerate(unique_eeg_id)
    if str(eeg_id) + '.parquet' in available_files
]
print(len(common_eeg_id))
print(len(eeg_file_list))

17089
17300


In [8]:
# Distinct EEG ids in the label table (np.unique returns them sorted).
unique_eeg_id = np.unique(train_data['eeg_id'].to_numpy())
# Names of the per-class vote columns; their order defines the label layout.
vote_vector = ['seizure_vote','lpd_vote','gpd_vote','lrda_vote','grda_vote','other_vote']
# Columns of interest: the id plus every vote column. Deriving the list from
# vote_vector avoids the misplaced quote that used to fuse 'seizure_vote' and
# 'lpd_vote' into the single string 'seizure_vote,lpd_vote'.
var_of_interest = ['eeg_id'] + vote_vector
n_disease = len(vote_vector)

In [9]:
# Build the label matrix Y_train: one row per EEG id, holding the share of
# votes given to each of the n_disease classes (each row sums to 1).
n_samples = 20  #len(unique_eeg_id)
available_files = set(eeg_file_list)  # O(1) membership instead of a list scan
vote_rows = []
for ID in unique_eeg_id[:n_samples]:
    # Ids without a parquet file on disk are skipped.
    # NOTE(review): a skipped id leaves Y_train with fewer than n_samples rows;
    # confirm the feature matrix is built from the same set of ids.
    if str(ID) + '.parquet' not in available_files:
        continue
    # An eeg_id can occur on several rows of train.csv; only the first is used.
    row_loc = train_data.index[train_data['eeg_id'] == ID][0]
    vote_count = train_data.loc[row_loc, vote_vector].astype(float)
    if vote_count.isna().any():
        raise ValueError("NaN value encountered")
    total_votes = vote_count.sum()
    if total_votes == 0:
        # Dividing by zero would silently put NaNs into the labels.
        raise ValueError("No votes recorded for eeg_id " + str(ID))
    vote_rows.append((vote_count / total_votes).to_numpy())
# Build the array once; the reshape keeps the (0, n_disease) shape when no id matched.
Y_train = np.array(vote_rows, dtype=float).reshape(-1, n_disease)
#print(Y_train)

In [10]:
# Build the feature tensor X with shape (n_samples, min_time, n_graphs) from
# the recordings named after the first n_samples ids, then min-max scale it.
eeg_windows = []
for eeg_id in unique_eeg_id[:n_samples]:
    filename = str(eeg_id) + '.parquet'
    df = pd.read_parquet(train_eeg_path + '/' + filename)
    df_balanced = df[:min_time].to_numpy()
    if np.isnan(df_balanced.astype(float)).any():
        raise ValueError("NaN value encountered")
    # The original loop validated each window but never stored it, so X
    # stayed empty and the scaler below had nothing to fit.
    eeg_windows.append(df_balanced)
if eeg_windows:
    X = np.stack(eeg_windows).astype(float)
else:
    X = np.empty((0, min_time, n_graphs), float)

# Flatten to 2D (rows = all time steps of all samples, columns = features).
# The feature count comes from X itself rather than the leaked loop variable df.
X_reshaped = X.reshape(-1, X.shape[-1])

# Initialize and apply MinMaxScaler for reshaped dataset
# NOTE(review): the scaler is fit on every sample before the train/test split
# further down, so held-out data influences the scaling; confirm this is acceptable.
scaler = MinMaxScaler(feature_range=(0, 1))
X_scaled = scaler.fit_transform(X_reshaped)

# Reshape back to 3D (n_samples, n_time_steps, n_features)
X_scaled_3d = X_scaled.reshape(X.shape)
X = X_scaled_3d

ValueError: NaN value encountered

In [None]:
# No cell in this notebook assigns `Y`; the label matrix is built under the
# name Y_train, so bind it here instead of relying on leftover kernel state.
Y = Y_train
print(X.shape)
print(Y.shape)
# Cast both arrays to float32 before they are handed to the model.
X = X.astype('float32')
Y = Y.astype('float32')

In [None]:
# Splitting the data
# A fixed random_state makes the shuffle, and therefore every downstream
# result, reproducible across kernel restarts.
# Note: this rebinds X_train to the training portion of X.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [None]:
# Display the shape of the held-out split returned by train_test_split.
X_test.shape

In [None]:
# Step 2: Building the LSTM Model
# Two stacked 50-unit LSTM layers (the first returns the full sequence so the
# second can consume it), followed by a 6-way softmax output layer.
n_time_steps, n_features = X_train.shape[1], X_train.shape[2]
model = Sequential([
    LSTM(50, return_sequences=True, input_shape=(n_time_steps, n_features)),
    LSTM(50),
    Dense(6, activation='softmax'),
])

# Compile the model
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Step 3: Training the Model
# Two epochs, with 20% of the training split held back for validation.
model.fit(x=X_train, y=y_train, validation_split=0.2, epochs=2)

# Step 4: Making Predictions
# Model outputs for the held-out samples.
predictions = model.predict(x=X_test)

In [None]:
# Display the model outputs computed by model.predict on X_test.
predictions

In [None]:
# Display the held-out targets, for comparison with the predictions.
y_test

In [None]:
#print(model.predict(X_train))
#print(y_train)

In [None]:

# Accumulators for this cell: scaled recordings, expert votes (assuming this
# is how the votes are stored), and per-class probabilities.
# Three separate empty lists, created in one statement.
all_data, target_votes, target_probabilities = [], [], []

# Function to load data and convert votes to probabilities
def load_data_and_votes(file_path):
    """Load one parquet file and split it into EEG data and vote probabilities.

    The file is assumed to hold the vote columns 'Disease1' ... 'Disease6'
    alongside the EEG signal columns (placeholder schema; adjust to the real
    data layout).

    Parameters
    ----------
    file_path : str
        Path of the parquet file to read.

    Returns
    -------
    processed_eeg_data : numpy.ndarray
        Every non-vote column of the file, as a 2D array. No normalization or
        truncation/padding is applied yet.
    probabilities : pandas.Series
        Per-disease vote totals divided by the overall vote total, indexed by
        vote column name.

    Raises
    ------
    KeyError
        If any of the vote columns is missing from the file.
    ValueError
        If the file contains no votes at all (the probabilities would be
        undefined).
    """
    df = pd.read_parquet(file_path)

    vote_columns = ['Disease1', 'Disease2', 'Disease3', 'Disease4', 'Disease5', 'Disease6']

    # Convert votes to probabilities
    votes = df[vote_columns].sum(axis=0)  # Sum votes for each disease
    total_votes = votes.sum()  # Total votes across all diseases
    if total_votes == 0:
        # Guard against a silent division by zero that would yield NaNs.
        raise ValueError("No votes recorded in " + str(file_path))
    probabilities = votes / total_votes  # Convert to probabilities

    # The original returned `processed_eeg_data` without ever assigning it
    # (NameError). Until real preprocessing is added, return the raw signal:
    # everything that is not a vote column.
    processed_eeg_data = df.drop(columns=vote_columns).to_numpy()

    return processed_eeg_data, probabilities

# Assuming a file path
# NOTE(review): this is a placeholder path; it has to point at a real parquet
# file before the call below can succeed.
file_path = '/path/to/your/data_eeg/patient_file.parquet'
# Rebinds target_probabilities (initialised as an empty list above) to the
# second value returned by load_data_and_votes.
processed_eeg_data, target_probabilities = load_data_and_votes(file_path)

# Continue with data splitting, model definition, training, and prediction as described earlier

# NOTE(review): neither `file_list` nor `data_folder` is assigned anywhere in
# the visible notebook; on a fresh kernel this loop raises NameError. Confirm
# which listing and directory were intended.
for file in file_list:
    # Load data
    df = pd.read_parquet(os.path.join(data_folder, file))
    # Preprocess (truncate/pad, normalize)
    # Assuming df has columns for data and maybe a separate file or column for votes
    # Normalize data
    # A fresh scaler is fit per file, so each recording is scaled to [0, 1]
    # using its own per-column minimum and maximum.
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaled_data = scaler.fit_transform(df.values) # Assuming df.values is the EEG data
    all_data.append(scaled_data)
    # Load votes and convert to probabilities
    # This part needs more information about how votes are stored

# Uniform time series length
# Makes all data sequences have the same length along their first axis
def uniform_length(data, sequence_length=1000):
    """Zero-pad or truncate every sequence to exactly ``sequence_length`` steps.

    Parameters
    ----------
    data : iterable of array-like
        Sequences whose first axis is the one to equalise. Any further axes
        (e.g. feature columns) are left untouched.
    sequence_length : int, default 1000
        Target length of the first axis.

    Returns
    -------
    numpy.ndarray
        Array with one entry per input sequence, each of length
        ``sequence_length`` along its first axis.
    """
    uniform = []
    for d in data:
        arr = np.asarray(d)
        missing = max(0, sequence_length - len(arr))
        # Pad only at the end of the first axis. Passing a bare (0, missing)
        # to np.pad applies it to every axis, which added spurious zero
        # columns to 2D recordings.
        pad_width = [(0, missing)] + [(0, 0)] * (arr.ndim - 1)
        padded = np.pad(arr, pad_width, 'constant', constant_values=0)
        uniform.append(padded[:sequence_length])
    return np.array(uniform)

# Bring every loaded recording to the default common length.
all_data_uniform = uniform_length(all_data)

# Splitting the data
# NOTE(review): target_votes is initialised as an empty list at the top of
# this cell and nothing visible appends to it; confirm it is populated before
# this split.
X_train, X_test, y_train, y_test = train_test_split(
    all_data_uniform,
    target_votes,
    test_size=0.2,
)

# Step 2: Building the LSTM Model
# Two stacked 50-unit LSTM layers followed by a 6-way softmax output layer.
n_time_steps, n_features = X_train.shape[1], X_train.shape[2]
model = Sequential([
    LSTM(50, return_sequences=True, input_shape=(n_time_steps, n_features)),
    LSTM(50),
    Dense(6, activation='softmax'),
])

# Compile the model
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Step 3: Training the Model
# Ten epochs, with 10% of the training split held back for validation.
model.fit(x=X_train, y=y_train, validation_split=0.1, epochs=10)

# Step 4: Making Predictions
predictions = model.predict(x=X_test)

# Further developments are discussed in the text above.


In [None]:
# Rebuild X_train with shape (n_samples, min_time, n_graphs) from the
# recordings named after the first n_samples ids, rejecting any that contain NaN.
# Note: this overwrites whatever X_train currently holds.
eeg_windows = []
for eeg_id in unique_eeg_id[:n_samples]:
    filename = str(eeg_id) + '.parquet'
    df = pd.read_parquet(train_eeg_path + '/' + filename)
    df_balanced = df[:min_time].to_numpy()
    if np.isnan(df_balanced.astype(float)).any():
        raise ValueError("NaN value encountered")
    eeg_windows.append(df_balanced)

# Stack once instead of np.append inside the loop (which re-copies the whole
# array on every iteration). The per-file MinMaxScaler fit was dropped because
# its result was never used.
if eeg_windows:
    X_train = np.stack(eeg_windows).astype(float)
else:
    X_train = np.empty((0, min_time, n_graphs), float)

# Per-sample NaN count.
# NOTE(review): the loop above raises on the first NaN, so every count is 0
# whenever this line is reached.
np.sum(np.isnan(X_train),axis=(1,2))