# Training and Model Saving Notebook

## 1. Import necessary libraries

In [6]:
#Set up the notebook environment
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
import scipy
from scipy.stats import pearsonr
from scipy import signal as sig

## 2. Read Data

In [7]:
raw = scipy.io.loadmat('./datasets/raw_training_data.mat')
data_glove_1 = raw['train_dg'][0][0]
data_glove_1 = np.delete(data_glove_1, 3, 1)
data_glove_2 = raw['train_dg'][1][0]
data_glove_2 = np.delete(data_glove_2, 3, 1)
data_glove_3 = raw['train_dg'][2][0]
data_glove_3 = np.delete(data_glove_3, 3, 1)

ecog_1 = raw['train_ecog'][0][0]
ecog_2 = raw['train_ecog'][1][0]
ecog_3 = raw['train_ecog'][2][0]

labels_1 = np.argmax(data_glove_1, axis=1)
labels_2 = np.argmax(data_glove_2, axis=1)
labels_3 = np.argmax(data_glove_3, axis=1)




## 2. Filter data

In [8]:
def filter_data(raw_eeg, fs=1000):
    """
    Write a filter function to clean underlying data.
    Filter type and parameters are up to you. Points will be awarded for reasonable filter type, parameters and application.
    Please note there are many acceptable answers, but make sure you aren't throwing out crucial data or adversly
    distorting the underlying data!

    Input: 
        raw_eeg (samples x channels): the raw signal
        fs: the sampling rate (1000 for this dataset)
    Output: 
        clean_data (samples x channels): the filtered signal
    """
    dim = 100
    # b = sig.firwin(numtaps=dim + 1, cutoff=[0.15, 200], pass_zero='bandpass', fs=fs)
    b, a = sig.butter(N=2, Wn=[0.15, 200], btype='bandpass', fs=fs, output='ba')
    filtered_eeg = sig.filtfilt(b, a, x=raw_eeg, axis=0)
    
    return filtered_eeg

## 3. Train test split and feature engineering

In [9]:
train_test_ratio = 0.7

ecog_1_train = ecog_1[:int(train_test_ratio * ecog_1.shape[0])]
ecog_1_test = ecog_1[int(train_test_ratio * ecog_1.shape[0]):]
data_glove_1_train = data_glove_1[:int(train_test_ratio * data_glove_1.shape[0])]
data_glove_1_test = data_glove_1[int(train_test_ratio * data_glove_1.shape[0]):]

ecog_2_train = ecog_2[:int(train_test_ratio * ecog_2.shape[0])]
ecog_2_test = ecog_2[int(train_test_ratio * ecog_2.shape[0]):]
data_glove_2_train = data_glove_2[:int(train_test_ratio * data_glove_2.shape[0])]
data_glove_2_test = data_glove_2[int(train_test_ratio * data_glove_2.shape[0]):]

ecog_3_train = ecog_3[:int(train_test_ratio * ecog_3.shape[0])]
ecog_3_test = ecog_3[int(train_test_ratio * ecog_3.shape[0]):]
data_glove_3_train = data_glove_3[:int(train_test_ratio * data_glove_3.shape[0])]
data_glove_3_test = data_glove_3[int(train_test_ratio * data_glove_3.shape[0]):]


print(f"""
Before splitting: {ecog_1.shape[0]} samples
After splitting: {ecog_1_train.shape[0]} training samples and {ecog_1_test.shape[0]} testing samples
""")


Before splitting: 300000 samples
After splitting: 210000 training samples and 90000 testing samples



In [10]:
def NumWins(x, fs, winLen, winDisp):
    return int(1 + (x.shape[0] - winLen * fs) / (winDisp * fs))

winLen = 200 / 1e3
winOverlap = 40 / 1e3
winDisp = winLen - winOverlap
NumWins(ecog_1, 1000, winLen, winDisp)

1874

In [99]:
def LineLength(x):
    return np.abs(np.diff(x, axis=0)).sum(axis=0)

def Area(x):
    return np.abs(x).sum(axis=0)

def Energy(x):
    return (x ** 2).sum(axis=0)

def ZeroCrossingMean(x):
    return ((x < x.mean(axis=0))[1:] & (x[:-1] > x.mean(axis=0)) | (x > x.mean(axis=0))[1:] & (x[:-1] < x.mean(axis=0))).sum(axis=0)

def numSpikes(x):
    #TODO: implement
    sig.find_peaks(x, height=0, distance=100)
    pass

def averageTimeDomain(x):
    #TODO: implement
    return np.mean(x, axis=0)

def bandpower(x, fs, fmin, fmax):
    fs = 1000
    # win = 4 * sf
    freqs, psd = sig.welch(x, fs, axis=0, nperseg=x.shape[0])
    
    # Define delta lower and upper limits
    # fmin, fmax = 0.5, 4

    # Find intersecting values in frequency vector
    idx_delta = np.logical_and(freqs >= fmin, freqs <= fmax)
    
    from scipy.integrate import simps

    # Frequency resolution
    freq_res = freqs[1] - freqs[0]  # = 1 / 4 = 0.25

    # Compute the absolute power by approximating the area under the curve
    delta_power = simps(psd[idx_delta], dx=freq_res, axis=0)
    
    return delta_power

def spectral_entropy(x, fs=1000):
    # Calculate the power spectrum
    f, Pxx = sig.welch(x, fs=fs)
    # Normalize the power spectrum
    Pxx_norm = Pxx / Pxx.sum()
    # Calculate the spectral entropy
    se = -1 * (Pxx_norm * np.log2(Pxx_norm)).sum()
    return se

def hjorth_complexity(x):
    dx = np.diff(x)
    d2x = np.diff(dx)
    var_x = np.var(x)
    var_dx = np.var(dx)
    var_d2x = np.var(d2x)
    activity = var_x
    mobility = np.sqrt(var_d2x / var_dx)
    # Calculate Hjorth complexity
    complexity = mobility / activity
    return complexity
    
# Kurtosis = @(x) ((1/size(x,1))*sum((x - mean(x)).^4))./(((1/size(x,1))*sum((x - mean(x)).^2)).^2);
def Kurtosis(x):
    return ((1/x.shape[0])*np.sum((x - np.mean(x))**4))/(((1/x.shape[0])*np.sum((x - np.mean(x))**2))**2)

def Covariance(x):
    convar = np.cov(x, rowvar=False)
    feat = []
    for i in range(convar.shape[0]):
        feat += [convar[i, :i+1]]
    return np.concatenate(feat)

def get_features(filtered_window, fs=1000):
    """
        Write a function that calculates features for a given filtered window. 
        Feel free to use features you have seen before in this class, features that
        have been used in the literature, or design your own!

        Input: 
        filtered_window (window_samples x channels): the window of the filtered ecog signal 
        fs: sampling rate
        Output:
        features (channels x num_features): the features calculated on each channel for the window
    """
    feat_LL = LineLength(filtered_window)
    feat_Area = Area(filtered_window)
    feat_Energy = Energy(filtered_window)
    feat_ZCM = ZeroCrossingMean(filtered_window)
    feat_TimeAvg = averageTimeDomain(filtered_window)
#     feat_SpectralEntropy = spectral_entropy(filtered_window)
    feat_Hijorth = hjorth_complexity(filtered_window)
    feat_kurtosis = Kurtosis(filtered_window)
    feat_covariance = Covariance(filtered_window)
    # feat_FreqAvg = averageFreqDomain(filtered_window)
    
    from pyriemann.estimation import Covariances
    from pyriemann.tangentspace import TangentSpace
    
    # covar = Covariances().fit_transform(np.expand_dims(filtered_window.T, 0))
    # # covest = Covariances('oas')
    # # temp = np.expand_dims(filtered_window, axis=-1)
    # # covar = covest.fit_transform(temp)
    # ts = TangentSpace()
    # tsfeat = ts.fit_transform(covar)
    # # print(tsfeat.shape)

    # raise notImplementedError()
    return np.hstack([#feat_LL, 
                      #feat_Area, 
                      feat_covariance,
                      feat_Energy, 
                      #feat_ZCM, 
                      feat_TimeAvg, 
#                       feat_SpectralEntropy,
                      feat_Hijorth,
                      feat_kurtosis,
                    #   feat_covariance,
                      bandpower(filtered_window, 1000, 5, 15),
                      bandpower(filtered_window, 1000, 20, 25),
                      bandpower(filtered_window, 1000, 75, 115),
                      bandpower(filtered_window, 1000, 125, 160),
                      bandpower(filtered_window, 1000, 160, 175)])

In [100]:
def get_windowed_feats(raw_ecog, fs, window_length, window_overlap):
    """
        Write a function which processes data through the steps of filtering and
        feature calculation and returns features. Points will be awarded for completing
        each step appropriately (note that if one of the functions you call within this script
        returns a bad output, you won't be double penalized). Note that you will need
        to run the filter_data and get_features functions within this function. 

        Inputs:
        raw_eeg (samples x channels): the raw signal
        fs: the sampling rate (1000 for this dataset)
        window_length: the window's length
        window_overlap: the window's overlap
        Output: 
        all_feats (num_windows x (channels x features)): the features for each channel for each time window
            note that this is a 2D array. 
    """
    raw_ecog = filter_data(raw_ecog, fs)
    
    window_disp = window_length - window_overlap
    
    all_feats = np.vstack([get_features(raw_ecog[int(i * window_disp * fs):int(i * window_disp * fs + window_length * fs), :], fs) for i in range(NumWins(raw_ecog, fs, window_length, window_disp))])
    
    return all_feats

In [101]:
def create_R_matrix(features, N_wind):
    """ 
    Write a function to calculate the R matrix

    Input:
        features (samples (number of windows in the signal) x channels x features): 
        the features you calculated using get_windowed_feats
        N_wind: number of windows to use in the R matrix

    Output:
        R (samples x (N_wind*channels*features))
    """
    num_win = features.shape[0]
    num_channel_features = features.shape[1]
    
    # Append a copy of the first N-1 rows to the beginning of features
    features = np.vstack((features[:N_wind-1], features))
    
    R = np.zeros((num_win, N_wind * num_channel_features))
    
    for i in range(num_win):
        # Get the feature matrix for the current window
        # Resize the feature matrix and store in R
        R[i,:] = features[i:i+N_wind,:].reshape(-1)

    R = np.hstack((np.ones((R.shape[0], 1)), R))

    return R
    

In [104]:
winLen = 100 / 1e3
winOverlap = 50 / 1e3
winDisp = winLen - winOverlap

# Compute the R matrix for the training data
feature_train = get_windowed_feats(ecog_1_train, 1000, winLen, winOverlap)
R_train = create_R_matrix(feature_train, 5)

feature_test = get_windowed_feats(ecog_1_test, 1000, winLen, winOverlap)
R_test = create_R_matrix(feature_test, 5)

# Downsample the glove data
Y_train = data_glove_1_train
Y_test = data_glove_1_test
Y_train = sig.resample(Y_train, R_train.shape[0], axis=0)
Y_test = sig.resample(Y_test, R_test.shape[0], axis=0)

In [105]:
corre = np.corrcoef(feature_train, Y_train, rowvar=False)

In [106]:
np.nan_to_num(corre, copy=False, nan=0)

array([[ 1.        ,  0.25493011,  0.04593392, ..., -0.00276058,
        -0.02351143, -0.08008796],
       [ 0.25493011,  1.        ,  0.07340108, ..., -0.03964397,
        -0.02167498, -0.02115068],
       [ 0.04593392,  0.07340108,  1.        , ..., -0.02058149,
        -0.01985489,  0.02815554],
       ...,
       [-0.00276058, -0.03964397, -0.02058149, ...,  1.        ,
         0.09794208,  0.08317813],
       [-0.02351143, -0.02167498, -0.01985489, ...,  0.09794208,
         1.        , -0.12152057],
       [-0.08008796, -0.02115068,  0.02815554, ...,  0.08317813,
        -0.12152057,  1.        ]])

In [107]:
idx = np.unravel_index(np.argsort(corre[:-4, -4:].ravel())[-800:], corre[:-4, -4:].shape)

In [108]:
idx_ = idx[0][:-4]

In [111]:
idx_ = np.unique(idx_)
idx_

array([   2,    7,    9,   11,   14,   15,   16,   18,   20,   22,   24,
         26,   27,   28,   35,   37,   39,   41,   42,   44,   46,   50,
         54,   56,   60,   63,   64,   74,   81,   83,   85,   88,   90,
         92,   94,   95,   96,   97,   99,  100,  103,  104,  110,  118,
        119,  121,  122,  123,  126,  127,  131,  132,  133,  136,  137,
        143,  145,  148,  149,  152,  154,  158,  161,  165,  169,  170,
        172,  175,  177,  178,  179,  182,  184,  186,  187,  188,  189,
        193,  196,  202,  203,  207,  211,  213,  214,  216,  217,  220,
        222,  223,  225,  227,  228,  229,  230,  236,  241,  252,  301,
        304,  305,  306,  309,  311,  312,  313,  315,  316,  317,  324,
        327,  329,  330,  334,  337,  338,  341,  346,  350,  352,  355,
        356,  357,  360,  361,  363,  364,  365,  370,  371,  372,  375,
        376,  377,  378,  379,  391,  394,  396,  397,  412,  413,  416,
        419,  421,  422,  433,  434,  437,  440,  4

In [112]:
feature_train_ = feature_train[:, idx_]
feature_test_ = feature_test[:, idx_]

In [113]:
R_train = create_R_matrix(feature_train, 5)
R_test = create_R_matrix(feature_test, 5)

In [76]:
# Compute the weights
# f_train = np.linalg.pinv(R_train.T @ R_train) @ (R_train.T @ Y_train)

# prediction_LR = (R_test @ f_train)

In [115]:
import xgboost as xgb
from xgboost import XGBRegressor
# read data

# create model instance
xgb_reg = XGBRegressor(n_estimators=200, max_depth=5, learning_rate=0.01)
# fit model
xgb_reg.fit(R_train, Y_train)
# make predictions
prediction_XGB = xgb_reg.predict(R_test)

KeyboardInterrupt: 

In [None]:
def correlation(prediction, target):
    corr = [pearsonr(prediction[:,i], target[:,i]).statistic for i in range(4)]
    return corr, np.mean(corr)

In [None]:
print('For Subject 1')
# print(f'For linear regression: {correlation(prediction_LR, Y_test)}')
print(f'For XGBoost: {correlation(prediction_XGB, Y_test)}')

For Subject 1
For XGBoost: ([0.28746230332385686, 0.358173949666523, 0.031368871377739554, 0.07395861025586921], 0.18774093365599717)


In [None]:
# Compute the R matrix for the training data
feature_train = get_windowed_feats(ecog_2_train, 1000, winLen, winOverlap)
R_train = create_R_matrix(feature_train, 3)

feature_test = get_windowed_feats(ecog_2_test, 1000, winLen, winOverlap)
R_test = create_R_matrix(feature_test, 3)

# Downsample the glove data
Y_train = data_glove_2_train
Y_test = data_glove_2_test
Y_train = sig.resample(Y_train, R_train.shape[0], axis=0)
Y_test = sig.resample(Y_test, R_test.shape[0], axis=0)

# Compute the weights
f_train = np.linalg.pinv(R_train.T @ R_train) @ (R_train.T @ Y_train)
prediction_LR = (R_test @ f_train)

from sklearn.ensemble import RandomForestRegressor
rf_reg = RandomForestRegressor(n_estimators=200, max_depth=5, max_leaf_nodes=200)
rf_reg.fit(R_train, Y_train)
prediction_RF = rf_reg.predict(R_test)

# create model instance
xgb_reg = XGBRegressor(n_estimators=100, max_depth=5, learning_rate=0.01)
# fit model
xgb_reg.fit(R_train, Y_train)
# make predictions
prediction_XGB = xgb_reg.predict(R_test)


print('For Subject 2')
print(f'For linear regression: {[pearsonr(prediction_LR[:,i], Y_test[:,i]).statistic for i in range(4)]}')
print(f'For random forest: {[pearsonr(prediction_RF[:,i], Y_test[:,i]).statistic for i in range(4)]}')
print(f'For XGBoost: {[pearsonr(prediction_XGB[:,i], Y_test[:,i]).statistic for i in range(4)]}')

In [18]:
# Compute the R matrix for the training data
feature_train = get_windowed_feats(ecog_3_train, 1000, winLen, winOverlap)
R_train = create_R_matrix(feature_train, 5)

feature_test = get_windowed_feats(ecog_3_test, 1000, winLen, winOverlap)
R_test = create_R_matrix(feature_test, 5)

# Downsample the glove data
Y_train = data_glove_3_train
Y_test = data_glove_3_test
Y_train = sig.resample(Y_train, R_train.shape[0], axis=0)
Y_test = sig.resample(Y_test, R_test.shape[0], axis=0)

# Compute the weights
f_train = np.linalg.pinv(R_train.T @ R_train) @ (R_train.T @ Y_train)
prediction_LR = (R_test @ f_train)

from sklearn.ensemble import RandomForestRegressor
rf_reg = RandomForestRegressor(n_estimators=100, max_depth=5, max_leaf_nodes=200)
rf_reg.fit(R_train, Y_train)
prediction_RF = rf_reg.predict(R_test)

# create model instance
xgb_reg = XGBRegressor(n_estimators=100, max_depth=5, learning_rate=0.01)
# fit model
xgb_reg.fit(R_train, Y_train)
# make predictions
prediction_XGB = xgb_reg.predict(R_test)

print('For Subject 3')
print(f'For linear regression: {[pearsonr(prediction_LR[:,i], Y_test[:,i]).statistic for i in range(4)]}')
print(f'For random forest: {[pearsonr(prediction_RF[:,i], Y_test[:,i]).statistic for i in range(4)]}')
print(f'For XGBoost: {[pearsonr(prediction_XGB[:,i], Y_test[:,i]).statistic for i in range(4)]}')

For Subject 3
For linear regression: [0.515557634501014, 0.2753656952491606, 0.3861286395818362, 0.39087312864622037]
For random forest: [0.7243048760525532, 0.5249419449942999, 0.6476626372046566, 0.42160591268365893]
For XGBoost: [0.7026111184226042, 0.4022655712154394, 0.6120286981874062, 0.4094253084822335]


In [31]:
# Compute the R matrix for the training data
feature_train = get_windowed_feats(ecog_1, 1000, winLen, winOverlap)
R_train = create_R_matrix(feature_train, 5)

# Downsample the glove data
Y_train = data_glove_1
Y_train = sig.resample(Y_train, R_train.shape[0], axis=0)

# Compute the weights
f_train = np.linalg.pinv(R_train.T @ R_train) @ (R_train.T @ Y_train)

# from sklearn.ensemble import RandomForestRegressor
# rf_reg = RandomForestRegressor(n_estimators=100, max_depth=5, max_leaf_nodes=200)
# rf_reg.fit(R_train, Y_train)


# np.save('./models/LR_Matrix_S1', f_train)
# pickle.dump(rf_reg, open('./models/RF_Matrix_S1.pth', 'wb'))

# create model instance
xgb_reg = XGBRegressor(n_estimators=200, max_depth=5, learning_rate=0.01)
# fit model
xgb_reg.fit(R_train, Y_train)
# make predictions
# prediction_XGB = xgb_reg.predict(R_test)


xgb_reg.save_model('./models/XGB_S1.json')

In [32]:
# Compute the R matrix for the training data
feature_train = get_windowed_feats(ecog_2, 1000, winLen, winOverlap)
R_train = create_R_matrix(feature_train, 5)

# Downsample the glove data
Y_train = data_glove_2
Y_train = sig.resample(Y_train, R_train.shape[0], axis=0)

# Compute the weights
f_train = np.linalg.pinv(R_train.T @ R_train) @ (R_train.T @ Y_train)

# from sklearn. ensemble import RandomForestRegressor
# rf_reg = RandomForestRegressor(n_estimators=100, max_depth=5, max_leaf_nodes=200)
# rf_reg.fit(R_train, Y_train)


# np.save('./models/LR_Matrix_S2', f_train)
# pickle.dump(rf_reg, open('./models/RF_Matrix_S2.pth', 'wb'))

# create model instance
xgb_reg = XGBRegressor(n_estimators=200, max_depth=5, learning_rate=0.01)
# fit model
xgb_reg.fit(R_train, Y_train)
# make predictions
# prediction_XGB = xgb_reg.predict(R_test)


xgb_reg.save_model('./models/XGB_S2.json')



In [33]:
# Compute the R matrix for the training data
feature_train = get_windowed_feats(ecog_3, 1000, winLen, winOverlap)
R_train = create_R_matrix(feature_train, 5)

# Downsample the glove data
Y_train = data_glove_3
Y_train = sig.resample(Y_train, R_train.shape[0], axis=0)

# Compute the weights
f_train = np.linalg.pinv(R_train.T @ R_train) @ (R_train.T @ Y_train)

# from sklearn.ensemble import RandomForestRegressor
# rf_reg = RandomForestRegressor(n_estimators=100, max_depth=5, max_leaf_nodes=200)
# rf_reg.fit(R_train, Y_train)


# np.save('./models/LR_Matrix_S3', f_train)
# pickle.dump(rf_reg, open('./models/RF_Matrix_S3.pth', 'wb'))

# create model instance
xgb_reg = XGBRegressor(n_estimators=200, max_depth=5, learning_rate=0.01)
# fit model
xgb_reg.fit(R_train, Y_train)
# make predictions
# prediction_XGB = xgb_reg.predict(R_test)


xgb_reg.save_model('./models/XGB_S3.json')



In [32]:
R_train.shape

(5999, 3521)