# Training and Model Saving Notebook

## 1. Import necessary libraries

In [1]:
# Set up the notebook environment
# NOTE: the original import order is kept on purpose -- `from utils import *`
# can shadow any name imported before it, so moving it could change bindings.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
import scipy
import scipy.io  # explicit: `scipy.io.loadmat` is used below and `import scipy` alone may not load the submodule
from scipy.stats import pearsonr
from scipy import signal as sig
from utils import *
import xgboost as xgb
from xgboost import XGBRegressor

## 2. Read Data

In [2]:
# Load every recording used by this notebook: the original training set, the
# per-subject competition files, and the released test labels.

# Column removed from every glove-label array before training / scoring.
# NOTE(review): presumably the finger that is excluded from evaluation -- confirm.
DROPPED_DG_COLUMN = 3
# Only test samples from this index onward are kept as the validation split.
VALID_START = 49000

# --- Original training set: entry k-1 of each field is used for subject k ---
raw = scipy.io.loadmat('./datasets/raw_training_data.mat')
data_glove_1 = raw['train_dg'][0][0]
data_glove_2 = raw['train_dg'][1][0]
data_glove_3 = raw['train_dg'][2][0]
data_glove_1_train = np.delete(data_glove_1, DROPPED_DG_COLUMN, axis=1)
data_glove_2_train = np.delete(data_glove_2, DROPPED_DG_COLUMN, axis=1)
data_glove_3_train = np.delete(data_glove_3, DROPPED_DG_COLUMN, axis=1)

ecog_1_train = raw['train_ecog'][0][0]
ecog_2_train = raw['train_ecog'][1][0]
ecog_3_train = raw['train_ecog'][2][0]

# --- Competition files: full train arrays plus the tail of the test ECoG ---
raw = scipy.io.loadmat('./datasets/sub1_comp.mat')
ecog_1_comp, dg_1_comp = raw['train_data'], raw['train_dg']
ecog_1_valid = raw['test_data'][VALID_START:]

raw = scipy.io.loadmat('./datasets/sub2_comp.mat')
ecog_2_comp, dg_2_comp = raw['train_data'], raw['train_dg']
ecog_2_valid = raw['test_data'][VALID_START:]

raw = scipy.io.loadmat('./datasets/sub3_comp.mat')
ecog_3_comp, dg_3_comp = raw['train_data'], raw['train_dg']
ecog_3_valid = raw['test_data'][VALID_START:]

# --- Test labels: same tail as the validation ECoG, same column dropped ---
dg_1_raw = scipy.io.loadmat('./datasets/sub1_testlabels.mat')
dg_1_valid = np.delete(dg_1_raw['test_dg'][VALID_START:], DROPPED_DG_COLUMN, axis=1)

dg_2_raw = scipy.io.loadmat('./datasets/sub2_testlabels.mat')
dg_2_valid = np.delete(dg_2_raw['test_dg'][VALID_START:], DROPPED_DG_COLUMN, axis=1)

dg_3_raw = scipy.io.loadmat('./datasets/sub3_testlabels.mat')
dg_3_valid = np.delete(dg_3_raw['test_dg'][VALID_START:], DROPPED_DG_COLUMN, axis=1)

## 3. Compute features and Train

In [3]:
# Sliding-window settings, each written as a millisecond count divided by 1e3.
# NOTE(review): presumably seconds, to pair with the 1000 passed as the second
# argument of get_windowed_feats -- confirm against utils.
winLen, winOverlap = 100 / 1e3, 50 / 1e3
# Window length minus overlap.
winDisp = winLen - winOverlap

In [6]:
# Subject 1: features -> R matrices -> feature selection -> XGBoost.
feature_train = get_windowed_feats(ecog_1_train, 1000, winLen, winOverlap)
feature_test = get_windowed_feats(ecog_1_valid, 1000, winLen, winOverlap)

# Resample the glove targets so each has as many rows as its feature array.
Y_train = sig.resample(data_glove_1_train, feature_train.shape[0], axis=0)
Y_test = sig.resample(dg_1_valid, feature_test.shape[0], axis=0)

R_train = create_R_matrix(feature_train, 20)
R_test = create_R_matrix(feature_test, 20)

# Columns are selected on the training split only, then applied to both splits.
idx_1 = feature_selection(R_train, Y_train, 800)
print(idx_1.shape)
R_train, R_test = R_train[:, idx_1], R_test[:, idx_1]

# Fit a single XGBoost regressor on all target columns and predict the validation split.
xgb_reg = XGBRegressor(n_estimators=200, max_depth=5, learning_rate=0.01)
xgb_reg.fit(R_train, Y_train)
prediction_XGB_1 = xgb_reg.predict(R_test)

print('For Subject 1')
print(f'For XGBoost: {correlation(prediction_XGB_1, Y_test)}')

# Persist the fitted model together with the selected column indices.
xgb_reg.save_model('./models/XGB_S1.json')
np.save('./models/idx_S1.npy', idx_1)



(615,)
For Subject 1
For XGBoost: ([0.5697004646300975, 0.7127210821071518, 0.19972146484178074, 0.3181432431150917], 0.4500715636735304)


In [5]:
from lightgbm import LGBMRegressor

# One LightGBM regressor per target column (four columns of Y_train).
lgbm_reg_list = [
    LGBMRegressor(n_estimators=1000, max_depth=20, learning_rate=0.01)
    for _ in range(4)
]

# Fit regressor i on column i and save its booster right after fitting.
for i, lgbm_reg in enumerate(lgbm_reg_list):
    lgbm_reg.fit(R_train, Y_train[:, i])
    lgbm_reg.booster_.save_model(f'./models/lgbr_f{i}_S1.txt')

# Stack the per-column predictions into one (rows, 4) array.
prediction_lgbm_list = [fitted.predict(R_test) for fitted in lgbm_reg_list]
prediction_lgbm_1 = np.vstack(prediction_lgbm_list).T

print('For Subject 1')
print(f'For LightGBM: {correlation(prediction_lgbm_1, Y_test)}')

# Ensemble = plain average of the XGBoost and LightGBM predictions.
prediction_ensemble = (prediction_XGB_1 + prediction_lgbm_1) / 2
print(f'For ensemble: {correlation(prediction_ensemble, Y_test)}')

For Subject 1
For LightGBM: ([0.6193401464499385, 0.75773013227357, 0.287864758762844, 0.3264518893794178], 0.49784673171644256)
For ensemble: ([0.6075949935248183, 0.7485007880768504, 0.2720162168539936, 0.3353982537734647], 0.49087756305728175)


In [6]:
# Window settings (re-declared here so this cell does not depend on an earlier one).
winLen, winOverlap = 100 / 1e3, 50 / 1e3
winDisp = winLen - winOverlap

# Subject 2: features for the training and validation ECoG.
feature_train = get_windowed_feats(ecog_2_train, 1000, winLen, winOverlap)
feature_test = get_windowed_feats(ecog_2_valid, 1000, winLen, winOverlap)

# Resample the glove targets so each has as many rows as its feature array.
Y_train = sig.resample(data_glove_2_train, feature_train.shape[0], axis=0)
Y_test = sig.resample(dg_2_valid, feature_test.shape[0], axis=0)

R_train = create_R_matrix(feature_train, 20)
R_test = create_R_matrix(feature_test, 20)

# Columns are selected on the training split only, then applied to both splits.
idx_2 = feature_selection(R_train, Y_train, 800)
print(idx_2.shape)
R_train, R_test = R_train[:, idx_2], R_test[:, idx_2]

# Display the shape of the reduced training matrix.
R_train.shape



(549,)


(5999, 549)

In [7]:
# Subject 2: fit XGBoost on the selected columns and predict the validation split.
xgb_reg = XGBRegressor(n_estimators=200, max_depth=5, learning_rate=0.01)
xgb_reg.fit(R_train, Y_train)
prediction_XGB = xgb_reg.predict(R_test)

print('For Subject 2')
print(f'For XGBoost: {correlation(prediction_XGB, Y_test)}')

# Persist the fitted model together with the selected column indices.
xgb_reg.save_model('./models/XGB_S2.json')
np.save('./models/idx_S2.npy', idx_2)

For Subject 2
For XGBoost: ([0.5933382893119988, 0.3820826422678676, 0.2489812591320926, 0.2591780080679073], 0.3708950496949666)


In [8]:
from lightgbm import LGBMRegressor

# One LightGBM regressor per target column (four columns of Y_train).
lgbm_reg_list = [
    LGBMRegressor(n_estimators=1000, max_depth=20, learning_rate=0.01)
    for _ in range(4)
]

# Fit regressor i on column i and save its booster right after fitting.
for i, lgbm_reg in enumerate(lgbm_reg_list):
    lgbm_reg.fit(R_train, Y_train[:, i])
    lgbm_reg.booster_.save_model(f'./models/lgbr_f{i}_S2.txt')

# Stack the per-column predictions into one (rows, 4) array.
prediction_lgbm_list = [fitted.predict(R_test) for fitted in lgbm_reg_list]
prediction_lgbm_2 = np.vstack(prediction_lgbm_list).T

print('For Subject 2')
print(f'For LightGBM: {correlation(prediction_lgbm_2, Y_test)}')

# Ensemble = plain average of the XGBoost and LightGBM predictions.
prediction_ensemble = (prediction_XGB + prediction_lgbm_2) / 2
print(f'For ensemble: {correlation(prediction_ensemble, Y_test)}')

For Subject 2
For LightGBM: ([0.6120384671957414, 0.4291699576228054, 0.27032010324418687, 0.2653151439637098], 0.39421091800661084)
For ensemble: ([0.6117157991861485, 0.4225477384405255, 0.26798196146776515, 0.27458809744134943], 0.3942083991339472)


In [9]:
# Subject 3: features -> R matrices -> feature selection -> XGBoost.
feature_train = get_windowed_feats(ecog_3_train, 1000, winLen, winOverlap)
feature_test = get_windowed_feats(ecog_3_valid, 1000, winLen, winOverlap)

# Resample the glove targets so each has as many rows as its feature array.
Y_train = sig.resample(data_glove_3_train, feature_train.shape[0], axis=0)
Y_test = sig.resample(dg_3_valid, feature_test.shape[0], axis=0)

R_train = create_R_matrix(feature_train, 20)
R_test = create_R_matrix(feature_test, 20)

# Columns are selected on the training split only, then applied to both splits.
idx_3 = feature_selection(R_train, Y_train, 800)
print(idx_3.shape)
R_train, R_test = R_train[:, idx_3], R_test[:, idx_3]

# Fit a single XGBoost regressor on all target columns and predict the validation split.
xgb_reg = XGBRegressor(n_estimators=200, max_depth=5, learning_rate=0.01)
xgb_reg.fit(R_train, Y_train)
prediction_XGB = xgb_reg.predict(R_test)

print('For Subject 3')
print(f'For XGBoost: {correlation(prediction_XGB, Y_test)}')

# Persist the fitted model together with the selected column indices.
xgb_reg.save_model('./models/XGB_S3.json')
np.save('./models/idx_S3.npy', idx_3)



(599,)
For Subject 3
For XGBoost: ([0.7752297742295375, 0.6422236814131032, 0.6144125437786533, 0.6950473981200583], 0.6817283493853381)


In [10]:
from lightgbm import LGBMRegressor

# One LightGBM regressor per target column (four columns of Y_train).
lgbm_reg_list = [
    LGBMRegressor(n_estimators=1000, max_depth=20, learning_rate=0.01)
    for _ in range(4)
]

# Fit regressor i on column i and save its booster right after fitting.
for i, lgbm_reg in enumerate(lgbm_reg_list):
    lgbm_reg.fit(R_train, Y_train[:, i])
    lgbm_reg.booster_.save_model(f'./models/lgbr_f{i}_S3.txt')

# Stack the per-column predictions into one (rows, 4) array.
prediction_lgbm_list = [fitted.predict(R_test) for fitted in lgbm_reg_list]
prediction_lgbm_3 = np.vstack(prediction_lgbm_list).T

print('For Subject 3')
print(f'For LightGBM: {correlation(prediction_lgbm_3, Y_test)}')

# Ensemble = plain average of the XGBoost and LightGBM predictions.
prediction_ensemble = (prediction_XGB + prediction_lgbm_3) / 2
print(f'For ensemble: {correlation(prediction_ensemble, Y_test)}')

For Subject 3
For LightGBM: ([0.7834503219536425, 0.6847308209774405, 0.6244810779215075, 0.7405856810291012], 0.7083119754704228)
For ensemble: ([0.7846392928883842, 0.6753867365057135, 0.6246371602515357, 0.7307276577612088], 0.7038477118517106)


In [5]:
# The same sample range (500 up to 500 + 147500) is cut from every subject's
# competition training arrays, for both ECoG and glove data.
leaderboard_span = slice(500, 500 + 147500)

ecog_1_leaderboard, dg_1_leaderboard = ecog_1_comp[leaderboard_span], dg_1_comp[leaderboard_span]
ecog_2_leaderboard, dg_2_leaderboard = ecog_2_comp[leaderboard_span], dg_2_comp[leaderboard_span]
ecog_3_leaderboard, dg_3_leaderboard = ecog_3_comp[leaderboard_span], dg_3_comp[leaderboard_span]

# Window settings (re-declared here so this cell does not depend on an earlier one).
winLen, winOverlap = 100 / 1e3, 50 / 1e3
winDisp = winLen - winOverlap

feature_1 = get_windowed_feats(ecog_1_leaderboard, 1000, winLen, winOverlap)
feature_2 = get_windowed_feats(ecog_2_leaderboard, 1000, winLen, winOverlap)
feature_3 = get_windowed_feats(ecog_3_leaderboard, 1000, winLen, winOverlap)

# Reload the column indices that were saved next to the trained models.
idx_1 = np.load('./models/idx_S1.npy')
idx_2 = np.load('./models/idx_S2.npy')
idx_3 = np.load('./models/idx_S3.npy')

# Build each R matrix and keep only that subject's selected columns.
R_1 = create_R_matrix(feature_1, 20)[:, idx_1]
R_2 = create_R_matrix(feature_2, 20)[:, idx_2]
R_3 = create_R_matrix(feature_3, 20)[:, idx_3]

R_list = [R_1, R_2, R_3]



In [9]:
import lightgbm

# Reload the saved models for each subject and predict on its leaderboard R matrix.
predictions = []
for i in range(3):
    subject = i + 1
    R_subject = R_list[i]

    # XGBoost model for this subject.
    # NOTE: prediction_xgb is computed here but is not part of the stored prediction below.
    xgb_reg = xgb.XGBRegressor()
    xgb_reg.load_model(f"./models/XGB_S{subject}.json")
    prediction_xgb = xgb_reg.predict(R_subject)

    # Four saved LightGBM boosters for this subject, one per target column.
    lgbm_reg_list = [
        lightgbm.Booster(model_file=f'./models/lgbr_f{j}_S{subject}.txt')
        for j in range(4)
    ]
    prediction_lgbm_list = [booster.predict(R_subject) for booster in lgbm_reg_list]
    prediction_lgbm = np.vstack(prediction_lgbm_list).T

    # Only the LightGBM output is kept as this subject's prediction.
    prediction = prediction_lgbm
    predictions.append(prediction)

In [10]:
# Score each subject's leaderboard prediction against its glove data.
# `predictions` is filled with the LightGBM-only output (the XGBoost prediction
# is computed but never stored), so the label reads "LightGBM" rather than the
# copy-pasted "XGBoost".
# NOTE(review): dg_*_leaderboard come from 'train_dg' without the column drop
# applied to the other label arrays -- confirm `correlation` pairs columns as intended.
leaderboard_targets = [dg_1_leaderboard, dg_2_leaderboard, dg_3_leaderboard]

for subject, dg_leaderboard in enumerate(leaderboard_targets, start=1):
    # Resample the prediction to the target's row count before scoring.
    resampled_prediction = sig.resample(predictions[subject - 1], dg_leaderboard.shape[0])
    print(f'For Subject {subject}')
    print(f'For LightGBM: {correlation(resampled_prediction, dg_leaderboard)}')

For Subject 1
For XGBoost: ([0.5907641612580329, 0.7977617513623582, 0.4600879272260482, 0.3639373927544931], 0.5531378081502332)
For Subject 2
For XGBoost: ([0.6507526931463948, 0.5608567789524364, 0.5487816884251456, 0.35826305816146226], 0.5296635546713598)
For Subject 3
For XGBoost: ([0.7403539510044402, 0.7341086861577378, 0.6143714085152925, 0.5589147654194959], 0.6619372027742416)


In [22]:
# Load the saved leaderboard submission and extract one subject's prediction.
raw = scipy.io.loadmat('./leaderboard_prediction.mat')
# NOTE: despite the `_1` suffix, index 2 (the third entry) is selected here, and
# the result is scored against dg_3_leaderboard in the following cell.
# Column index 3 is removed, as is done for the other glove-label arrays in this notebook.
prediction_1 = np.delete(raw['predicted_dg'][2][0], 3, axis=1)

In [24]:
# Score the saved submission: `prediction_1` was taken from index 2 of
# 'predicted_dg', so it is compared with the third subject's leaderboard data.
correlation(prediction_1, dg_3_leaderboard)

([0.728168426187513,
  0.723727685575862,
  0.5954600848958687,
  0.5552881706938677],
 0.6506610918382778)