# Training and Model Saving Notebook

## 1. Import necessary libraries

In [28]:
#Set up the notebook environment
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import scipy.io
from scipy import signal as sig
from scipy.stats import pearsonr

# The wildcard import stays ahead of the explicit imports below so that, exactly
# as before, those explicit names take precedence over anything utils exports.
from utils import *

import lightgbm
import xgboost as xgb
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

## 2. Read Data

In [29]:
# Named constants for values this cell previously repeated as bare literals.
VALID_START = 49000        # rows from this index onward of each test recording form the validation split
DROPPED_GLOVE_COLUMN = 3   # glove column removed from every label array (four columns remain)

# raw_training_data.mat: 'train_dg' and 'train_ecog' are indexed [subject][0].
raw = scipy.io.loadmat('./datasets/raw_training_data.mat')
data_glove_1 = raw['train_dg'][0][0]
data_glove_1_train = np.delete(data_glove_1, DROPPED_GLOVE_COLUMN, 1)
data_glove_2 = raw['train_dg'][1][0]
data_glove_2_train = np.delete(data_glove_2, DROPPED_GLOVE_COLUMN, 1)
data_glove_3 = raw['train_dg'][2][0]
data_glove_3_train = np.delete(data_glove_3, DROPPED_GLOVE_COLUMN, 1)

ecog_1_train = raw['train_ecog'][0][0]
ecog_2_train = raw['train_ecog'][1][0]
ecog_3_train = raw['train_ecog'][2][0]


# Per-subject competition files: keep the full training arrays and use the
# tail of the test recording (from VALID_START onward) as validation input.
# NOTE: `raw` is rebound for every file; after this cell it refers to sub3_comp.mat.
raw = scipy.io.loadmat('./datasets/sub1_comp.mat')
ecog_1_comp = raw['train_data']
dg_1_comp = raw['train_dg']
ecog_1_valid = raw['test_data'][VALID_START:]

raw = scipy.io.loadmat('./datasets/sub2_comp.mat')
ecog_2_comp = raw['train_data']
dg_2_comp = raw['train_dg']
ecog_2_valid = raw['test_data'][VALID_START:]

raw = scipy.io.loadmat('./datasets/sub3_comp.mat')
ecog_3_comp = raw['train_data']
dg_3_comp = raw['train_dg']
ecog_3_valid = raw['test_data'][VALID_START:]

# Matching validation labels: same row range, same glove column removed.
dg_1_raw = scipy.io.loadmat('./datasets/sub1_testlabels.mat')
dg_1_valid = dg_1_raw['test_dg'][VALID_START:]
dg_1_valid = np.delete(dg_1_valid, DROPPED_GLOVE_COLUMN, 1)

dg_2_raw = scipy.io.loadmat('./datasets/sub2_testlabels.mat')
dg_2_valid = dg_2_raw['test_dg'][VALID_START:]
dg_2_valid = np.delete(dg_2_valid, DROPPED_GLOVE_COLUMN, 1)

dg_3_raw = scipy.io.loadmat('./datasets/sub3_testlabels.mat')
dg_3_valid = dg_3_raw['test_dg'][VALID_START:]
dg_3_valid = np.delete(dg_3_valid, DROPPED_GLOVE_COLUMN, 1)

## 3. Compute features and Train

In [30]:
# Sliding-window settings shared by the feature and glove-averaging steps.
# NOTE(review): written as <milliseconds> / 1e3, so presumably seconds
# (100 ms window, 50 ms overlap) — confirm against the helpers in utils.
winLen = 100 / 1e3
winOverlap = 50 / 1e3
# Displacement between consecutive windows: window length minus the overlap.
winDisp = winLen - winOverlap

In [31]:
def moving_average(x, winLen, winDisp, fs=1000):
    """Average `x` over sliding windows along its first axis.

    Parameters
    ----------
    x : array-like supporting slicing and ``.mean(axis=0)``
        Signal to window; windows are taken along axis 0.
    winLen : float
        Window length. It is multiplied by ``fs`` and truncated to an integer
        number of rows (so presumably seconds when ``fs`` is in Hz).
    winDisp : float
        Displacement between the starts of consecutive windows, converted to
        rows the same way as ``winLen``.
    fs : int or float, optional
        Sampling rate used for the conversion to rows. Defaults to 1000, the
        value that was previously hard-coded.

    Returns
    -------
    numpy.ndarray
        One row per window, each the mean over axis 0 of that window's rows.
        The number of windows is whatever ``NumWins(x, fs, winLen, winDisp)``
        returns (``NumWins`` is expected to come from ``utils``).
    """
    # Convert once instead of on every loop iteration.
    step = int(winDisp * fs)
    width = int(winLen * fs)
    n_windows = NumWins(x, fs, winLen, winDisp)
    return np.array([x[k * step:k * step + width].mean(axis=0) for k in range(n_windows)])

In [32]:
# Sanity check: shape of the window-averaged glove data for subject 1.
moving_average(data_glove_1_train, winLen, winDisp).shape

(5999, 4)

In [None]:
# --- Subject 1: windowed ECoG features for the training and validation splits ---
feature_train = get_windowed_feats(ecog_1_train, 1000, winLen, winOverlap)
feature_test = get_windowed_feats(ecog_1_valid, 1000, winLen, winOverlap)

# Window-average the glove traces with the same window length and displacement.
Y_train = moving_average(data_glove_1_train, winLen, winDisp)
Y_test = moving_average(dg_1_valid, winLen, winDisp)

# Build the R matrices (second argument 20; presumably a history length — see utils).
R = create_R_matrix(feature_train, 20)
R_test = create_R_matrix(feature_test, 20)

# Choose feature columns from the training split only (feature_selection is given 800).
idx_1 = feature_selection(R, Y_train, 800)
print(idx_1.shape)

# Keep only the selected columns in both splits.
R, R_test = R[:, idx_1], R_test[:, idx_1]



In [49]:
# Subject 1: a single XGBoost regressor fitted on the selected features,
# with every column of Y_train passed as targets in one call.
xgb_reg = XGBRegressor(n_estimators=200, max_depth=20, learning_rate=0.01)
# Fit on the training split.
xgb_reg.fit(R, Y_train)
# Predict on the validation split.
prediction_XGB_1 = xgb_reg.predict(R_test)

# The recorded output shows correlation() returning a (list of four values, single value) tuple.
print('For Subject 1')
print(f'For XGBoost: {correlation(prediction_XGB_1, Y_test)}')

# Persist the model and the selected feature indices; the leaderboard cells reload both files.
xgb_reg.save_model('./models/XGB_S1.json')
np.save('./models/idx_S1.npy', idx_1)

For Subject 1
For XGBoost: ([0.4739681823029282, 0.6634626449660738, 0.1590766612353765, 0.23771983478446282], 0.3835568308222103)


In [35]:
# Subject 1: one LightGBM regressor per glove column, each fitted on a single
# column of Y_train. LGBMRegressor is imported in the notebook's top import cell.
lgbm_reg_list = [LGBMRegressor(n_estimators=1000, max_depth=20, learning_rate=0.01) for _ in range(4)]

for i in range(4):
    lgbm_reg_list[i].fit(R, Y_train[:,i])
    # Persist the underlying booster; the leaderboard cell reloads these files.
    lgbm_reg_list[i].booster_.save_model(f'./models/lgbr_f{i}_S1.txt')

# Stack the four per-column predictions and transpose to (rows, 4).
prediction_lgbm_list = [lgbm_reg.predict(R_test) for lgbm_reg in lgbm_reg_list]
prediction_lgbm_1 = np.vstack(prediction_lgbm_list).T
print('For Subject 1')
print(f'For LightGBM: {correlation(prediction_lgbm_1, Y_test)}')

# Equal-weight average of the XGBoost and LightGBM predictions for subject 1.
prediction_ensemble = (prediction_XGB_1 + prediction_lgbm_1) / 2
print(f'For ensemble: {correlation(prediction_ensemble, Y_test)}')

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 149930
[LightGBM] [Info] Number of data points in the train set: 5999, number of used features: 617
[LightGBM] [Info] Start training from score -0.031320
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 149930
[LightGBM] [Info] Number of data points in the train set: 5999, number of used features: 617
[LightGBM] [Info] Start training from score -0.026964
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 149930
[LightGBM] [Info] Number of data points in the train set: 5999, number of used features: 617
[LightGBM] [Info] Start training from score -0.057710
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 149930
[LightGBM] [Info] Number of data points in the train set: 5999, number of used features: 617
[LightGBM] [Info] Start training from score -0.026658
For Subject 1
For LightGBM: ([0.6398

In [43]:
# Window settings, restated so this cell does not depend on the earlier constants cell.
winLen, winOverlap = 100 / 1e3, 50 / 1e3
winDisp = winLen - winOverlap

# --- Subject 2: windowed ECoG features for the training and validation splits ---
feature_train = get_windowed_feats(ecog_2_train, 1000, winLen, winOverlap)
feature_test = get_windowed_feats(ecog_2_valid, 1000, winLen, winOverlap)

# Window-average the glove traces with the same window length and displacement.
Y_train = moving_average(data_glove_2_train, winLen, winDisp)
Y_test = moving_average(dg_2_valid, winLen, winDisp)

# Build the R matrices (second argument 20; presumably a history length — see utils).
R = create_R_matrix(feature_train, 20)
R_test = create_R_matrix(feature_test, 20)

# Choose feature columns from the training split only (feature_selection is given 800).
idx_2 = feature_selection(R, Y_train, 800)
print(idx_2.shape)

# Keep only the selected columns in both splits.
R, R_test = R[:, idx_2], R_test[:, idx_2]

# Display the shape of the reduced training matrix.
R.shape



(551,)


(5999, 551)

In [44]:
# Subject 2: a single XGBoost regressor fitted on the selected features,
# with every column of Y_train passed as targets in one call.
xgb_reg = XGBRegressor(n_estimators=200, max_depth=5, learning_rate=0.01)
# Fit on the training split.
xgb_reg.fit(R, Y_train)
# Predict on the validation split.
prediction_XGB = xgb_reg.predict(R_test)

# The recorded output shows correlation() returning a (list of four values, single value) tuple.
print('For Subject 2')
print(f'For XGBoost: {correlation(prediction_XGB, Y_test)}')

# Persist the model and the selected feature indices; the leaderboard cells reload both files.
xgb_reg.save_model('./models/XGB_S2.json')
np.save('./models/idx_S2.npy', idx_2)

For Subject 2
For XGBoost: ([0.6014648549363719, 0.3972725971419602, 0.24884421381154292, 0.24780762636223888], 0.3738473230630285)


In [45]:
# Subject 2: one LightGBM regressor per glove column, each fitted on a single
# column of Y_train. LGBMRegressor is imported in the notebook's top import cell.
# NOTE(review): learning_rate is 0.001 here, while the subject 1 and subject 3
# cells use 0.01 — confirm this difference is intentional.
lgbm_reg_list = [LGBMRegressor(n_estimators=1000, max_depth=20, learning_rate=0.001) for _ in range(4)]

for i in range(4):
    lgbm_reg_list[i].fit(R, Y_train[:,i])
    # Persist the underlying booster; the leaderboard cell reloads these files.
    lgbm_reg_list[i].booster_.save_model(f'./models/lgbr_f{i}_S2.txt')

# Stack the four per-column predictions and transpose to (rows, 4).
prediction_lgbm_list = [lgbm_reg.predict(R_test) for lgbm_reg in lgbm_reg_list]
prediction_lgbm_2 = np.vstack(prediction_lgbm_list).T
print('For Subject 2')
print(f'For LightGBM: {correlation(prediction_lgbm_2, Y_test)}')

# Equal-weight average with the most recent XGBoost prediction.
# prediction_XGB must still be the subject 2 result when this cell runs.
prediction_ensemble = (prediction_XGB + prediction_lgbm_2) / 2
print(f'For ensemble: {correlation(prediction_ensemble, Y_test)}')

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 125585
[LightGBM] [Info] Number of data points in the train set: 5999, number of used features: 551
[LightGBM] [Info] Start training from score -0.150330
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 125585
[LightGBM] [Info] Number of data points in the train set: 5999, number of used features: 551
[LightGBM] [Info] Start training from score -0.115466
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 125585
[LightGBM] [Info] Number of data points in the train set: 5999, number of used features: 551
[LightGBM] [Info] Start training from score 0.104593
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 125585
[LightGBM] [Info] Number of data points in the train set: 5999, number of used features: 551
[LightGBM] [Info] Start training from score -0.015761
For Subject 2
For LightGBM: ([0.59369

In [46]:
# --- Subject 3: windowed ECoG features for the training and validation splits ---
feature_train = get_windowed_feats(ecog_3_train, 1000, winLen, winOverlap)
feature_test = get_windowed_feats(ecog_3_valid, 1000, winLen, winOverlap)

# Window-average the glove traces with the same window length and displacement.
Y_train = moving_average(data_glove_3_train, winLen, winDisp)
Y_test = moving_average(dg_3_valid, winLen, winDisp)

# Build the R matrices (second argument 20; presumably a history length — see utils).
R = create_R_matrix(feature_train, 20)
R_test = create_R_matrix(feature_test, 20)

# Choose feature columns from the training split only (feature_selection is given 800).
idx_3 = feature_selection(R, Y_train, 800)
print(idx_3.shape)

# Keep only the selected columns in both splits.
R, R_test = R[:, idx_3], R_test[:, idx_3]

# Single XGBoost regressor over all glove columns at once.
xgb_reg = XGBRegressor(
    n_estimators=200,
    max_depth=5,
    learning_rate=0.01,
)
xgb_reg.fit(R, Y_train)
prediction_XGB = xgb_reg.predict(R_test)

print('For Subject 3')
print(f'For XGBoost: {correlation(prediction_XGB, Y_test)}')

# Persist the model and the selected feature indices for the leaderboard cells.
xgb_reg.save_model('./models/XGB_S3.json')
np.save('./models/idx_S3.npy', idx_3)



(597,)
For Subject 3
For XGBoost: ([0.7754575208444133, 0.6365313068913252, 0.6105067438980203, 0.6916622288288553], 0.6785394501156536)


In [47]:
# Subject 3: one LightGBM regressor per glove column, each fitted on a single
# column of Y_train. LGBMRegressor is imported in the notebook's top import cell.
lgbm_reg_list = [LGBMRegressor(n_estimators=1000, max_depth=20, learning_rate=0.01) for _ in range(4)]

for i in range(4):
    lgbm_reg_list[i].fit(R, Y_train[:,i])
    # Persist the underlying booster; the leaderboard cell reloads these files.
    lgbm_reg_list[i].booster_.save_model(f'./models/lgbr_f{i}_S3.txt')

# Stack the four per-column predictions and transpose to (rows, 4).
prediction_lgbm_list = [lgbm_reg.predict(R_test) for lgbm_reg in lgbm_reg_list]
prediction_lgbm_3 = np.vstack(prediction_lgbm_list).T
print('For Subject 3')
print(f'For LightGBM: {correlation(prediction_lgbm_3, Y_test)}')

# Equal-weight average with the most recent XGBoost prediction.
# prediction_XGB must still be the subject 3 result when this cell runs.
prediction_ensemble = (prediction_XGB + prediction_lgbm_3) / 2
print(f'For ensemble: {correlation(prediction_ensemble, Y_test)}')

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 150111
[LightGBM] [Info] Number of data points in the train set: 5999, number of used features: 597
[LightGBM] [Info] Start training from score 0.081512
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 150111
[LightGBM] [Info] Number of data points in the train set: 5999, number of used features: 597
[LightGBM] [Info] Start training from score 0.037428
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 150111
[LightGBM] [Info] Number of data points in the train set: 5999, number of used features: 597
[LightGBM] [Info] Start training from score 0.016610
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 150111
[LightGBM] [Info] Number of data points in the train set: 5999, number of used features: 597
[LightGBM] [Info] Start training from score -0.065185
For Subject 3
For LightGBM: ([0.7872705

In [58]:
# Leaderboard segment: rows 500 .. 500 + 147500 of each subject's competition training data.
ecog_1_leaderboard = ecog_1_comp[500: 500 + 147500]
ecog_2_leaderboard = ecog_2_comp[500: 500 + 147500]
ecog_3_leaderboard = ecog_3_comp[500: 500 + 147500]

# Matching glove labels over the same rows, with column index 3 removed
# (the same column that is dropped from the training labels).
dg_1_leaderboard = np.delete(dg_1_comp[500: 500 + 147500], 3, 1)
dg_2_leaderboard = np.delete(dg_2_comp[500: 500 + 147500], 3, 1)
dg_3_leaderboard = np.delete(dg_3_comp[500: 500 + 147500], 3, 1)

# Window settings, restated so this cell does not depend on the earlier constants cell.
winLen, winOverlap = 100 / 1e3, 50 / 1e3
winDisp = winLen - winOverlap

# Windowed ECoG features for each subject's leaderboard segment.
feature_1 = get_windowed_feats(ecog_1_leaderboard, 1000, winLen, winOverlap)
feature_2 = get_windowed_feats(ecog_2_leaderboard, 1000, winLen, winOverlap)
feature_3 = get_windowed_feats(ecog_3_leaderboard, 1000, winLen, winOverlap)

# Reload the feature indices saved at training time.
# This overwrites any idx_1 / idx_2 / idx_3 still held in memory.
idx_1 = np.load('./models/idx_S1.npy')
idx_2 = np.load('./models/idx_S2.npy')
idx_3 = np.load('./models/idx_S3.npy')

# R matrices restricted to each subject's selected columns.
R_1 = create_R_matrix(feature_1, 20)[:, idx_1]
R_2 = create_R_matrix(feature_2, 20)[:, idx_2]
R_3 = create_R_matrix(feature_3, 20)[:, idx_3]

# Ordered by subject so that R_list[i] belongs to subject i + 1.
R_list = [R_1, R_2, R_3]



In [55]:
# Reload the saved models for each subject and predict on the leaderboard
# features. lightgbm is imported in the notebook's top import cell.
predictions = []
for i in range(3):
    # Load XGB
    xgb_reg = xgb.XGBRegressor()
    xgb_reg.load_model(f"./models/XGB_S{i + 1}.json")

    # NOTE(review): this XGBoost prediction is computed but never used below;
    # only the LightGBM output is stored. Confirm whether an ensemble (as in
    # the training cells) was intended, or drop the XGBoost load.
    prediction_xgb = xgb_reg.predict(R_list[i])

    # Load LGBM: one saved booster per glove column.
    lgbm_reg_list = [lightgbm.Booster(model_file=f'./models/lgbr_f{j}_S{i + 1}.txt') for j in range(4)]

    # Stack the four per-column predictions and transpose to (rows, 4).
    prediction_lgbm_list = [lgbm_reg.predict(R_list[i]) for lgbm_reg in lgbm_reg_list]
    prediction_lgbm = np.vstack(prediction_lgbm_list).T

    # predictions[i] therefore holds the LightGBM-only prediction for subject i + 1.
    prediction = prediction_lgbm
    predictions.append(prediction)

In [56]:
# Score the leaderboard predictions against the glove labels.
# predictions[i] holds LightGBM-only output (the cell that builds the list
# stores prediction_lgbm), so the label says LightGBM rather than XGBoost.
# sig.resample brings each windowed prediction back to the number of rows in
# the corresponding glove array before correlating.
print('For Subject 1')
print(f'For LightGBM: {correlation(sig.resample(predictions[0], dg_1_leaderboard.shape[0]), dg_1_leaderboard)}')

print('For Subject 2')
print(f'For LightGBM: {correlation(sig.resample(predictions[1], dg_2_leaderboard.shape[0]), dg_2_leaderboard)}')

print('For Subject 3')
print(f'For LightGBM: {correlation(sig.resample(predictions[2], dg_3_leaderboard.shape[0]), dg_3_leaderboard)}')

For Subject 1
For XGBoost: ([0.5907641612580329, 0.7977617513623582, 0.4600879272260482, 0.27465960273210904], 0.530818360644637)
For Subject 2
For XGBoost: ([0.6250923349801124, 0.5283428967113556, 0.45170794050619206, 0.42148679872941325], 0.5066574927317683)
For Subject 3
For XGBoost: ([0.7403539510044402, 0.7341086861577378, 0.6143714085152925, 0.7867964007526729], 0.7189076116075359)


In [12]:
# Resample each subject's prediction to the row count of its glove labels.
p1, p2, p3 = (
    sig.resample(predictions[k], target.shape[0])
    for k, target in enumerate((dg_1_leaderboard, dg_2_leaderboard, dg_3_leaderboard))
)

# Stack all three subjects along the row axis, predictions and labels alike.
p = np.concatenate([p1, p2, p3], axis=0)
dg = np.concatenate([dg_1_leaderboard, dg_2_leaderboard, dg_3_leaderboard], axis=0)

In [55]:
# Load previously saved leaderboard predictions. This rebinds `predictions`,
# which an earlier cell uses for the in-notebook list of model outputs.
predictions = scipy.io.loadmat('leaderboard_prediction.mat')['predicted_dg']

# Per subject, keep columns 0, 1, 2 and 4 so the layout matches the glove
# label arrays that had column 3 removed.
prediction1, prediction2, prediction3 = (
    predictions[subject][0][:, [0, 1, 2, 4]] for subject in range(3)
)

In [56]:
# Inspect subject 1's saved prediction restricted to columns 0, 1, 2 and 4.
predictions[0][0][:, [0,1,2,4]]

array([[-0.25594257, -0.2087674 , -0.0428858 , -0.13051725],
       [-0.25594257, -0.2087674 , -0.0428858 , -0.13051725],
       [-0.25594257, -0.2087674 , -0.0428858 , -0.13051725],
       ...,
       [-0.38377505,  0.06773657, -0.01280759,  1.42021552],
       [-0.38377505,  0.06773657, -0.01280759,  1.42021552],
       [-0.38377505,  0.06773657, -0.01280759,  1.42021552]])

In [59]:
# Score the predictions loaded from leaderboard_prediction.mat against the
# leaderboard glove labels, one subject at a time.
# NOTE(review): the printed label says "XGBoost", but nothing in this notebook
# shows which model produced the saved file — confirm the label.
print('For Subject 1')
print(f'For XGBoost: {correlation(prediction1, dg_1_leaderboard)}')

print('For Subject 2')
print(f'For XGBoost: {correlation(prediction2, dg_2_leaderboard)}')

print('For Subject 3')
print(f'For XGBoost: {correlation(prediction3, dg_3_leaderboard)}')

For Subject 1
For XGBoost: ([0.6597119562392924, 0.8263029671949991, 0.5359963726003356, 0.3729523962118694], 0.5987409230616241)
For Subject 2
For XGBoost: ([0.7069395067689479, 0.6064309370067075, 0.6143715293495223, 0.6165457399913061], 0.636071928279121)
For Subject 3
For XGBoost: ([0.8027001923945973, 0.7969405443623298, 0.6613162078526109, 0.8198776673989427], 0.7702086530021202)


In [54]:
# Smoke test: confirm that the saved LightGBM booster for subject 1, glove
# column 0, loads from disk. lightgbm is imported in the top import cell.
lightgbm.Booster(model_file='./models/lgbr_f0_S1.txt')

<lightgbm.basic.Booster at 0x168869e90>

In [68]:
# Overall score: mean of the three per-subject average correlations.
# It is recomputed from the same inputs as the report cell above instead of
# pasting numbers from its output; element [1] of correlation()'s result is
# the per-subject average shown as the second tuple entry in that output.
(
    correlation(prediction1, dg_1_leaderboard)[1]
    + correlation(prediction2, dg_2_leaderboard)[1]
    + correlation(prediction3, dg_3_leaderboard)[1]
) / 3

0.6683405014476218