# 1-step Forecasting with linear and non-linear models (Nomothetic)

In [21]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import LinearSVR
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from sklearn import linear_model as lm
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics

import utils

# Plot settings
plt.rcParams['figure.figsize'] = (16, 8)
plt.rcParams['figure.dpi'] = 150
sns.set()

In [22]:
# Reading alcohol data
train_df, test_df, data_raw_list = utils.load_alcohol()

combined_data = []

train_alcohol = pd.concat(train_df, ignore_index=True)
train_alcohol_X = train_alcohol.drop(train_alcohol.columns[range(0, 61)], axis=1).fillna(0)
train_alcohol_y = train_alcohol['craving']


def prepare_data(idx, test_list):
    # print('Patient ID:', test_list[idx]['ID'][0])
    X_test = test_list[idx].drop(test_list[idx].columns[range(0, 61)], axis=1).fillna(0)
    y_test = test_list[idx]['craving']
    return X_test, y_test


In [4]:
# Loading train and test data of covid patients
covid_train_x_list, covid_test_x_list, covid_train_y_list, covid_test_y_list = utils.patients_covid()

covid_train_X = pd.concat(covid_train_x_list, ignore_index=True)
covid_train_y = pd.concat(covid_train_y_list, ignore_index=True)


Patient included in study:
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 46, 48, 49, 50, 52, 53, 54, 55, 57, 58, 59, 63, 64, 65, 66, 67, 68, 70, 71, 72, 73, 74, 75, 77, 78]


# 1. Nomothetic Models Regression

In [6]:
# Elastic-Net

def elastic_net(train_x, test_x, train_y, test_y):
    X_train_loc = utils.standardize(train_x).fillna(0)
    X_test_loc = utils.standardize(test_x).fillna(0)

    l1_ratios = np.arange(0.01, 0.6, 0.05)
    elastic_reg = lm.ElasticNetCV(alphas=np.arange(0.01, 20, 0.05), l1_ratio=l1_ratios, cv=5, max_iter=100000,
                                  fit_intercept=True)
    elastic_reg.fit(X_train_loc, train_y)
    y_predicted_test = elastic_reg.predict(X_test_loc)

    print('--- Elastic-Net Global Results ---')
    utils.eval_results(actual=test_y, predicted=y_predicted_test, show=True)

    return elastic_reg


print('--- Alcohol Data ---')
test_alcohol_X, test_alcohol_y = prepare_data(1, test_df)
elastic_alcohol = elastic_net(train_alcohol_X, test_alcohol_X, train_alcohol_y, test_alcohol_y)
print('--- Covid Data ---')
elastic_covid = elastic_net(covid_train_X, covid_test_x_list[4], covid_train_y, covid_test_y_list[4])

--- Alcohol Data ---
--- Elastic-Net Global Results ---
R_squared: 0.30379728867934985
MAPE: 0.8727735940920615
RMSE: 20.49355174027907
MAE: 18.52613409884371
CORR: 0.611521469749715
--- Covid Data ---
--- Elastic-Net Global Results ---
R_squared: 0.3229011269661808
MAPE: 0.3886173418152433
RMSE: 0.4984005787041621
MAE: 0.45580592466727055
CORR: 0.8145322523400391


In [8]:
# Linear-SVM

def linear_svm(train_x, test_x, train_y, test_y, params):
    X_train_loc = utils.standardize(train_x).fillna(0)
    X_test_loc = utils.standardize(test_x).fillna(0)

    clf = GridSearchCV(estimator=LinearSVR(), param_grid=params, scoring='neg_mean_squared_error', cv=5)
    clf.fit(X_train_loc, train_y)
    # best_params = clf.best_params_
    # print(best_params)
    y_predicted_test = clf.predict(X_test_loc)
    print('--- Linear-SVM Global Results ---')
    utils.eval_results(actual=test_y, predicted=y_predicted_test, show=True)

    return clf


print('--- Alcohol Data ---')
param = [
    {'C': np.arange(0.1, 4, 0.1),
     'epsilon': np.arange(6, 7, 0.1),
     'loss': ['epsilon_insensitive'],
     'fit_intercept': [True],
     'max_iter': [10000]}]

svm_alcohol = linear_svm(train_alcohol_X, test_alcohol_X, train_alcohol_y, test_alcohol_y, param)
print('--- Covid Data ---')
param = [
    {'C': np.arange(0.1, 2, 0.1),
     'epsilon': np.arange(0, 0.5, 0.1),
     'loss': ['epsilon_insensitive'],
     'fit_intercept': [True],
     'max_iter': [10000]}]
svm_covid = linear_svm(covid_train_X, covid_test_x_list[4], covid_train_y, covid_test_y_list[4], param)

--- Alcohol Data ---
--- Linear-SVM Global Results ---
R_squared: 0.32679070040166736
MAPE: 0.8362859104458693
RMSE: 20.152291230060342
MAE: 17.60905739305675
CORR: 0.6008017719603578
--- Covid Data ---
--- Linear-SVM Global Results ---
R_squared: 0.45212965093454494
MAPE: 0.34858657756993405
RMSE: 0.4483233831400799
MAE: 0.40808193869630205
CORR: 0.8472282490107078


In [9]:
# XGBoost

def xgboost_reg(train_x, test_x, train_y, test_y):
    params = [
        {'objective': ['reg:squarederror'],
         'n_estimators': [20, 25, 30],
         'booster': ['gbtree'],
         'alpha': np.arange(0, 1, 0.1),
         'eval_metric': ['rmse'],
         'max_depth': np.arange(1, 8, 1)}]

    reg_xgb = GridSearchCV(xgb.XGBRegressor(), params, cv=5, scoring='neg_mean_squared_error')
    reg_xgb.fit(train_x, train_y)
    print(reg_xgb.best_params_)
    y_predicted_test = reg_xgb.predict(test_x)
    print('--- XGBoost Global Results ---')
    utils.eval_results(actual=test_y, predicted=y_predicted_test, show=True)

    return reg_xgb


print('--- Alcohol Data ---')
xgb_alcohol = xgboost_reg(train_alcohol_X, test_alcohol_X, train_alcohol_y, test_alcohol_y)
print('--- Covid Data ---')
xgb_covid = xgboost_reg(covid_train_X, covid_test_x_list[4], covid_train_y, covid_test_y_list[4])

--- Alcohol Data ---
{'alpha': 0.9, 'booster': 'gbtree', 'eval_metric': 'rmse', 'max_depth': 1, 'n_estimators': 20, 'objective': 'reg:squarederror'}
--- XGBoost Global Results ---
R_squared: 0.2635459433792877
MAPE: 0.8906545349968121
RMSE: 21.077650879894065
MAE: 18.788489587340006
CORR: 0.5393537103999396
--- Covid Data ---
{'alpha': 0.30000000000000004, 'booster': 'gbtree', 'eval_metric': 'rmse', 'max_depth': 1, 'n_estimators': 20, 'objective': 'reg:squarederror'}
--- XGBoost Global Results ---
R_squared: 0.5849734224083403
MAPE: 0.29537581786131245
RMSE: 0.39020285886847317
MAE: 0.3671043377656203
CORR: 0.8360733444229482


In [12]:
from sklearn.ensemble import RandomForestRegressor


# Random Forests

def random_forests(train_x, test_x, train_y, test_y):
    grid = [
        {'n_estimators': [50, 70, 100],
         'max_features': ['auto', 'sqrt'],
         'max_depth': [5, 10, 15, 20],
         'min_samples_split': [2, 4, 6],
         'min_samples_leaf': [1],
         'bootstrap': [True]}]

    rf = GridSearchCV(RandomForestRegressor(), param_grid=grid, cv=5, scoring='neg_mean_absolute_error')
    rf.fit(train_x, train_y)
    y_predicted_test = rf.predict(test_x)
    print(rf.best_params_)

    utils.eval_results(actual=test_y, predicted=y_predicted_test, show=True)

    return rf


print('--- Alcohol Data ---')
rf_alcohol = random_forests(train_alcohol_X, test_alcohol_X, train_alcohol_y, test_alcohol_y)
print('--- Covid Data ---')
rf_covid = random_forests(covid_train_X, covid_test_x_list[4], covid_train_y, covid_test_y_list[4])

--- Alcohol Data ---
{'bootstrap': True, 'max_depth': 5, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
R_squared: 0.2765356032956062
MAPE: 0.8753997885247349
RMSE: 20.890938910737965
MAE: 18.14007443744855
CORR: 0.5373393124699202
--- Covid Data ---
{'bootstrap': True, 'max_depth': 5, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 4, 'n_estimators': 50}
R_squared: 0.5993705681849972
MAPE: 0.2609481908363832
RMSE: 0.3833751138819075
MAE: 0.3388118065623275
CORR: 0.8078009577561643


In [19]:
import keras.layers as layer
from keras.models import Sequential


# LSTM 3-Layer - Recurrent Neural Network

def lstm_rnn(train_x, test_x, train_y, test_y):
    X_train_loc = utils.standardize(train_x).fillna(0)
    X_test_loc = utils.standardize(test_x).fillna(0)
    train_x_val, train_y_val, test_x_val, test_y_val = X_train_loc.values, train_y.values, X_test_loc.values, test_y.values

    train_x_val = train_x_val.reshape((train_x_val.shape[0], 1, train_x_val.shape[1]))
    test_x_val = test_x_val.reshape((test_x_val.shape[0], 1, test_x_val.shape[1]))

    model = Sequential([
        layer.LSTM(40, return_sequences=True, input_shape=(train_x_val.shape[1], train_x_val.shape[2])),
        layer.Dropout(0.25),
        layer.LSTM(units=25, return_sequences=True),
        layer.Dropout(0.20),
        layer.LSTM(units=10, return_sequences=False),
        layer.Dense(units=1, activation='linear'),
    ])
    model.compile(loss='mae', optimizer='adam')
    model.fit(train_x_val, train_y_val, epochs=25, batch_size=8, verbose=0, shuffle=False)

    y_predicted_test = model.predict(test_x_val)

    utils.eval_results(actual=test_y, predicted=y_predicted_test.flatten(), show=True)

    return model


print('--- Alcohol Data ---')
lstm_alcohol = lstm_rnn(train_alcohol_X, test_alcohol_X, train_alcohol_y, test_alcohol_y)
print('--- Covid Data ---')
lstm_covid = lstm_rnn(covid_train_X, covid_test_x_list[4], covid_train_y, covid_test_y_list[4])

--- Alcohol Data ---
R_squared: 0.23931259563792195
MAPE: 0.8657009038974556
RMSE: 21.421628826528025
MAE: 18.923568783937046
CORR: 0.5309723281379627
--- Covid Data ---
R_squared: 0.5606914602287327
MAPE: 0.16739411995961115
RMSE: 0.40145541040482335
MAE: 0.24550816645989051
CORR: 0.8166430188791403


In [20]:
# LSTM 1-Layer - Recurrent Neural Network

def one_lstm_rnn(train_x, test_x, train_y, test_y):
    X_train_loc = utils.standardize(train_x).fillna(0)
    X_test_loc = utils.standardize(test_x).fillna(0)
    train_x_val, train_y_val, test_x_val, test_y_val = X_train_loc.values, train_y.values, X_test_loc.values, test_y.values

    train_x_val = train_x_val.reshape((train_x_val.shape[0], 1, train_x_val.shape[1]))
    test_x_val = test_x_val.reshape((test_x_val.shape[0], 1, test_x_val.shape[1]))

    model = Sequential([
        layer.LSTM(64, return_sequences=True, input_shape=(train_x_val.shape[1], train_x_val.shape[2])),
        layer.Dropout(0.25),
        layer.Dense(units=1, activation='linear'),
    ])
    model.compile(loss='mae', optimizer='adam')
    model.fit(train_x_val, train_y_val, epochs=15, batch_size=8, verbose=0, shuffle=False)

    y_predicted_test = model.predict(test_x_val)

    utils.eval_results(actual=test_y, predicted=y_predicted_test.flatten(), show=True)

    return model


print('--- Alcohol Data ---')
lstm1_alcohol = one_lstm_rnn(train_alcohol_X, test_alcohol_X, train_alcohol_y, test_alcohol_y)
print('--- Covid Data ---')
lstm1_covid = one_lstm_rnn(covid_train_X, covid_test_x_list[4], covid_train_y, covid_test_y_list[4])

--- Alcohol Data ---
R_squared: 0.042286692130217296
MAPE: 0.8605215563564853
RMSE: 24.036273580928672
MAE: 20.984628191599008
CORR: 0.5387518857217988
--- Covid Data ---
R_squared: 0.5215184318054735
MAPE: 0.24970519237029248
RMSE: 0.4189720954121964
MAE: 0.3406590956908006
CORR: 0.7391590502727143


### 2. Evaluating Performance on Entire Dataset (Alcohol)

In [None]:
r2_elastic, r2_svm, r2_one_lstm, r2_xgb, r2_rf, r2_lstm, r2_mtgnn = ([] for _ in range(7))
rmse_elastic, rmse_svm, rmse_one_lstm, rmse_xgb, rmse_rf, rmse_lstm, rmse_mtgnn = ([] for _ in range(7))
mae_elastic, mae_svm, mae_one_lstm, mae_xgb, mae_rf, mae_lstm, mae_mtgnn = ([] for _ in range(7))

f = open("output_nomothethic_a.txt", "a")
f.write('- - - PER INDIVIDUAL RESULTS GLOBAL MODELS - - -\n')

for x in range(len(test_df)):
    # Build and evaluate a model for every single patient

    test_alcohol_X, test_alcohol_y = prepare_data(x, test_df)
    # Elastic-Net (baseline)
    r2, rmse, mae = utils.eval_results(test_alcohol_y, elastic_alcohol.predict(test_alcohol_X), False)

    # Elastic-Net metrics
    r2_elastic.append(max(0, r2))
    rmse_elastic.append(rmse)
    mae_elastic.append(mae)

    f.write("Patient ID: %s\n" % test_df[x]['ID'][0])
    f.write('\n')
    f.write('--- Elastic-Net ---\n')
    f.write("R_squared: %s\n" % max(0, r2))
    f.write("RMSE: %s\n" % rmse)
    f.write("MAE: %s\n" % mae)
    f.write('\n')

    # Linear-SVM

    params = [
        {'C': np.arange(0.1, 4, 0.1),
         'epsilon': np.arange(6, 7, 0.1),
         'loss': ['epsilon_insensitive'],
         'fit_intercept': [True],
         'max_iter': [10000]}]

    r2, rmse, mae = utils.eval_results(test_alcohol_y, svm_alcohol.predict(test_alcohol_X), False)
    # Linear-SVM metrics
    r2_svm.append(max(0, r2))
    rmse_svm.append(rmse)
    mae_svm.append(mae)

    f.write('--- Linear-SVM ---\n')
    f.write("R_squared: %s\n" % max(0, r2))
    f.write("RMSE: %s\n" % rmse)
    f.write("MAE: %s\n" % mae)
    f.write('\n')

    # XGBoost Regression
    r2, rmse, mae = utils.eval_results(test_alcohol_y, xgb_alcohol.predict(test_alcohol_X), False)
    # XGBoost metrics
    r2_xgb.append(max(0, r2))
    rmse_xgb.append(rmse)
    mae_xgb.append(mae)

    f.write('--- XGBoost ---\n')
    f.write("R_squared: %s\n" % max(0, r2))
    f.write("RMSE: %s\n" % rmse)
    f.write("MAE: %s\n" % mae)
    f.write('\n')

    # RF
    r2, rmse, mae = utils.eval_results(test_alcohol_y, rf_alcohol.predict(test_alcohol_X), False)
    # RF metrics
    r2_rf.append(max(0, r2))
    rmse_rf.append(rmse)
    mae_rf.append(mae)

    f.write('--- Random Forests ---\n')
    f.write("R_squared: %s\n" % max(0, r2))
    f.write("RMSE: %s\n" % rmse)
    f.write("MAE: %s\n" % mae)
    f.write('\n')

    # LSTM 3-Layer RNN
    r2, rmse, mae = utils.eval_results(test_alcohol_y, lstm_alcohol.predict(test_alcohol_X).flatten(), False)
    # LSTM metrics
    r2_lstm.append(max(0, r2))
    rmse_lstm.append(rmse)
    mae_lstm.append(mae)

    f.write('--- LSTM RNN ---\n')
    f.write("R_squared: %s\n" % max(0, r2))
    f.write("RMSE: %s\n" % rmse)
    f.write("MAE: %s\n" % mae)
    f.write('\n')

    # LSTM 1-Layer RNN
    r2, rmse, mae = utils.eval_results(test_alcohol_y, lstm1_alcohol.predict(test_alcohol_X).flatten(), False)
    # LSTM metrics
    r2_one_lstm.append(max(0, r2))
    rmse_one_lstm.append(rmse)
    mae_one_lstm.append(mae)

    f.write('--- 1-LSTM RNN ---\n')
    f.write("R_squared: %s\n" % max(0, r2))
    f.write("RMSE: %s\n" % rmse)
    f.write("MAE: %s\n" % mae)
    f.write('\n')

f.close()
print('---- Elastic-Net Results ----')
utils.average_metrics(r2_elastic, rmse_elastic, mae_elastic)
print('---------------------------------')
print('---- Linear SVM Results ----')
utils.average_metrics(r2_svm, rmse_svm, mae_svm)
print('---------------------------------')
print('---- XGBoost Results ----')
utils.average_metrics(r2_xgb, rmse_xgb, mae_xgb)
print('---------------------------------')
print('---- Random Forest Results ----')
utils.average_metrics(r2_rf, rmse_rf, mae_rf)
print('---------------------------------')
print('---- LSTM Results ----')
utils.average_metrics(r2_lstm, rmse_lstm, mae_lstm)
print('---------------------------------')
print('---- 1-LSTM Results ----')
utils.average_metrics(r2_one_lstm, rmse_one_lstm, mae_one_lstm)
print('---------------------------------')

### 2.1 Evaluatin Performance on Entire Dataset (COVID-19)

In [None]:
r2_elastic, r2_svm, r2_one_lstm, r2_xgb, r2_rf, r2_lstm, r2_mtgnn = ([] for _ in range(7))
rmse_elastic, rmse_svm, rmse_one_lstm, rmse_xgb, rmse_rf, rmse_lstm, rmse_mtgnn = ([] for _ in range(7))
mae_elastic, mae_svm, mae_one_lstm, mae_xgb, mae_rf, mae_lstm, mae_mtgnn = ([] for _ in range(7))

f = open("output_nomothethic_c.txt", "a")
f.write('- - - PER INDIVIDUAL RESULTS GLOBAL MODELS - - -\n')

for z in range(len(covid_train_x_list)):
    # Build and evaluate a model for every single patient

    # Elastic-Net (baseline)
    r2, rmse, mae = utils.eval_results(covid_test_y_list[z], elastic_covid.predict(covid_test_x_list[z]), False)

    # Elastic-Net metrics
    r2_elastic.append(max(0, r2))
    rmse_elastic.append(rmse)
    mae_elastic.append(mae)

    f.write("Patient ID: %s\n" % z)
    f.write('\n')
    f.write('--- Elastic-Net ---\n')
    f.write("R_squared: %s\n" % max(0, r2))
    f.write("RMSE: %s\n" % rmse)
    f.write("MAE: %s\n" % mae)
    f.write('\n')

    # Linear-SVM

    params = [
        {'C': np.arange(0.1, 2, 0.1),
         'epsilon': np.arange(0, 0.5, 0.1),
         'loss': ['epsilon_insensitive'],
         'fit_intercept': [True],
         'max_iter': [10000]}]

    r2, rmse, mae = utils.eval_results(covid_test_y_list[z], svm_covid.predict(covid_test_x_list[z]), False)
    # Linear-SVM metrics
    r2_svm.append(max(0, r2))
    rmse_svm.append(rmse)
    mae_svm.append(mae)

    f.write('--- Linear-SVM ---\n')
    f.write("R_squared: %s\n" % max(0, r2))
    f.write("RMSE: %s\n" % rmse)
    f.write("MAE: %s\n" % mae)
    f.write('\n')

    # XGBoost Regression
    r2, rmse, mae = utils.eval_results(covid_test_y_list[z], xgb_covid.predict(covid_test_x_list[z]), False)
    # XGBoost metrics
    r2_xgb.append(max(0, r2))
    rmse_xgb.append(rmse)
    mae_xgb.append(mae)

    f.write('--- XGBoost ---\n')
    f.write("R_squared: %s\n" % max(0, r2))
    f.write("RMSE: %s\n" % rmse)
    f.write("MAE: %s\n" % mae)
    f.write('\n')

    # RF
    r2, rmse, mae = utils.eval_results(covid_test_y_list[z], rf_covid.predict(covid_test_x_list[z]), False)
    # RF metrics
    r2_rf.append(max(0, r2))
    rmse_rf.append(rmse)
    mae_rf.append(mae)

    f.write('--- Random Forests ---\n')
    f.write("R_squared: %s\n" % max(0, r2))
    f.write("RMSE: %s\n" % rmse)
    f.write("MAE: %s\n" % mae)
    f.write('\n')

    # LSTM 3-Layer RNN
    r2, rmse, mae = utils.eval_results(covid_test_y_list[z], lstm_covid.predict(covid_test_x_list[z]), False)
    # LSTM metrics
    r2_lstm.append(max(0, r2))
    rmse_lstm.append(rmse)
    mae_lstm.append(mae)

    f.write('--- LSTM RNN ---\n')
    f.write("R_squared: %s\n" % max(0, r2))
    f.write("RMSE: %s\n" % rmse)
    f.write("MAE: %s\n" % mae)
    f.write('\n')

    # LSTM 1-Layer RNN
    r2, rmse, mae = utils.eval_results(covid_test_y_list[z], lstm1_covid.predict(covid_test_x_list[z]), False)
    # LSTM metrics
    r2_one_lstm.append(max(0, r2))
    rmse_one_lstm.append(rmse)
    mae_one_lstm.append(mae)

    f.write('--- 1-LSTM RNN ---\n')
    f.write("R_squared: %s\n" % max(0, r2))
    f.write("RMSE: %s\n" % rmse)
    f.write("MAE: %s\n" % mae)
    f.write('\n')

f.close()
print('---- Elastic-Net Results ----')
utils.average_metrics(r2_elastic, rmse_elastic, mae_elastic)
print('---------------------------------')
print('---- Linear SVM Results ----')
utils.average_metrics(r2_svm, rmse_svm, mae_svm)
print('---------------------------------')
print('---- XGBoost Results ----')
utils.average_metrics(r2_xgb, rmse_xgb, mae_xgb)
print('---------------------------------')
print('---- Random Forest Results ----')
utils.average_metrics(r2_rf, rmse_rf, mae_rf)
print('---------------------------------')
print('---- LSTM Results ----')
utils.average_metrics(r2_lstm, rmse_lstm, mae_lstm)
print('---------------------------------')
print('---- 1-LSTM Results ----')
utils.average_metrics(r2_one_lstm, rmse_one_lstm, mae_one_lstm)
print('---------------------------------')