 # 1-step Forecasting with linear and non-linear models (Nomothetic)

In [10]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import LinearSVR
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from sklearn import linear_model as lm
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics

import utils

# Global plotting defaults: (16, 8) figure size at 150 dpi.
plt.rcParams.update({
    'figure.figsize': (16, 8),
    'figure.dpi': 150,
})
# Apply seaborn's default theme after the matplotlib defaults above.
sns.set()

In [11]:
# Load the alcohol study data; test_df is indexed per patient further down.
train_df, test_df, data_raw_list = utils.load_alcohol()

combined_data = list()

# Pool all training frames into a single table for the global models.
train_alcohol = pd.concat(train_df, ignore_index=True)

# Features: every column after the first 61, with missing values replaced by 0.
train_alcohol_X = (
    train_alcohol
    .drop(columns=train_alcohol.columns[range(0, 61)])
    .fillna(0)
)
# Target: the 'craving' column of the pooled frame.
train_alcohol_y = train_alcohol['craving']


def prepare_data(idx, test_list):
    """Split one frame of ``test_list`` into features and target.

    Parameters
    ----------
    idx : int
        Position of the frame to select from ``test_list``.
    test_list : sequence of pandas.DataFrame
        Frames that contain a 'craving' column and at least 61 columns.

    Returns
    -------
    tuple
        ``(X_test, y_test)`` where ``X_test`` is the selected frame without
        its first 61 columns and with NaNs replaced by 0, and ``y_test`` is
        the frame's 'craving' column.
    """
    frame = test_list[idx]
    columns_to_skip = frame.columns[range(0, 61)]
    features = frame.drop(columns=columns_to_skip).fillna(0)
    target = frame['craving']
    return features, target


In [12]:
# Load the per-patient train/test splits of the covid data.
(
    covid_train_x_list,
    covid_test_x_list,
    covid_train_y_list,
    covid_test_y_list,
) = utils.patients_covid()

# Pool the per-patient training splits for the global models.
covid_train_X = pd.concat(objs=covid_train_x_list, ignore_index=True)
covid_train_y = pd.concat(objs=covid_train_y_list, ignore_index=True)

Patient included in study:
[3, 5, 8, 11, 14, 15, 16, 24, 25, 26, 27, 31, 34, 35, 37, 39, 41, 42, 46, 50, 53, 54, 59, 63, 65, 66, 70, 72, 77]


# 1. Nomothetic Models Regression

In [13]:
# Elastic-Net

def elastic_net(train_x, test_x, train_y, test_y):
    """Fit a cross-validated Elastic-Net regressor on standardized features.

    Parameters
    ----------
    train_x, test_x
        Feature frames; each is passed through ``utils.standardize`` and has
        remaining NaNs replaced by 0.
    train_y
        Training targets.
    test_y
        Accepted for signature parity with the other model builders; not used.

    Returns
    -------
    The fitted ``ElasticNetCV`` estimator.
    """
    scaled_train = utils.standardize(train_x).fillna(0)
    scaled_test = utils.standardize(test_x).fillna(0)

    # Candidate values explored by the 5-fold cross-validation.
    search_space = {
        'l1_ratio': np.arange(0.01, 0.6, 0.05),
        'alphas': np.arange(0.01, 20, 0.05),
    }
    model = lm.ElasticNetCV(cv=5, max_iter=100000, fit_intercept=True, **search_space)
    model.fit(scaled_train, train_y)

    # The test prediction is computed but its result is currently unused.
    model.predict(scaled_test)

    return model


print('--- Alcohol Data ---')
# The frame at index 1 of test_df serves as the test frame during training.
test_alcohol_X, test_alcohol_y = prepare_data(1, test_df)
elastic_alcohol = elastic_net(
    train_x=train_alcohol_X,
    test_x=test_alcohol_X,
    train_y=train_alcohol_y,
    test_y=test_alcohol_y,
)
print('Trained')
print('--- Covid Data ---')
# The first covid test split plays the same role for the covid model.
elastic_covid = elastic_net(
    train_x=covid_train_X,
    test_x=covid_test_x_list[0],
    train_y=covid_train_y,
    test_y=covid_test_y_list[0],
)
print('Trained')

--- Alcohol Data ---
Trained
--- Covid Data ---
Trained


In [14]:
# Linear-SVM

def linear_svm(train_x, test_x, train_y, test_y, params):
    """Grid-search a ``LinearSVR`` on standardized features.

    Parameters
    ----------
    train_x, test_x
        Feature frames; each is passed through ``utils.standardize`` and has
        remaining NaNs replaced by 0.
    train_y
        Training targets.
    test_y
        Accepted for signature parity with the other model builders; not used.
    params
        Parameter grid handed to ``GridSearchCV``.

    Returns
    -------
    The fitted ``GridSearchCV`` object (5-fold CV, negative MSE scoring).
    """
    scaled_train = utils.standardize(train_x).fillna(0)
    scaled_test = utils.standardize(test_x).fillna(0)

    search = GridSearchCV(
        LinearSVR(),
        params,
        cv=5,
        scoring='neg_mean_squared_error',
    )
    search.fit(scaled_train, train_y)

    # The test prediction is computed but its result is currently unused.
    search.predict(scaled_test)

    return search


print('--- Alcohol Data ---')
# Hyper-parameter grid searched for the alcohol model.
param = [dict(
    C=np.arange(0.1, 4, 0.1),
    epsilon=np.arange(6, 7, 0.1),
    loss=['epsilon_insensitive'],
    fit_intercept=[True],
    max_iter=[10000],
)]

svm_alcohol = linear_svm(
    train_x=train_alcohol_X,
    test_x=test_alcohol_X,
    train_y=train_alcohol_y,
    test_y=test_alcohol_y,
    params=param,
)
print('Trained')
print('--- Covid Data ---')
# `param` is rebound to the grid searched for the covid model.
param = [dict(
    C=np.arange(0.1, 2, 0.1),
    epsilon=np.arange(0, 0.5, 0.1),
    loss=['epsilon_insensitive'],
    fit_intercept=[True],
    max_iter=[10000],
)]
svm_covid = linear_svm(
    train_x=covid_train_X,
    test_x=covid_test_x_list[0],
    train_y=covid_train_y,
    test_y=covid_test_y_list[0],
    params=param,
)
print('Trained')

--- Alcohol Data ---


KeyboardInterrupt: 

In [None]:
# XGBoost

def xgboost_reg(train_x, test_x, train_y, test_y):
    """Grid-search an ``XGBRegressor`` on the raw (unstandardized) features.

    Parameters
    ----------
    train_x, train_y
        Training features and targets.
    test_x
        Features used for one prediction whose result is not kept.
    test_y
        Accepted for signature parity with the other model builders; not used.

    Returns
    -------
    The fitted ``GridSearchCV`` object (5-fold CV, negative MSE scoring).
    The best parameter combination is printed as a side effect.
    """
    search_grid = dict(
        objective=['reg:squarederror'],
        n_estimators=[15, 20, 25, 30],
        booster=['gbtree'],
        alpha=np.arange(0, 1, 0.1),
        eval_metric=['rmse'],
        max_depth=np.arange(1, 8, 1),
    )

    search = GridSearchCV(
        estimator=xgb.XGBRegressor(),
        param_grid=[search_grid],
        scoring='neg_mean_squared_error',
        cv=5,
    )
    search.fit(train_x, train_y)
    print(search.best_params_)

    # The test prediction is computed but its result is currently unused.
    search.predict(test_x)

    return search


print('--- Alcohol Data ---')
xgb_alcohol = xgboost_reg(
    train_x=train_alcohol_X,
    test_x=test_alcohol_X,
    train_y=train_alcohol_y,
    test_y=test_alcohol_y,
)
print('Trained')
print('--- Covid Data ---')
xgb_covid = xgboost_reg(
    train_x=covid_train_X,
    test_x=covid_test_x_list[0],
    train_y=covid_train_y,
    test_y=covid_test_y_list[0],
)
print('Trained')

In [None]:
from sklearn.ensemble import RandomForestRegressor


# Random Forests

def random_forests(train_x, test_x, train_y, test_y):
    """Grid-search a ``RandomForestRegressor`` on the raw (unstandardized) features.

    Parameters
    ----------
    train_x, train_y
        Training features and targets.
    test_x
        Features used for one prediction whose result is not kept.
    test_y
        Accepted for signature parity with the other model builders; not used.

    Returns
    -------
    The fitted ``GridSearchCV`` object (5-fold CV, negative MAE scoring).
    The best parameter combination is printed as a side effect.
    """
    grid = [
        {'n_estimators': [50, 70, 100],
         # 1.0 (use all features) replaces the former 'auto' option, which was
         # deprecated in scikit-learn 1.1 and removed in 1.3; for regressors
         # 'auto' meant "all features", so the search space is unchanged.
         'max_features': [1.0, 'sqrt'],
         'max_depth': [5, 10, 15, 20],
         'min_samples_split': [2, 4, 6],
         'min_samples_leaf': [1],
         'bootstrap': [True]}]

    rf = GridSearchCV(RandomForestRegressor(), param_grid=grid, cv=5, scoring='neg_mean_absolute_error')
    rf.fit(train_x, train_y)

    # The test prediction is computed but its result is currently unused.
    rf.predict(test_x)
    print(rf.best_params_)

    return rf


print('--- Alcohol Data ---')
rf_alcohol = random_forests(
    train_x=train_alcohol_X,
    test_x=test_alcohol_X,
    train_y=train_alcohol_y,
    test_y=test_alcohol_y,
)
print('Trained')
print('--- Covid Data ---')
rf_covid = random_forests(
    train_x=covid_train_X,
    test_x=covid_test_x_list[0],
    train_y=covid_train_y,
    test_y=covid_test_y_list[0],
)
print('Trained')

In [None]:
import keras.layers as layer
from keras.models import Sequential


# LSTM 3-Layer - Recurrent Neural Network

def lstm_rnn(train_x, test_x, train_y, test_y):
    """Train a three-layer LSTM regressor on standardized features.

    Each row is presented to the network as a sequence of length 1, i.e. the
    2-D feature matrix is reshaped to ``(rows, 1, columns)``.

    Parameters
    ----------
    train_x, test_x
        Feature frames; each is passed through ``utils.standardize`` and has
        remaining NaNs replaced by 0.
    train_y
        Training targets (must expose ``.values``).
    test_y
        Must expose ``.values``; otherwise unused.

    Returns
    -------
    The fitted Keras ``Sequential`` model (MAE loss, Adam optimizer).
    """
    scaled_train = utils.standardize(train_x).fillna(0)
    scaled_test = utils.standardize(test_x).fillna(0)

    train_matrix = scaled_train.values
    train_targets = train_y.values
    test_matrix = scaled_test.values
    test_targets = test_y.values  # extracted but not used below

    # Insert a length-1 time axis: (rows, columns) -> (rows, 1, columns).
    n_train, n_train_cols = train_matrix.shape
    train_seq = train_matrix.reshape((n_train, 1, n_train_cols))
    n_test, n_test_cols = test_matrix.shape
    test_seq = test_matrix.reshape((n_test, 1, n_test_cols))

    model = Sequential()
    model.add(layer.LSTM(units=40, return_sequences=True, input_shape=train_seq.shape[1:]))
    model.add(layer.Dropout(0.25))
    model.add(layer.LSTM(units=25, return_sequences=True))
    model.add(layer.Dropout(0.20))
    # The last recurrent layer emits a single vector per sample for the output layer.
    model.add(layer.LSTM(units=10, return_sequences=False))
    model.add(layer.Dense(units=1, activation='linear'))

    model.compile(optimizer='adam', loss='mae')
    model.fit(train_seq, train_targets, batch_size=8, epochs=25, shuffle=False, verbose=0)

    # The test prediction is computed but its result is currently unused.
    model.predict(test_seq)

    return model


print('--- Alcohol Data ---')
lstm_alcohol = lstm_rnn(
    train_x=train_alcohol_X,
    test_x=test_alcohol_X,
    train_y=train_alcohol_y,
    test_y=test_alcohol_y,
)
print('Trained')
print('--- Covid Data ---')
lstm_covid = lstm_rnn(
    train_x=covid_train_X,
    test_x=covid_test_x_list[0],
    train_y=covid_train_y,
    test_y=covid_test_y_list[0],
)
print('Trained')

In [None]:
# LSTM 1-Layer - Recurrent Neural Network

def one_lstm_rnn(train_x, test_x, train_y, test_y):
    """Train a single-layer LSTM regressor on standardized features.

    Each row is presented to the network as a sequence of length 1, i.e. the
    2-D feature matrix is reshaped to ``(rows, 1, columns)``.

    Parameters
    ----------
    train_x, test_x
        Feature frames; each is passed through ``utils.standardize`` and has
        remaining NaNs replaced by 0.
    train_y
        Training targets (must expose ``.values``).
    test_y
        Accepted for signature parity with the other model builders; not used.

    Returns
    -------
    The fitted Keras ``Sequential`` model (MAE loss, Adam optimizer). Its
    ``predict`` output has one value per input row.
    """
    X_train_loc = utils.standardize(train_x).fillna(0)
    X_test_loc = utils.standardize(test_x).fillna(0)
    train_x_val, train_y_val, test_x_val = X_train_loc.values, train_y.values, X_test_loc.values

    # Insert a length-1 time axis: (rows, columns) -> (rows, 1, columns).
    train_x_val = train_x_val.reshape((train_x_val.shape[0], 1, train_x_val.shape[1]))
    test_x_val = test_x_val.reshape((test_x_val.shape[0], 1, test_x_val.shape[1]))

    model = Sequential([
        # return_sequences=False: the only recurrent layer must emit one vector
        # per sample so the Dense head outputs (rows, 1). With a per-timestep
        # (rows, 1, 1) output and 1-D targets, the element-wise MAE can
        # broadcast across the batch instead of pairing each prediction with
        # its own target. This matches the last LSTM layer of lstm_rnn.
        layer.LSTM(16, return_sequences=False, input_shape=(train_x_val.shape[1], train_x_val.shape[2])),
        layer.Dropout(0.25),
        layer.Dense(units=1, activation='linear'),
    ])
    model.compile(loss='mae', optimizer='adam')
    model.fit(train_x_val, train_y_val, epochs=15, batch_size=8, verbose=0, shuffle=False)

    # The test prediction is computed but its result is currently unused.
    model.predict(test_x_val)

    return model


print('--- Alcohol Data ---')
lstm1_alcohol = one_lstm_rnn(
    train_x=train_alcohol_X,
    test_x=test_alcohol_X,
    train_y=train_alcohol_y,
    test_y=test_alcohol_y,
)
print('Trained')
print('--- Covid Data ---')
lstm1_covid = one_lstm_rnn(
    train_x=covid_train_X,
    test_x=covid_test_x_list[0],
    train_y=covid_train_y,
    test_y=covid_test_y_list[0],
)
print('Trained')

### 2. Evaluating Performance on Entire Dataset (Alcohol)

In [None]:
# Per-patient metric accumulators, one list per model. The *_mtgnn lists are
# created here but are not filled by this cell.
r2_elastic, r2_svm, r2_one_lstm, r2_xgb, r2_rf, r2_lstm, r2_mtgnn = ([] for _ in range(7))
rmse_elastic, rmse_svm, rmse_one_lstm, rmse_xgb, rmse_rf, rmse_lstm, rmse_mtgnn = ([] for _ in range(7))
mae_elastic, mae_svm, mae_one_lstm, mae_xgb, mae_rf, mae_lstm, mae_mtgnn = ([] for _ in range(7))


def _log_alcohol_scores(out, title, actual, predicted, r2_scores, rmse_scores, mae_scores):
    """Score one model on one patient, store the metrics and write them to ``out``.

    Parameters
    ----------
    out : writable text file
        Report file the section is appended to.
    title : str
        Section header line (including its trailing newline).
    actual, predicted
        Observed and predicted targets, forwarded to ``utils.eval_results``.
    r2_scores, rmse_scores, mae_scores : list
        Accumulators that receive this patient's metrics.

    R^2 is floored at 0 before it is stored and written.
    """
    r2, rmse, mae = utils.eval_results(actual, predicted, False)
    r2 = max(0, r2)

    r2_scores.append(r2)
    rmse_scores.append(rmse)
    mae_scores.append(mae)

    out.write(title)
    out.write("R_squared: %s\n" % r2)
    out.write("RMSE: %s\n" % rmse)
    out.write("MAE: %s\n" % mae)
    out.write('\n')


# The context manager closes the report even if a model fails part-way through.
# Mode "a" appends, so re-running this cell adds another block to the file.
with open("output_nomothethic_a.txt", "a") as f:
    f.write('- - - PER INDIVIDUAL RESULTS GLOBAL MODELS - - -\n')

    for x in range(len(test_df)):
        # Evaluate the already-trained global models on one patient at a time.
        test_alcohol_X, test_alcohol_y = prepare_data(x, test_df)

        # Standardize once per patient and reuse the result for every model
        # trained on standardized input (assumes utils.standardize does not
        # modify its argument — the tree models below rely on the raw frame).
        test_alcohol_X_std = utils.standardize(test_alcohol_X).fillna(0)

        # The LSTM models were fitted on (rows, 1, columns) input, so the test
        # matrix needs the same length-1 time axis before predict().
        test_alcohol_X_seq = test_alcohol_X_std.values.reshape(
            (test_alcohol_X_std.shape[0], 1, test_alcohol_X_std.shape[1]))

        f.write("Patient ID: %s\n" % test_df[x]['ID'][0])
        f.write('\n')

        # Elastic-Net (baseline)
        _log_alcohol_scores(f, '--- Elastic-Net ---\n', test_alcohol_y,
                            elastic_alcohol.predict(test_alcohol_X_std),
                            r2_elastic, rmse_elastic, mae_elastic)

        # Linear-SVM
        _log_alcohol_scores(f, '--- Linear-SVM ---\n', test_alcohol_y,
                            svm_alcohol.predict(test_alcohol_X_std),
                            r2_svm, rmse_svm, mae_svm)

        # XGBoost Regression (trained on raw features)
        _log_alcohol_scores(f, '--- XGBoost ---\n', test_alcohol_y,
                            xgb_alcohol.predict(test_alcohol_X),
                            r2_xgb, rmse_xgb, mae_xgb)

        # Random Forests (trained on raw features)
        _log_alcohol_scores(f, '--- Random Forests ---\n', test_alcohol_y,
                            rf_alcohol.predict(test_alcohol_X),
                            r2_rf, rmse_rf, mae_rf)

        # LSTM 3-Layer RNN
        _log_alcohol_scores(f, '--- LSTM RNN ---\n', test_alcohol_y,
                            lstm_alcohol.predict(test_alcohol_X_seq).flatten(),
                            r2_lstm, rmse_lstm, mae_lstm)

        # LSTM 1-Layer RNN
        _log_alcohol_scores(f, '--- 1-LSTM RNN ---\n', test_alcohol_y,
                            lstm1_alcohol.predict(test_alcohol_X_seq).flatten(),
                            r2_one_lstm, rmse_one_lstm, mae_one_lstm)

# Summary per model, in the same order as the per-patient sections.
for heading, scores in (
        ('---- Elastic-Net Results ----', (r2_elastic, rmse_elastic, mae_elastic)),
        ('---- Linear SVM Results ----', (r2_svm, rmse_svm, mae_svm)),
        ('---- XGBoost Results ----', (r2_xgb, rmse_xgb, mae_xgb)),
        ('---- Random Forest Results ----', (r2_rf, rmse_rf, mae_rf)),
        ('---- LSTM Results ----', (r2_lstm, rmse_lstm, mae_lstm)),
        ('---- 1-LSTM Results ----', (r2_one_lstm, rmse_one_lstm, mae_one_lstm)),
):
    print(heading)
    utils.average_metrics(*scores)
    print('---------------------------------')

### 2.1 Evaluating Performance on Entire Dataset (COVID-19)

In [None]:
# Per-patient metric accumulators, one list per model. The *_mtgnn lists are
# created here but are not filled by this cell. Note that the rmse_* and mae_*
# names are rebound, replacing the lists filled by the alcohol evaluation.
mape_elastic, mape_svm, mape_one_lstm, mape_xgb, mape_rf, mape_lstm, mape_mtgnn = ([] for _ in range(7))
rmse_elastic, rmse_svm, rmse_one_lstm, rmse_xgb, rmse_rf, rmse_lstm, rmse_mtgnn = ([] for _ in range(7))
mae_elastic, mae_svm, mae_one_lstm, mae_xgb, mae_rf, mae_lstm, mae_mtgnn = ([] for _ in range(7))


def _log_covid_scores(out, title, actual, predicted, mape_scores, rmse_scores, mae_scores):
    """Score one model on one patient, store the metrics and write them to ``out``.

    Parameters
    ----------
    out : writable text file
        Report file the section is appended to.
    title : str
        Section header line (including its trailing newline).
    actual, predicted
        Observed and predicted targets, forwarded to ``utils.eval_results_covid``.
    mape_scores, rmse_scores, mae_scores : list
        Accumulators that receive this patient's metrics.
    """
    mape, rmse, mae = utils.eval_results_covid(actual, predicted, False)

    mape_scores.append(mape)
    rmse_scores.append(rmse)
    mae_scores.append(mae)

    out.write(title)
    out.write("MAPE: %s\n" % mape)
    out.write("RMSE: %s\n" % rmse)
    out.write("MAE: %s\n" % mae)
    out.write('\n')


# The context manager closes the report even if a model fails part-way through.
# Mode "a" appends, so re-running this cell adds another block to the file.
with open("output_nomothethic_c.txt", "a") as f:
    f.write('- - - PER INDIVIDUAL RESULTS GLOBAL MODELS - - -\n')

    # Iterate over the test splits themselves (the only lists read here) rather
    # than over the length of the training list.
    for z, (covid_test_X, covid_test_y) in enumerate(zip(covid_test_x_list, covid_test_y_list)):
        # Evaluate the already-trained global models on one patient at a time.

        # Standardize once per patient and reuse the result for every model
        # trained on standardized input (assumes utils.standardize does not
        # modify its argument — the tree models below rely on the raw frame).
        covid_test_X_std = utils.standardize(covid_test_X).fillna(0)

        # The LSTM models were fitted on (rows, 1, columns) input, so the test
        # matrix needs the same length-1 time axis before predict().
        covid_test_X_seq = covid_test_X_std.values.reshape(
            (covid_test_X_std.shape[0], 1, covid_test_X_std.shape[1]))

        f.write("Patient ID: %s\n" % z)
        f.write('\n')

        # Elastic-Net (baseline)
        _log_covid_scores(f, '--- Elastic-Net ---\n', covid_test_y,
                          elastic_covid.predict(covid_test_X_std),
                          mape_elastic, rmse_elastic, mae_elastic)

        # Linear-SVM
        _log_covid_scores(f, '--- Linear-SVM ---\n', covid_test_y,
                          svm_covid.predict(covid_test_X_std),
                          mape_svm, rmse_svm, mae_svm)

        # XGBoost Regression (trained on raw features)
        _log_covid_scores(f, '--- XGBoost ---\n', covid_test_y,
                          xgb_covid.predict(covid_test_X),
                          mape_xgb, rmse_xgb, mae_xgb)

        # Random Forests (trained on raw features)
        _log_covid_scores(f, '--- Random Forests ---\n', covid_test_y,
                          rf_covid.predict(covid_test_X),
                          mape_rf, rmse_rf, mae_rf)

        # LSTM 3-Layer RNN
        _log_covid_scores(f, '--- LSTM RNN ---\n', covid_test_y,
                          lstm_covid.predict(covid_test_X_seq).flatten(),
                          mape_lstm, rmse_lstm, mae_lstm)

        # LSTM 1-Layer RNN
        _log_covid_scores(f, '--- 1-LSTM RNN ---\n', covid_test_y,
                          lstm1_covid.predict(covid_test_X_seq).flatten(),
                          mape_one_lstm, rmse_one_lstm, mae_one_lstm)

# Summary per model, in the same order as the per-patient sections.
for heading, scores in (
        ('---- Elastic-Net Results ----', (mape_elastic, rmse_elastic, mae_elastic)),
        ('---- Linear SVM Results ----', (mape_svm, rmse_svm, mae_svm)),
        ('---- XGBoost Results ----', (mape_xgb, rmse_xgb, mae_xgb)),
        ('---- Random Forest Results ----', (mape_rf, rmse_rf, mae_rf)),
        ('---- LSTM Results ----', (mape_lstm, rmse_lstm, mae_lstm)),
        ('---- 1-LSTM Results ----', (mape_one_lstm, rmse_one_lstm, mae_one_lstm)),
):
    print(heading)
    utils.average_metrics_covid(*scores)
    print('---------------------------------')