# 1-step Forecasting with linear and non-linear models (Nomothetic)

In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import LinearSVR
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from sklearn import linear_model as lm
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics

import utils

# Plot settings
plt.rcParams['figure.figsize'] = (16, 8)
plt.rcParams['figure.dpi'] = 150
sns.set()

In [2]:
# Reading alcohol data
train_df, test_df, data_raw_list = utils.load_alcohol()

combined_data = []

train_alcohol = pd.concat(train_df, ignore_index=True)
train_alcohol_X = train_alcohol.drop(train_alcohol.columns[range(0, 61)], axis=1).fillna(0)
train_alcohol_y = train_alcohol['craving']


def prepare_data(idx, test_list):
    # print('Patient ID:', test_list[idx]['ID'][0])
    X_test = test_list[idx].drop(test_list[idx].columns[range(0, 61)], axis=1).fillna(0)
    y_test = test_list[idx]['craving']
    return X_test, y_test


In [3]:
# Loading train and test data of covid patients
covid_train_x_list, covid_test_x_list, covid_train_y_list, covid_test_y_list = utils.patients_covid()

covid_train_X = pd.concat(covid_train_x_list, ignore_index=True)
covid_train_y = pd.concat(covid_train_y_list, ignore_index=True)


Patient included in study:
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 46, 48, 49, 50, 52, 53, 54, 55, 57, 58, 59, 63, 64, 65, 66, 67, 68, 70, 71, 72, 73, 74, 75, 77, 78]


# 1. Nomothetic Models Regression

In [9]:
# Elastic-Net
def elastic_net(train_x, test_x, train_y, test_y):
    X_train_loc = utils.standardize(train_x).fillna(0)
    X_test_loc = utils.standardize(test_x).fillna(0)

    l1_ratios = np.arange(0.01, 0.6, 0.05)
    elastic_reg = lm.ElasticNetCV(alphas=np.arange(0.01, 20, 0.05), l1_ratio=l1_ratios, cv=5, max_iter=100000,
                                  fit_intercept=True)
    elastic_reg.fit(X_train_loc, train_y)
    y_predicted_test = elastic_reg.predict(X_test_loc)

    print('--- Elastic-Net Global Results ---')
    utils.eval_results(actual=test_y, predicted=y_predicted_test, show=True)


print('--- Alcohol Data ---')
test_alcohol_X, test_alcohol_y = prepare_data(1, test_df)
elastic_net(train_alcohol_X, test_alcohol_X, train_alcohol_y, test_alcohol_y)
print('--- Covid Data ---')
elastic_net(covid_train_X, covid_test_x_list[4], covid_train_y, covid_test_y_list[4])

--- Alcohol Data ---
--- Elastic-Net Global Results ---
R_squared: 0.3739585079648516
MAPE: 0.8727735940920615
RMSE: 20.49355174027907
MAE: 18.52613409884371
CORR: 0.611521469749715
--- Covid Data ---
--- Elastic-Net Global Results ---
R_squared: 0.6634627901021372
MAPE: 0.3886173418152433
RMSE: 0.4984005787041621
MAE: 0.45580592466727055
CORR: 0.8145322523400391


In [10]:
# Linear-SVM
def linear_svm(train_x, test_x, train_y, test_y):
    X_train_loc = utils.standardize(train_x).fillna(0)
    X_test_loc = utils.standardize(test_x).fillna(0)

    params = [
        {'C': np.arange(0.1, 4, 0.1),
         'epsilon': np.arange(6, 7, 0.1),
         'loss': ['epsilon_insensitive'],
         'fit_intercept': [True],
         'max_iter': [10000]}]

    clf = GridSearchCV(estimator=LinearSVR(), param_grid=params, scoring='r2', cv=5)
    clf.fit(X_train_loc, train_y)
    # best_params = clf.best_params_
    # print(best_params)
    y_predicted_test = clf.predict(X_test_loc)
    print('--- Linear-SVM Global Results ---')
    utils.eval_results(actual=test_y, predicted=y_predicted_test, show=True)


print('--- Alcohol Data ---')
linear_svm(train_alcohol_X, test_alcohol_X, train_alcohol_y, test_alcohol_y)
print('--- Covid Data ---')
linear_svm(covid_train_X, covid_test_x_list[4], covid_train_y, covid_test_y_list[4])

--- Alcohol Data ---
--- Linear-SVM Global Results ---
R_squared: 0.3692304162341966
MAPE: 0.8408454291054853
RMSE: 20.303494912671287
MAE: 17.95215955023266
CORR: 0.607643329786641
--- Covid Data ---
--- Linear-SVM Global Results ---
R_squared: nan
MAPE: 1.0
RMSE: 1.4411533842457842
MAE: 1.3076923076923077
CORR: nan


  c /= stddev[:, None]
  c /= stddev[None, :]


In [11]:
# XGBoost
def xgboost_reg(train_x, test_x, train_y, test_y):
    params = [
        {'objective': ['reg:squarederror'],
         'n_estimators': [20, 25, 30],
         'booster': ['gbtree'],
         'alpha': np.arange(0, 1, 0.1),
         'eval_metric': ['rmse'],
         'max_depth': np.arange(1, 8, 1)}]

    reg_xgb = GridSearchCV(xgb.XGBRegressor(), params, cv=5, scoring='r2')
    reg_xgb.fit(train_x, train_y)
    print(reg_xgb.best_params_)
    y_predicted_test = reg_xgb.predict(test_x)
    print('--- XGBoost Global Results ---')
    utils.eval_results(actual=test_y, predicted=y_predicted_test, show=True)


print('--- Alcohol Data ---')
xgboost_reg(train_alcohol_X, test_alcohol_X, train_alcohol_y, test_alcohol_y)
print('--- Covid Data ---')
xgboost_reg(covid_train_X, covid_test_x_list[4], covid_train_y, covid_test_y_list[4])

--- Alcohol Data ---
{'alpha': 0.9, 'booster': 'gbtree', 'eval_metric': 'rmse', 'max_depth': 1, 'n_estimators': 20, 'objective': 'reg:squarederror'}
--- XGBoost Global Results ---
R_squared: 0.29090242492218193
MAPE: 0.8906545349968121
RMSE: 21.077650879894065
MAE: 18.788489587340006
CORR: 0.5393537103999396
--- Covid Data ---
{'alpha': 0.30000000000000004, 'booster': 'gbtree', 'eval_metric': 'rmse', 'max_depth': 1, 'n_estimators': 20, 'objective': 'reg:squarederror'}
--- XGBoost Global Results ---
R_squared: 0.6990186372545738
MAPE: 0.29537581786131245
RMSE: 0.39020285886847317
MAE: 0.3671043377656203
CORR: 0.8360733444229482


In [12]:
import keras.layers as layer
from keras.models import Sequential


# LSTM - Recurrent Neural Network
def lstm_rnn(train_x, test_x, train_y, test_y):
    X_train_loc = utils.standardize(train_x).fillna(0)
    X_test_loc = utils.standardize(test_x).fillna(0)
    train_x_val, train_y_val, test_x_val, test_y_val = X_train_loc.values, train_y.values, X_test_loc.values, test_y.values

    train_x_val = train_x_val.reshape((train_x_val.shape[0], 1, train_x_val.shape[1]))
    test_x_val = test_x_val.reshape((test_x_val.shape[0], 1, test_x_val.shape[1]))

    model = Sequential([
        layer.LSTM(40, return_sequences=True, input_shape=(train_x_val.shape[1], train_x_val.shape[2])),
        layer.Dropout(0.25),
        layer.LSTM(units=25, return_sequences=True),
        layer.Dropout(0.20),
        layer.LSTM(units=10, return_sequences=False),
        layer.Dense(units=1, activation='linear'),
    ])
    model.compile(loss='mae', optimizer='adam')
    model.fit(train_x_val, train_y_val, epochs=30, batch_size=8, verbose=0, shuffle=False)

    y_predicted_test = model.predict(test_x_val)

    utils.eval_results(actual=test_y, predicted=y_predicted_test.flatten(), show=True)


print('--- Alcohol Data ---')
lstm_rnn(train_alcohol_X, test_alcohol_X, train_alcohol_y, test_alcohol_y)
print('--- Covid Data ---')
lstm_rnn(covid_train_X, covid_test_x_list[4], covid_train_y, covid_test_y_list[4])

--- Alcohol Data ---
R_squared: 0.2714364894166515
MAPE: 0.8856621244344209
RMSE: 21.457228176625897
MAE: 19.020308275194985
CORR: 0.5209956712072102
--- Covid Data ---
R_squared: 0.6231214283160843
MAPE: 0.1470502584408491
RMSE: 0.43600344400318214
MAE: 0.2363260525923509
CORR: 0.7893804078618143
