# 1-step Forecasting with linear and non-linear models (Nomothetic)

In [18]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import LinearSVR
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from sklearn import linear_model as lm
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics

import utils

# Plot settings
plt.rcParams['figure.figsize'] = (16, 8)
plt.rcParams['figure.dpi'] = 150
sns.set()

In [19]:
# Reading alcohol data
train_df, test_df, data_raw_list = utils.load_alcohol()

combined_data = []

for i in range(len(train_df)):
    train = train_df[i]
    test = test_df[i]
    # Combine both train and test sets since the initial split was 50/50
    combined = pd.concat([train, test])
    # Sort by date
    combined['start'] = pd.to_datetime(combined['start'])
    combined = combined.sort_values(by='start')
    combined_data.append(combined)

combined_data[0].head()

Unnamed: 0.1,Unnamed: 0,ID,start,finish,drinks,comfortable,stressed,down,calm,pressure,...,cosT.1,sinT.1,cos2T.1,sin2T.1,cosW.1,sinW.1,dayvar.1,beepvar.1,filter.1,consec.1
0,1,1,2018-02-06 16:20:00,2/6/2018 16:22,3,7.382609,-9.817391,10.843478,-37.791304,6.173913,...,1.0,0.0,1.0,0.0,1.0,0.0,1,4,0,1
31,2,1,2018-02-06 18:54:00,2/6/2018 18:58,0,14.382609,47.182609,7.843478,7.208696,10.173913,...,0.892979,0.450098,0.594823,0.803857,0.997777,0.066647,1,5,0,2
1,3,1,2018-02-06 20:08:00,2/6/2018 20:22,0,15.382609,12.182609,10.843478,20.208696,18.173913,...,0.41866,0.908143,-0.649448,0.760406,0.986795,0.161973,1,6,0,3
2,4,1,2018-02-06 22:29:00,2/6/2018 22:46,0,21.382609,-5.817391,-2.156522,8.208696,5.173913,...,0.108867,0.994056,-0.976296,0.21644,0.978277,0.207302,1,7,0,4
36,5,1,2018-02-07 10:52:00,2/7/2018 11:23,0,-11.617391,5.182609,0.843478,-24.791304,-4.826087,...,0.043619,-0.999048,-0.996195,-0.087156,0.77793,0.628351,2,1,0,7


In [52]:
# Reading covid data
covid_data = utils.load_covid()

covid_data = covid_data.drop(columns=['ID', 'ID_lag', 'time', 'time_lag', 'Duration', 'Duration_lag'])

covid_data = covid_data.dropna()

covid_X = covid_data.drop(covid_data.columns[range(0, 19)], axis=1)
covid_y = covid_data['C19_worry']

covid_train_x, covid_test_x, covid_train_y, covid_test_y = train_test_split(covid_X, covid_y, test_size=0.3, random_state=42)

# 1. Nomothetic Models Regression

In [54]:
# Reading in the train data
train_global = pd.read_csv('data/alcohol_data/global (mean-centered)/n.train.csv')
train_global_x = train_global.drop(train_global.columns[range(0, 60)], axis=1).fillna(0)
# train_global_x = train_global_x.drop(train_global_x.columns[range(20, 57)], axis=1).fillna(0)
train_global_y = train_global['craving']

train_global_x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1326 entries, 0 to 1325
Data columns (total 57 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   drinks.1            1326 non-null   int64  
 1   comfortable.1       1326 non-null   float64
 2   stressed.1          1326 non-null   float64
 3   down.1              1326 non-null   float64
 4   calm.1              1326 non-null   float64
 5   pressure.1          1326 non-null   float64
 6   enthusiastic.1      1326 non-null   float64
 7   happy.1             1326 non-null   float64
 8   conflict.1          1326 non-null   float64
 9   craving.1           1326 non-null   float64
 10  impulsive.1         1326 non-null   float64
 11  posexpect.1         1326 non-null   float64
 12  peerpercent.1       1326 non-null   float64
 13  wantdrink.1         1326 non-null   float64
 14  delay_grat.1        1326 non-null   float64
 15  angry.1             1326 non-null   float64
 16  drinkp

In [55]:
# Reading in the test data
test_global = pd.read_csv('data/alcohol_data/global (mean-centered)/n.test.csv')
test_global_x = test_global.drop(test_global.columns[range(0, 60)], axis=1).fillna(0)
# test_global_x = test_global_x.drop(test_global_x.columns[range(20, 57)], axis=1).fillna(0)
test_global_y = test_global['craving']

X_test_list = []
y_test_list = []
unique_ids = np.unique(test_global['ID'])
for i in range(len(unique_ids)):
    individual = test_global.loc[test_global['ID'] == unique_ids[i]]
    individual_x = individual.drop(individual.columns[range(0, 60)], axis=1).fillna(0)
    # X_test_list.append(individual_x.drop(individual_x.columns[range(20, 57)], axis=1).fillna(0))
    X_test_list.append(individual_x)
    y_test_list.append(individual['craving'])

X_test_list[10].head()

Unnamed: 0,drinks.1,comfortable.1,stressed.1,down.1,calm.1,pressure.1,enthusiastic.1,happy.1,conflict.1,craving.1,...,cos2T.1,sin2T.1,cosW.1,sinW.1,dayvar.1,beepvar.1,filter.1,consec.1,NA.,NA..1
473,0,32.144068,-37.20339,-28.686441,8.09322,-2.771186,44.084746,16.728814,-9.542373,-21.728814,...,1.0,0.0,1.0,0.0,1,4,0,1,0.0,0.0
474,1,21.144068,-37.20339,-25.686441,-27.90678,-2.771186,20.084746,25.728814,-9.542373,-21.728814,...,0.48481,-0.87462,0.681086,0.732203,2,3,0,9,0.0,0.0
475,2,32.144068,-37.20339,-27.686441,34.09322,-2.771186,35.084746,35.728814,-9.542373,28.271186,...,-0.190809,0.981627,0.520371,0.85394,2,6,0,12,0.0,0.0
476,3,32.144068,-35.20339,-25.686441,45.09322,-2.771186,23.084746,35.728814,-8.542373,-21.728814,...,-0.987688,0.156435,0.443965,0.896044,2,7,0,13,0.0,0.0
477,0,32.144068,-37.20339,-26.686441,22.09322,-0.771186,-40.915254,-5.271186,56.457627,-21.728814,...,-0.920505,0.390731,0.028669,0.999589,3,1,0,16,0.0,0.0


In [56]:
# Elastic-Net
def elastic_net(train_x, test_x, train_y, test_y):
    X_train_loc = utils.standardize(train_x).fillna(0)
    X_test_loc = utils.standardize(test_x).fillna(0)

    l1_ratios = np.arange(0.01, 0.6, 0.05)
    elastic_reg = lm.ElasticNetCV(alphas=np.arange(0.01, 20, 0.05), l1_ratio=l1_ratios, cv=5, max_iter=100000,
                                  fit_intercept=True)
    elastic_reg.fit(X_train_loc, train_y)
    y_predicted_test = elastic_reg.predict(X_test_loc)

    print('--- Elastic-Net Global Results ---')
    utils.eval_results(actual=test_y, predicted=y_predicted_test, show=True)

print('--- Alcohol Data ---')
elastic_net(train_global_x, test_global_x, train_global_y, test_global_y)
print('--- Covid Data ---')
elastic_net(covid_train_x, covid_test_x, covid_train_y, covid_test_y)

--- Alcohol Data ---
--- Elastic-Net Global Results ---
R_squared: 0.3111495401741893
MAPE: 1.7065650938124857
RMSE: 14.320462960174392
MAE: 9.602303099669058
CORR: 0.5697637975019556
--- Covid Data ---
--- Elastic-Net Global Results ---
R_squared: 0.4589016276507417
MAPE: 0.30795570817365764
RMSE: 0.6182030460549277
MAE: 0.4566836537280723
CORR: 0.678954617048342


In [57]:
# Linear-SVM
def linear_svm(train_x, test_x, train_y, test_y):
    X_train_loc = utils.standardize(train_x).fillna(0)
    X_test_loc = utils.standardize(test_x).fillna(0)

    params = [
        {'C': np.arange(0.1, 4, 0.1),
         'epsilon': np.arange(6, 7, 0.1),
         'loss': ['epsilon_insensitive'],
         'fit_intercept': [True],
         'max_iter': [10000]}]

    clf = GridSearchCV(estimator=LinearSVR(), param_grid=params, scoring='r2', cv=5)
    clf.fit(X_train_loc, train_y)
    # best_params = clf.best_params_
    # print(best_params)
    y_predicted_test = clf.predict(X_test_loc)
    print('--- Linear-SVM Global Results ---')
    utils.eval_results(actual=test_y, predicted=y_predicted_test, show=True)


print('--- Alcohol Data ---')
linear_svm(train_global_x, test_global_x, train_global_y, test_global_y)
print('--- Covid Data ---')
linear_svm(covid_train_x, covid_test_x, covid_train_y, covid_test_y)

--- Alcohol Data ---
--- Linear-SVM Global Results ---
R_squared: 0.31527268796699814
MAPE: 1.7852204591663705
RMSE: 14.277540723152345
MAE: 9.304087984492881
CORR: 0.5701790096176893
--- Covid Data ---
--- Linear-SVM Global Results ---
R_squared: -4.197145031083166
MAPE: 1.0
RMSE: 1.9159115035483552
MAE: 1.7217496962332928
CORR: nan


  c /= stddev[:, None]
  c /= stddev[None, :]


In [58]:
# XGBoost
def xgboost_reg(train_x, test_x, train_y, test_y):
    params = [
        {'objective': ['reg:squarederror'],
         'n_estimators': [20, 25, 30],
         'booster': ['gbtree'],
         'alpha': np.arange(0, 1, 0.1),
         'eval_metric': ['rmse'],
         'max_depth': np.arange(1, 8, 1)}]

    reg_xgb = GridSearchCV(xgb.XGBRegressor(), params, cv=5, scoring='r2')
    reg_xgb.fit(train_x, train_y)
    print(reg_xgb.best_params_)
    y_predicted_test = reg_xgb.predict(test_x)
    print('--- XGBoost Global Results ---')
    utils.eval_results(actual=test_y, predicted=y_predicted_test, show=True)


print('--- Alcohol Data ---')
xgboost_reg(train_global_x, test_global_x, train_global_y, test_global_y)
print('--- Covid Data ---')
xgboost_reg(covid_train_x, covid_test_x, covid_train_y, covid_test_y)

--- Alcohol Data ---
{'alpha': 0.0, 'booster': 'gbtree', 'eval_metric': 'rmse', 'max_depth': 1, 'n_estimators': 25, 'objective': 'reg:squarederror'}
--- XGBoost Global Results ---
R_squared: 0.2841351799677859
MAPE: 1.5832776336563623
RMSE: 14.598562453412658
MAE: 9.749247074954727
CORR: 0.5439972489072176
--- Covid Data ---
{'alpha': 0.9, 'booster': 'gbtree', 'eval_metric': 'rmse', 'max_depth': 2, 'n_estimators': 20, 'objective': 'reg:squarederror'}
--- XGBoost Global Results ---
R_squared: 0.45491114719617065
MAPE: 0.31141959869364294
RMSE: 0.6204784138366619
MAE: 0.46572753392331945
CORR: 0.6756696659280473


In [60]:
import keras.layers as layer
from keras.models import Sequential

# LSTM - Recurrent Neural Network
def lstm_rnn(train_x, test_x, train_y, test_y):
    X_train_loc = utils.standardize(train_x).fillna(0)
    X_test_loc = utils.standardize(test_x).fillna(0)
    train_x_val, train_y_val, test_x_val, test_y_val = X_train_loc.values, train_y.values, X_test_loc.values, test_y.values

    train_x_val = train_x_val.reshape((train_x_val.shape[0], 1, train_x_val.shape[1]))
    test_x_val = test_x_val.reshape((test_x_val.shape[0], 1, test_x_val.shape[1]))

    model = Sequential([
        layer.LSTM(40, return_sequences=True, input_shape=(train_x_val.shape[1], train_x_val.shape[2])),
        layer.Dropout(0.25),
        layer.LSTM(units=25, return_sequences=True),
        layer.Dropout(0.20),
        layer.LSTM(units=10, return_sequences=False),
        layer.Dense(units=1, activation='linear'),
    ])
    model.compile(loss='mae', optimizer='adam')
    model.fit(train_x_val, train_y_val, epochs=30, batch_size=8, verbose=0, shuffle=False)

    y_predicted_test = model.predict(test_x_val)

    utils.eval_results(actual=test_y, predicted=y_predicted_test.flatten(), show=True)

print('--- Alcohol Data ---')
lstm_rnn(train_global_x, test_global_x, train_global_y, test_global_y)
print('--- Covid Data ---')
lstm_rnn(covid_train_x, covid_test_x, covid_train_y, covid_test_y)

--- Alcohol Data ---
R_squared: 0.22291570165871555
MAPE: 1.9042010916349585
RMSE: 15.209980206167824
MAE: 9.63807734271637
CORR: 0.4967365047256148
--- Covid Data ---
R_squared: 0.4163023526541726
MAPE: 0.2454156702542025
RMSE: 0.6420768300390796
MAE: 0.3953051690922038
CORR: 0.6645473152084025
