# 1-step Forecasting with linear and non-linear models

In [552]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import LinearSVR
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
import xgboost as xgb
from xgboost import plot_importance, plot_tree
from sklearn.model_selection import train_test_split
from sklearn import linear_model as lm
from sklearn.neighbors import KNeighborsRegressor
import sklearn.metrics as metrics

from statsmodels.tsa.api import VAR
from statsmodels.tsa.stattools import adfuller
from statsmodels.tools.eval_measures import rmse, aic
from statsmodels.tsa.vector_ar.vecm import coint_johansen

import load_data

# Figure defaults (size in inches, resolution in dpi), set before seaborn's theme is applied
plt.rcParams.update({'figure.figsize': (16, 8), 'figure.dpi': 150})
sns.set()

In [553]:
train_df, test_df, data_raw_list = load_data.load_alcohol()

combined_data = []

for i in range(len(train_df)):
    train, test = train_df[i], test_df[i]
    # The initial split was 50/50, so rejoin both halves, parse the 'start'
    # timestamps and order the rows chronologically
    combined = (
        pd.concat([train, test])
        .assign(start=lambda frame: pd.to_datetime(frame['start']))
        .sort_values(by='start')
    )
    combined_data.append(combined)

# One frame holding every individual's rows, re-indexed from 0
global_data = pd.concat(combined_data, ignore_index=True)
combined_data[0].head()

Unnamed: 0.1,Unnamed: 0,ID,start,finish,drinks,comfortable,stressed,down,calm,pressure,...,cosT.1,sinT.1,cos2T.1,sin2T.1,cosW.1,sinW.1,dayvar.1,beepvar.1,filter.1,consec.1
0,1,1,2018-02-06 16:20:00,2/6/2018 16:22,3,7.382609,-9.817391,10.843478,-37.791304,6.173913,...,1.0,0.0,1.0,0.0,1.0,0.0,1,4,0,1
31,2,1,2018-02-06 18:54:00,2/6/2018 18:58,0,14.382609,47.182609,7.843478,7.208696,10.173913,...,0.892979,0.450098,0.594823,0.803857,0.997777,0.066647,1,5,0,2
1,3,1,2018-02-06 20:08:00,2/6/2018 20:22,0,15.382609,12.182609,10.843478,20.208696,18.173913,...,0.41866,0.908143,-0.649448,0.760406,0.986795,0.161973,1,6,0,3
2,4,1,2018-02-06 22:29:00,2/6/2018 22:46,0,21.382609,-5.817391,-2.156522,8.208696,5.173913,...,0.108867,0.994056,-0.976296,0.21644,0.978277,0.207302,1,7,0,4
36,5,1,2018-02-07 10:52:00,2/7/2018 11:23,0,-11.617391,5.182609,0.843478,-24.791304,-4.826087,...,0.043619,-0.999048,-0.996195,-0.087156,0.77793,0.628351,2,1,0,7


## 1. Idiographic Models Regression

In [554]:
# Predict craving

# Collect train and test sets as was done in the paper
PATIENT_IDX = 12      # index of the individual inside train_df / test_df
N_LEADING_COLS = 61   # the first 61 columns are excluded from the predictors

patient_train = train_df[PATIENT_IDX]
patient_test = test_df[PATIENT_IDX]

# .iloc[0] reads the first row by position; a label lookup (['ID'][0]) raises
# KeyError whenever no row carries the index label 0.
print('Train Patient ID:', patient_train['ID'].iloc[0])
print('Test Patient ID:', patient_test['ID'].iloc[0])

# Predictors: every column after the leading ones, with missing values set to 0.
# Target: the 'craving' column of the same frame.
X_train = patient_train.drop(columns=patient_train.columns[:N_LEADING_COLS]).fillna(0)
y_train = patient_train['craving']
X_test = patient_test.drop(columns=patient_test.columns[:N_LEADING_COLS]).fillna(0)
y_test = patient_test['craving']


def standardize(data):
    """Return a z-scored copy of ``data``; the input frame is not modified.

    Every column is centred on its own mean and divided by its own population
    standard deviation (``ddof=0``, the quantity ``np.std`` computes).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are all numeric.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index and columns as ``data``. A column with
        zero standard deviation comes back as NaN (0 / 0), so callers that
        need finite values should follow up with ``.fillna(...)``.
    """
    # Arithmetic between a frame and a per-column Series aligns on column
    # labels, so each column is paired with its own mean / std.
    centered = data - data.mean()
    return centered / data.std(ddof=0)


Train Patient ID: 14
Test Patient ID: 14


### 1.1 Lasso Regression

In [555]:
# Z-score the predictors; NaNs produced for constant columns are set to 0.
# NOTE(review): the test predictors are scaled with their own mean/std rather
# than with the training statistics - confirm this matches the intended protocol.
X_train = standardize(X_train).fillna(0)
X_test = standardize(X_test).fillna(0)

# Lasso whose penalty strength is picked by 5-fold CV over the alpha grid
alphas = np.arange(0.1, 200, .1)
lasso_reg = lm.LassoCV(alphas=alphas, cv=5, max_iter=10000)
lasso_reg.fit(X_train, y_train)
y_predicted_test = lasso_reg.predict(X_test)

print('R_squared:', metrics.r2_score(y_test, y_predicted_test))
print('MAPE:', metrics.mean_absolute_percentage_error(y_test, y_predicted_test))
# RMSE computed as sqrt(MSE): the `squared=False` shortcut is deprecated in
# scikit-learn 1.4 and removed in 1.6, whereas this form runs on every version.
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_predicted_test)))

R_squared: 0.5821437826058486
MAPE: 1.17437925717085
RMSE: 8.39410535185786


### 1.2 Elastic-Net Regression

In [556]:
# Elastic-Net: 5-fold CV searches the L1 ratio grid and the alpha grid together.
# X_train / X_test are not rebuilt here, so this cell uses whatever the
# preceding cell left in them.
l1_ratios = np.arange(0.5, 1, 0.05)
alphas = np.arange(0.1, 100, .1)
elastic_reg = lm.ElasticNetCV(alphas=alphas, cv=5, l1_ratio=l1_ratios, max_iter=10000)
elastic_reg.fit(X_train, y_train)

y_predicted_test = elastic_reg.predict(X_test)

print('R_squared:', metrics.r2_score(y_test, y_predicted_test))
print('MAPE:', metrics.mean_absolute_percentage_error(y_test, y_predicted_test))
# RMSE computed as sqrt(MSE): the `squared=False` shortcut is deprecated in
# scikit-learn 1.4 and removed in 1.6, whereas this form runs on every version.
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_predicted_test)))

R_squared: 0.6330224747427082
MAPE: 0.9195012167791153
RMSE: 7.866484932707291


### 1.3 Linear SVM Regression

In [557]:
# Hyper-parameter grid for the linear SVR (every combination is tried)
params = [
    {'C': np.arange(0.1, 4, 0.1),
     'epsilon': np.arange(6, 7, 0.1),
     'loss': ['epsilon_insensitive'],
     'fit_intercept': [False],
     'max_iter': [10000]}]

# random_state is pinned so that repeated runs of this cell select the same
# model; LinearSVR is otherwise left unseeded.
clf = GridSearchCV(estimator=LinearSVR(random_state=0), param_grid=params, scoring='r2', cv=5)
clf.fit(X_train, y_train)
best_params = clf.best_params_
print(best_params)

# GridSearchCV.predict delegates to the estimator refit with the best parameters
y_predicted_test = clf.predict(X_test)

print('R_squared:', metrics.r2_score(y_test, y_predicted_test))
print('MAPE:', metrics.mean_absolute_percentage_error(y_test, y_predicted_test))
# RMSE computed as sqrt(MSE): the `squared=False` shortcut is deprecated in
# scikit-learn 1.4 and removed in 1.6, whereas this form runs on every version.
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_predicted_test)))

{'C': 0.1, 'epsilon': 6.0, 'fit_intercept': False, 'loss': 'epsilon_insensitive', 'max_iter': 10000}
R_squared: 0.5662840357157135
MAPE: 0.9496389517582751
RMSE: 8.551921098644316


### 1.4 K-NN Regression

In [566]:
# Rebuild predictors and target from the raw frames, so K-NN runs on features
# that have not been passed through standardize().
# NOTE(review): K-NN is distance based - confirm that leaving the features
# unscaled here is intentional.
patient_train, patient_test = train_df[12], test_df[12]
X_train = patient_train.drop(patient_train.columns[range(0, 61)], axis=1).fillna(0)
y_train = patient_train['craving']
X_test = patient_test.drop(patient_test.columns[range(0, 61)], axis=1).fillna(0)
y_test = patient_test['craving']

# Grid: neighbour weighting scheme x number of neighbours (1..29)
params = [
    {'weights': ['uniform', 'distance'],
     'n_neighbors': np.arange(1, 30, 1)}]

clf = GridSearchCV(estimator=KNeighborsRegressor(), param_grid=params, scoring='r2', cv=5)
clf.fit(X_train, y_train)
best_params = clf.best_params_
print(best_params)

y_predicted_test = clf.predict(X_test)

print('R_squared:', metrics.r2_score(y_test, y_predicted_test))
print('MAPE:', metrics.mean_absolute_percentage_error(y_test, y_predicted_test))
# RMSE computed as sqrt(MSE): the `squared=False` shortcut is deprecated in
# scikit-learn 1.4 and removed in 1.6, whereas this form runs on every version.
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_predicted_test)))

{'n_neighbors': 15, 'weights': 'uniform'}
R_squared: 0.13873596353952167
MAPE: 1.360770936631153
RMSE: 12.051167916353508


### 1.5 XGBoost Regression

In [568]:
# Z-score the predictors; NaNs produced for constant columns are set to 0
X_train = standardize(X_train).fillna(0)
X_test = standardize(X_test).fillna(0)

# Very simple models work better here
params = [
    {'objective': ['reg:squarederror'],
     'n_estimators': np.arange(1, 7, 1),
     'eval_metric': ['rmse'],
     'max_depth': np.arange(1, 5, 1)}]

# 5-fold grid search scored by R^2, run on 5 parallel workers
reg_xgb = GridSearchCV(xgb.XGBRegressor(), params, n_jobs=5, cv=5, scoring='r2')
reg_xgb.fit(X_train, y_train)

y_predicted_test = reg_xgb.predict(X_test)

print('R_squared:', metrics.r2_score(y_test, y_predicted_test))
print('MAPE:', metrics.mean_absolute_percentage_error(y_test, y_predicted_test))
# RMSE computed as sqrt(MSE): the `squared=False` shortcut is deprecated in
# scikit-learn 1.4 and removed in 1.6, whereas this form runs on every version.
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_predicted_test)))

R_squared: 0.5803051003774975
MAPE: 1.0654934881024856
RMSE: 8.412553265735943


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


### 1.6 LSTM RNN

### 1.7 MTGNN

## 2. Nomothetic Models Regression