# 1-step Forecasting with linear and non-linear models

In [43]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import LinearSVR
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
import xgboost as xgb
from xgboost import plot_importance, plot_tree
from sklearn.model_selection import train_test_split
from sklearn import linear_model as lm
from sklearn.neighbors import KNeighborsRegressor
import sklearn.metrics as metrics

from statsmodels.tsa.api import VAR
from statsmodels.tsa.stattools import adfuller
from statsmodels.tools.eval_measures import rmse, aic
from statsmodels.tsa.vector_ar.vecm import coint_johansen

import load_data

# Plot settings
plt.rcParams['figure.figsize'] = (16, 8)
plt.rcParams['figure.dpi'] = 150
sns.set()

In [44]:
train_df, test_df, data_raw_list = load_data.load_alcohol()

combined_data = []

for i in range(len(train_df)):
    train = train_df[i]
    test = test_df[i]
    # Combine both train and test sets since the initial split was 50/50
    combined = pd.concat([train, test])
    # Sort by date
    combined['start'] = pd.to_datetime(combined['start'])
    combined = combined.sort_values(by='start')
    combined_data.append(combined)

# Dataset with all individual's data
global_data = pd.concat(combined_data, ignore_index=True)
combined_data[0].head()

Unnamed: 0.1,Unnamed: 0,ID,start,finish,drinks,comfortable,stressed,down,calm,pressure,...,cosT.1,sinT.1,cos2T.1,sin2T.1,cosW.1,sinW.1,dayvar.1,beepvar.1,filter.1,consec.1
0,1,1,2018-02-06 16:20:00,2/6/2018 16:22,3,7.382609,-9.817391,10.843478,-37.791304,6.173913,...,1.0,0.0,1.0,0.0,1.0,0.0,1,4,0,1
31,2,1,2018-02-06 18:54:00,2/6/2018 18:58,0,14.382609,47.182609,7.843478,7.208696,10.173913,...,0.892979,0.450098,0.594823,0.803857,0.997777,0.066647,1,5,0,2
1,3,1,2018-02-06 20:08:00,2/6/2018 20:22,0,15.382609,12.182609,10.843478,20.208696,18.173913,...,0.41866,0.908143,-0.649448,0.760406,0.986795,0.161973,1,6,0,3
2,4,1,2018-02-06 22:29:00,2/6/2018 22:46,0,21.382609,-5.817391,-2.156522,8.208696,5.173913,...,0.108867,0.994056,-0.976296,0.21644,0.978277,0.207302,1,7,0,4
36,5,1,2018-02-07 10:52:00,2/7/2018 11:23,0,-11.617391,5.182609,0.843478,-24.791304,-4.826087,...,0.043619,-0.999048,-0.996195,-0.087156,0.77793,0.628351,2,1,0,7


## 1. Idiographic Models Regression

In [45]:
# Predict craving

# Collect train and test sets as was done in the paper
def prepare_date(idx, train_list, test_list):
    print('Train Patient ID:', train_list[idx]['ID'][0])
    print('Test Patient ID:', test_list[idx]['ID'][0])
    X_train = train_list[idx].drop(train_list[idx].columns[range(0, 61)], axis=1).fillna(0)
    y_train = train_list[idx]['craving']
    X_test = test_list[idx].drop(test_list[idx].columns[range(0, 61)], axis=1).fillna(0)
    y_test = test_list[idx]['craving']

    return X_train, y_train, X_test, y_test


def standardize(data):
    local = data.copy()
    for col in local.columns:
        local[col] = (local[col] - local[col].mean()) / np.std(local[col])
    return local


def eval_results(actual, predicted, show):
    r2 = metrics.r2_score(actual, predicted)
    rmse = metrics.mean_squared_error(actual, predicted, squared=False)
    mape = metrics.mean_absolute_percentage_error(actual, predicted)

    if show:
        print('R_squared:', r2)
        print('MAPE:', mape)
        print('RMSE:', rmse)
        print('MAE:', metrics.mean_absolute_error(actual, predicted))
        print('CORR:', np.corrcoef(predicted, actual)[0, 1])

    return r2, rmse, mape


### 1.1 Lasso Regression

In [46]:
X_train, y_train, X_test, y_test = prepare_date(12, train_df, test_df)


def lasso_reg(train_x, train_y, test_x, test_y):
    X_train_loc = standardize(train_x).fillna(0)
    X_test_loc = standardize(test_x).fillna(0)

    alphas = np.arange(0.1, 200, .1)
    lasso = lm.LassoCV(alphas=alphas, cv=5, max_iter=10000)
    lasso.fit(X_train_loc, train_y)
    y_predicted_test = lasso.predict(X_test_loc)

    print('--- Lasso Regression Results ---')
    print()
    r2, rmse, mape = eval_results(actual=test_y, predicted=y_predicted_test, show=True)
    return r2, rmse, mape


lasso_reg(X_train, y_train, X_test, y_test)

Train Patient ID: 14
Test Patient ID: 14
--- Lasso Regression Results ---

R_squared: 0.5821437826058486
MAPE: 1.17437925717085
RMSE: 8.39410535185786
MAE: 5.901087684668049
CORR: 0.7894358275879657


(0.5821437826058486, 8.39410535185786, 1.17437925717085)

### 1.2 Elastic-Net Regression

In [47]:
def elastic_net(train_x, train_y, test_x, test_y):
    X_train_loc = standardize(train_x).fillna(0)
    X_test_loc = standardize(test_x).fillna(0)

    l1_ratios = np.arange(0.5, 1, 0.05)
    alphas = np.arange(0.1, 100, .1)
    elastic_reg = lm.ElasticNetCV(alphas=alphas, cv=5, l1_ratio=l1_ratios, max_iter=10000)
    elastic_reg.fit(X_train_loc, train_y)

    y_predicted_test = elastic_reg.predict(X_test_loc)

    print('--- Elastic-Net Results ---')
    print()
    r2, rmse, mape = eval_results(actual=test_y, predicted=y_predicted_test, show=True)
    return r2, rmse, mape


elastic_net(X_train, y_train, X_test, y_test)

--- Elastic-Net Results ---

R_squared: 0.6330224747427082
MAPE: 0.9195012167791153
RMSE: 7.866484932707291
MAE: 5.6230138404407795
CORR: 0.8633360981352912


(0.6330224747427082, 7.866484932707291, 0.9195012167791153)

### 1.3 Linear SVM Regression

In [48]:
def linear_svm(train_x, train_y, test_x, test_y):
    X_train_loc = standardize(train_x).fillna(0)
    X_test_loc = standardize(test_x).fillna(0)

    params = [
        {'C': np.arange(0.1, 4, 0.1),
         'epsilon': np.arange(6, 7, 0.1),
         'loss': ['epsilon_insensitive'],
         'fit_intercept': [False],
         'max_iter': [10000]}]

    clf = GridSearchCV(estimator=LinearSVR(), param_grid=params, scoring='r2', cv=5)
    clf.fit(X_train_loc, train_y)
    # best_params = clf.best_params_
    # print(best_params)

    y_predicted_test = clf.predict(X_test_loc)

    print('--- Linear-SVM Results ---')
    print()
    r2, rmse, mape = eval_results(actual=test_y, predicted=y_predicted_test, show=True)
    return r2, rmse, mape


linear_svm(X_train, y_train, X_test, y_test)

--- Linear-SVM Results ---

R_squared: 0.5662665273881238
MAPE: 0.9496486778057168
RMSE: 8.552093709681719
MAE: 6.055885224518501
CORR: 0.8672454177016723


(0.5662665273881238, 8.552093709681719, 0.9496486778057168)

### 1.4 K-NN Regression

In [49]:
def knn_reg(train_x, train_y, test_x, test_y):
    params = [
        {'weights': ['uniform', 'distance'],
         'n_neighbors': np.arange(1, 30, 1)}]

    clf = GridSearchCV(estimator=KNeighborsRegressor(), param_grid=params, scoring='r2', cv=5)
    clf.fit(train_x, train_y)
    # best_params = clf.best_params_
    # print(best_params)

    y_predicted_test = clf.predict(test_x)

    print('--- kNN Regression Results ---')
    print()
    r2, rmse, mape = eval_results(actual=test_y, predicted=y_predicted_test, show=True)
    return r2, rmse, mape


knn_reg(X_train, y_train, X_test, y_test)

--- kNN Regression Results ---

R_squared: 0.13873596353952167
MAPE: 1.360770936631153
RMSE: 12.051167916353508
MAE: 7.8125
CORR: 0.4014409106571804


(0.13873596353952167, 12.051167916353508, 1.360770936631153)

### 1.5 XGBoost Regression

In [50]:
def xgboost_reg(train_x, train_y, test_x, test_y):
    X_train_loc = standardize(train_x).fillna(0)
    X_test_loc = standardize(test_x).fillna(0)

    # Very simple models work better here, since there are few datapoints
    params = [
        {'objective': ['reg:squarederror'],
         'n_estimators': np.arange(1, 7, 1),
         'eval_metric': ['rmse'],
         'max_depth': np.arange(1, 5, 1)}]

    reg_xgb = GridSearchCV(xgb.XGBRegressor(), params, n_jobs=5, cv=5, scoring='r2')
    reg_xgb.fit(X_train_loc, train_y)

    y_predicted_test = reg_xgb.predict(X_test_loc)

    print('--- XGBoost Regression Results ---')
    print()
    r2, rmse, mape = eval_results(actual=test_y, predicted=y_predicted_test, show=True)
    return r2, rmse, mape


xgboost_reg(X_train, y_train, X_test, y_test)

--- XGBoost Regression Results ---

R_squared: 0.5803051003774975
MAPE: 1.0654934881024856
RMSE: 8.412553265735943
MAE: 5.399465526212279
CORR: 0.8261722878962449


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


(0.5803051003774975, 8.412553265735943, 1.0654934881024856)

### 1.6 LSTM RNN

### 1.7 MTGNN

### 1.8 Evaluating Performance on Entire Dataset

# 2. Nomothetic Models Regression