# 1-step Forecasting with linear and non-linear models

In [164]:
import pandas as pd
import numpy as np
% matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import LinearSVR
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
import xgboost as xgb
from xgboost import plot_importance, plot_tree
from sklearn.model_selection import train_test_split
from sklearn import linear_model as lm
from sklearn.neighbors import KNeighborsRegressor
import sklearn.metrics as metrics

from statsmodels.tsa.api import VAR
from statsmodels.tsa.stattools import adfuller
from statsmodels.tools.eval_measures import rmse, aic
from statsmodels.tsa.vector_ar.vecm import coint_johansen

import load_data

# Plot settings
plt.rcParams['figure.figsize'] = (16, 8)
plt.rcParams['figure.dpi'] = 150
sns.set()

In [165]:
# Load the dataset through the project-local loader.
# NOTE(review): train_df / test_df are indexed by position below and their items are
# passed to pd.concat, so they are presumably parallel lists of per-participant
# DataFrames — confirm against load_data.load_alcohol().
train_df, test_df, data_raw_list = load_data.load_alcohol()

# One merged (train + test) frame per position in train_df, sorted by 'start'
combined_data = []

for i in range(len(train_df)):
    train = train_df[i]
    test = test_df[i]
    # Combine both train and test sets since the initial split was 50/50
    combined = pd.concat([train, test])
    # Sort by date ('start' is converted with pd.to_datetime before sorting)
    combined['start'] = pd.to_datetime(combined['start'])
    combined = combined.sort_values(by='start')
    combined_data.append(combined)

# Single frame stacking every entry of combined_data; ignore_index=True gives it a fresh 0..n-1 index
global_data = pd.concat(combined_data, ignore_index=True)
# Preview the first merged frame
combined_data[0].head()

Unnamed: 0.1,Unnamed: 0,ID,start,finish,drinks,comfortable,stressed,down,calm,pressure,...,cosT.1,sinT.1,cos2T.1,sin2T.1,cosW.1,sinW.1,dayvar.1,beepvar.1,filter.1,consec.1
0,1,1,2018-02-06 16:20:00,2/6/2018 16:22,3,7.382609,-9.817391,10.843478,-37.791304,6.173913,...,1.0,0.0,1.0,0.0,1.0,0.0,1,4,0,1
31,2,1,2018-02-06 18:54:00,2/6/2018 18:58,0,14.382609,47.182609,7.843478,7.208696,10.173913,...,0.892979,0.450098,0.594823,0.803857,0.997777,0.066647,1,5,0,2
1,3,1,2018-02-06 20:08:00,2/6/2018 20:22,0,15.382609,12.182609,10.843478,20.208696,18.173913,...,0.41866,0.908143,-0.649448,0.760406,0.986795,0.161973,1,6,0,3
2,4,1,2018-02-06 22:29:00,2/6/2018 22:46,0,21.382609,-5.817391,-2.156522,8.208696,5.173913,...,0.108867,0.994056,-0.976296,0.21644,0.978277,0.207302,1,7,0,4
36,5,1,2018-02-07 10:52:00,2/7/2018 11:23,0,-11.617391,5.182609,0.843478,-24.791304,-4.826087,...,0.043619,-0.999048,-0.996195,-0.087156,0.77793,0.628351,2,1,0,7


## 1. Idiographic Models Regression

In [253]:
# Predict craving

# Collect train and test sets as was done in the paper
def prepare_date(idx, train_list, test_list, n_leading_cols=61):
    """Build the feature matrices and ``craving`` targets for one participant.

    Parameters
    ----------
    idx : int
        Position of the participant in ``train_list`` and ``test_list``.
    train_list, test_list : sequence of pandas.DataFrame
        Per-participant frames. Each must contain the columns ``'ID'`` and
        ``'craving'``.
    n_leading_cols : int, default 61
        Number of leading columns dropped to obtain the feature matrix. The
        default reproduces the previously hard-coded value.
        NOTE(review): presumably the first 61 columns hold the non-lagged
        measurements — confirm against the data layout.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)``. Missing values in the feature
        frames are replaced by 0; the targets are returned unchanged.
    """
    train_frame = train_list[idx]
    test_frame = test_list[idx]

    # Positional lookup: ``series[0]`` is label-based and raises KeyError when
    # the frame's index does not contain the label 0.
    print('Patient ID:', train_frame['ID'].iloc[0])

    def _features(frame):
        # Drop the leading block of columns by name, then zero-fill gaps.
        return frame.drop(columns=frame.columns[:n_leading_cols]).fillna(0)

    X_train = _features(train_frame)
    y_train = train_frame['craving']
    X_test = _features(test_frame)
    y_test = test_frame['craving']

    return X_train, y_train, X_test, y_test


def standardize(data):
    """Return a z-scored copy of ``data``; the input frame is left untouched.

    Each column is centred on its own mean and divided by its own standard
    deviation as computed by ``np.std`` (ddof=0). A zero-variance column
    therefore comes back as NaN (0 / 0); callers are expected to handle that.
    """
    def _zscore(column):
        centred = column - column.mean()
        return centred / np.std(column)

    return data.apply(_zscore)


def eval_results(actual, predicted, show):
    """Score predictions against the observed values.

    Parameters
    ----------
    actual : array-like
        Observed target values.
    predicted : array-like
        Model predictions aligned with ``actual``.
    show : bool
        When true, print R², MAPE, RMSE, MAE and the Pearson correlation
        between ``predicted`` and ``actual``.

    Returns
    -------
    tuple
        ``(r2, rmse, mape)``.
    """
    r2 = metrics.r2_score(actual, predicted)
    # ``squared=False`` is deprecated/removed in recent scikit-learn releases;
    # taking the square root of the MSE works on every version.
    root_mse = np.sqrt(metrics.mean_squared_error(actual, predicted))
    mape = metrics.mean_absolute_percentage_error(actual, predicted)

    if show:
        print('R_squared:', r2)
        print('MAPE:', mape)
        print('RMSE:', root_mse)
        print('MAE:', metrics.mean_absolute_error(actual, predicted))
        print('CORR:', np.corrcoef(predicted, actual)[0, 1])

    return r2, root_mse, mape


### 1.1 Lasso Regression

In [460]:
# Train/test split of the first participant (position 0), used by the single-model demos below
X_train, y_train, X_test, y_test = prepare_date(idx=0, train_list=train_df, test_list=test_df)


def lasso_reg(train_x, train_y, test_x, test_y, vis):
    """Fit a cross-validated Lasso and score it on the test split.

    Both feature frames go through ``standardize`` and any resulting NaNs are
    replaced by 0. The penalty is selected by 5-fold CV over
    ``np.arange(0.01, 20, 0.05)``; the model is fitted without an intercept.

    Parameters
    ----------
    train_x, test_x : pandas.DataFrame
        Feature frames.
    train_y, test_y : array-like
        Targets.
    vis : bool
        Forwarded to ``eval_results`` as ``show``.

    Returns
    -------
    tuple
        ``(r2, rmse, mape)`` on the test split.
    """
    # NOTE(review): the test split is z-scored with its own mean/std rather than
    # the training statistics — confirm this is intended.
    scaled_train = standardize(train_x).fillna(0)
    scaled_test = standardize(test_x).fillna(0)

    model = lm.LassoCV(
        alphas=np.arange(0.01, 20, 0.05),
        cv=5,
        max_iter=10000,
        fit_intercept=False,
    )
    model.fit(scaled_train, train_y)
    test_predictions = model.predict(scaled_test)

    return eval_results(actual=test_y, predicted=test_predictions, show=vis)


# Lasso on the first participant, printing the full metric report
lasso_reg(train_x=X_train, train_y=y_train, test_x=X_test, test_y=y_test, vis=True)

Patient ID: 1
R_squared: 0.08378345772949569
MAPE: 0.8964263747955392
RMSE: 4.3865742838554125
MAE: 3.447683405844313
CORR: 0.2964645120648325


(0.08378345772949569, 4.3865742838554125, 0.8964263747955392)

### 1.2 Elastic-Net Regression

In [461]:
def elastic_net(train_x, train_y, test_x, test_y, vis):
    """Fit a cross-validated Elastic-Net and score it on the test split.

    Both feature frames go through ``standardize`` and any resulting NaNs are
    replaced by 0. ``alpha`` is searched over ``np.arange(0.01, 20, 0.05)`` and
    ``l1_ratio`` over ``np.arange(0.01, 0.6, 0.05)`` with 5-fold CV; the model
    is fitted without an intercept.

    Parameters
    ----------
    train_x, test_x : pandas.DataFrame
        Feature frames.
    train_y, test_y : array-like
        Targets.
    vis : bool
        Forwarded to ``eval_results`` as ``show``.

    Returns
    -------
    tuple
        ``(r2, rmse, mape)`` on the test split.
    """
    # NOTE(review): the test split is z-scored with its own mean/std rather than
    # the training statistics — confirm this is intended.
    scaled_train = standardize(train_x).fillna(0)
    scaled_test = standardize(test_x).fillna(0)

    model = lm.ElasticNetCV(
        alphas=np.arange(0.01, 20, 0.05),
        l1_ratio=np.arange(0.01, 0.6, 0.05),
        cv=5,
        max_iter=10000,
        fit_intercept=False,
    )
    model.fit(scaled_train, train_y)
    test_predictions = model.predict(scaled_test)

    return eval_results(actual=test_y, predicted=test_predictions, show=vis)


# Elastic-Net on the first participant, printing the full metric report
elastic_net(train_x=X_train, train_y=y_train, test_x=X_test, test_y=y_test, vis=True)

R_squared: 0.17152305547287872
MAPE: 0.932363120302897
RMSE: 4.171254012619212
MAE: 3.3614096342959017
CORR: 0.4655126120635689


(0.17152305547287872, 4.171254012619212, 0.932363120302897)

### 1.3 Linear SVM Regression

In [462]:
def linear_svm(train_x, train_y, test_x, test_y, vis, random_state=0):
    """Grid-search a linear support-vector regressor and score it on the test split.

    Both feature frames go through ``standardize`` and any resulting NaNs are
    replaced by 0. ``C`` and ``epsilon`` are tuned with 5-fold CV, scored by
    R²; the model uses the epsilon-insensitive loss and no intercept.

    Parameters
    ----------
    train_x, test_x : pandas.DataFrame
        Feature frames.
    train_y, test_y : array-like
        Targets.
    vis : bool
        Forwarded to ``eval_results`` as ``show``.
    random_state : int or None, default 0
        Seed handed to ``LinearSVR`` so repeated runs give the same fit.
        Pass ``None`` to restore the previous unseeded behaviour.

    Returns
    -------
    tuple
        ``(r2, rmse, mape)`` on the test split.
    """
    # NOTE(review): the test split is z-scored with its own mean/std rather than
    # the training statistics — confirm this is intended.
    scaled_train = standardize(train_x).fillna(0)
    scaled_test = standardize(test_x).fillna(0)

    param_grid = [{
        'C': np.arange(0.1, 4, 0.1),
        'epsilon': np.arange(6, 7, 0.1),
        'loss': ['epsilon_insensitive'],
        'fit_intercept': [False],
        'max_iter': [10000],
    }]

    # LinearSVR's solver shuffles the data internally; without a seed the
    # selected model (and hence the reported metrics) can change between runs.
    search = GridSearchCV(
        estimator=LinearSVR(random_state=random_state),
        param_grid=param_grid,
        scoring='r2',
        cv=5,
    )
    search.fit(scaled_train, train_y)
    test_predictions = search.predict(scaled_test)

    return eval_results(actual=test_y, predicted=test_predictions, show=vis)


# Linear SVM on the first participant, printing the full metric report
linear_svm(train_x=X_train, train_y=y_train, test_x=X_test, test_y=y_test, vis=True)

R_squared: 0.23822602197115716
MAPE: 0.8526261923449754
RMSE: 3.999811185025497
MAE: 2.8697093281015547
CORR: 0.49077040280451634


(0.23822602197115716, 3.999811185025497, 0.8526261923449754)

### 1.4 K-NN Regression

In [463]:
def knn_reg(train_x, train_y, test_x, test_y, vis):
    """Grid-search a k-nearest-neighbours regressor and score it on the test split.

    The neighbour count (``np.arange(2, 20, 1)``) and the weighting scheme
    (``'uniform'`` or ``'distance'``) are tuned with 2-fold CV, scored by R².

    Parameters
    ----------
    train_x, test_x : pandas.DataFrame
        Feature frames, used as passed in.
    train_y, test_y : array-like
        Targets.
    vis : bool
        Forwarded to ``eval_results`` as ``show``.

    Returns
    -------
    tuple
        ``(r2, rmse, mape)`` on the test split.
    """
    # NOTE(review): unlike the other model helpers in this notebook, the
    # features are not passed through ``standardize`` here — confirm intended.
    search = GridSearchCV(
        estimator=KNeighborsRegressor(),
        param_grid=[{
            'weights': ['uniform', 'distance'],
            'n_neighbors': np.arange(2, 20, 1),
        }],
        scoring='r2',
        cv=2,
    )
    search.fit(train_x, train_y)
    test_predictions = search.predict(test_x)

    return eval_results(actual=test_y, predicted=test_predictions, show=vis)


# k-NN on the first participant, printing the full metric report
knn_reg(train_x=X_train, train_y=y_train, test_x=X_test, test_y=y_test, vis=True)

R_squared: 0.032537216911325384
MAPE: 1.0001027346321238
RMSE: 4.507581186392502
MAE: 3.669051878354204
CORR: 0.19411888382136314


(0.032537216911325384, 4.507581186392502, 1.0001027346321238)

### 1.5 XGBoost Regression

In [464]:
def xgboost_reg(train_x, train_y, test_x, test_y, vis):
    """Grid-search a small XGBoost regressor and score it on the test split.

    Both feature frames go through ``standardize`` and any resulting NaNs are
    replaced by 0. The number of trees (``np.arange(1, 7, 1)``) and the tree
    depth (``np.arange(1, 5, 1)``) are tuned with 2-fold CV, scored by R²,
    using 5 parallel jobs.

    Parameters
    ----------
    train_x, test_x : pandas.DataFrame
        Feature frames.
    train_y, test_y : array-like
        Targets.
    vis : bool
        Forwarded to ``eval_results`` as ``show``.

    Returns
    -------
    tuple
        ``(r2, rmse, mape)`` on the test split.
    """
    # NOTE(review): the test split is z-scored with its own mean/std rather than
    # the training statistics — confirm this is intended.
    scaled_train = standardize(train_x).fillna(0)
    scaled_test = standardize(test_x).fillna(0)

    # The grid is kept deliberately tiny: with so few datapoints, very simple
    # models work better.
    param_grid = [{
        'objective': ['reg:squarederror'],
        'n_estimators': np.arange(1, 7, 1),
        'eval_metric': ['rmse'],
        'max_depth': np.arange(1, 5, 1),
    }]

    search = GridSearchCV(xgb.XGBRegressor(), param_grid, n_jobs=5, cv=2, scoring='r2')
    search.fit(scaled_train, train_y)
    test_predictions = search.predict(scaled_test)

    return eval_results(actual=test_y, predicted=test_predictions, show=vis)


# XGBoost on the first participant, printing the full metric report
xgboost_reg(train_x=X_train, train_y=y_train, test_x=X_test, test_y=y_test, vis=True)

R_squared: 0.13915050245135252
MAPE: 0.8922669549234424
RMSE: 4.2519685059590255
MAE: 3.344821732152717
CORR: 0.38585364849792797


(0.13915050245135252, 4.2519685059590255, 0.8922669549234424)

### 1.6 LSTM RNN

### 1.7 MTGNN

### 1.8 Evaluating Performance on Entire Dataset

In [466]:
import warnings


def average_metrics(r2_list, rmse_list, mape_list):
    """Print the arithmetic mean of each metric list, one line per metric.

    The lines are printed in the order R², RMSE, MAPE. Nothing is returned.
    """
    labelled = (
        ('Average R_Squared:', r2_list),
        ('Average RMSE:', rmse_list),
        ('Average MAPE:', mape_list),
    )
    for label, values in labelled:
        print(label, np.mean(values))


def evaluate_models(train_list, test_list):
    """Fit one model of each family per participant and print the averaged test metrics.

    For every position in ``train_list`` the Elastic-Net baseline is fitted
    first. Only when its test R² is positive is the participant kept; the
    Lasso, linear-SVM, k-NN and XGBoost models are then fitted for that
    participant as well. Average R², RMSE and MAPE per model family are
    printed at the end, followed by the IDs of the participants that were kept.

    Parameters
    ----------
    train_list, test_list : sequence of pandas.DataFrame
        Per-participant train/test frames of equal length, in the layout
        expected by ``prepare_date``.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    assert len(train_list) == len(test_list)

    # Models fitted only for participants that pass the Elastic-Net gate.
    gated_models = (
        ('lasso', lasso_reg),
        ('svm', linear_svm),
        ('knn', knn_reg),
        ('xgb', xgboost_reg),
    )
    # Order of the printed report.
    report = (
        ('---- Lasso Regression Results ----', 'lasso'),
        ('---- Elastic-Net Results ----', 'elastic'),
        ('---- Linear SVM Results ----', 'svm'),
        ('---- kNN Regression Results ----', 'knn'),
        ('---- XGBoost Results ----', 'xgb'),
    )
    scores = {key: {'r2': [], 'rmse': [], 'mape': []} for _, key in report}
    patient_ids = []

    def _record(key, r2, rmse, mape):
        # Append one participant's metrics to the accumulator of a model family.
        scores[key]['r2'].append(r2)
        scores[key]['rmse'].append(rmse)
        scores[key]['mape'].append(mape)

    for x in range(len(train_list)):
        # Build and evaluate a model for every single patient
        train_x, train_y, test_x, test_y = prepare_date(x, train_list, test_list)

        # Elastic-Net (baseline): only continue with the other models if this
        # one reaches a positive R². ``not r2 > 0`` also skips a NaN score.
        r2, rmse, mape = elastic_net(train_x, train_y, test_x, test_y, False)
        if not r2 > 0:
            continue

        # Positional lookup: ``series[0]`` is label-based and raises KeyError
        # when the frame's index does not contain the label 0.
        patient_ids.append(train_list[x]['ID'].iloc[0])
        _record('elastic', r2, rmse, mape)

        for key, fit_and_score in gated_models:
            _record(key, *fit_and_score(train_x, train_y, test_x, test_y, False))

    for header, key in report:
        print(header)
        average_metrics(scores[key]['r2'], scores[key]['rmse'], scores[key]['mape'])
        print('---------------------------------')

    print('Included patient list:')
    print(patient_ids)

# Silence library warnings only while the models are being fitted; a global
# filterwarnings("ignore") would also hide warnings in every later cell.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    evaluate_models(train_df, test_df)

Patient ID: 1
Patient ID: 2
Patient ID: 3
Patient ID: 4
Patient ID: 5
Patient ID: 6
Patient ID: 7
Patient ID: 8
Patient ID: 9
Patient ID: 10
Patient ID: 12
Patient ID: 14
Patient ID: 15
Patient ID: 16
Patient ID: 17
Patient ID: 18
Patient ID: 19
Patient ID: 20
Patient ID: 21
Patient ID: 22
Patient ID: 23
Patient ID: 24
Patient ID: 25
Patient ID: 26
Patient ID: 27
Patient ID: 29
Patient ID: 30
Patient ID: 32
---- Lasso Regression Results ----
Average R_Squared: 0.2075392272738759
Average RMSE: 14.969460455237423
Average MAPE: 1.5095570442721984
---------------------------------
---- Elastic-Net Results ----
Average R_Squared: 0.24216518110050012
Average RMSE: 14.73040770286288
Average MAPE: 1.3175044015048376
---------------------------------
---- Linear SVM Results ----
Average R_Squared: 0.24299674800066068
Average RMSE: 14.659979317928459
Average MAPE: 1.2673309668873083
---------------------------------
---- kNN Regression Results ----
Average R_Squared: 0.10690724123244325
Average 

# 2. Nomothetic Models Regression