# 1-step Forecasting with linear and non-linear models

In [259]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import LinearSVR
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from sklearn import linear_model as lm
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics

import load_data

# Plot settings: one rcParams update for figure size/resolution, then seaborn's default theme.
plt.rcParams.update({'figure.figsize': (16, 8), 'figure.dpi': 150})
sns.set()

In [260]:
train_df, test_df, data_raw_list = load_data.load_alcohol()

# The provided split is 50/50, so each patient's train and test frames are glued back
# together, 'start' is parsed to datetimes, and the rows are put in chronological order.
combined_data = [
    pd.concat([train_df[i], test_df[i]])
    .assign(start=lambda frame: pd.to_datetime(frame['start']))
    .sort_values(by='start')
    for i in range(len(train_df))
]

combined_data[0].head()

Unnamed: 0.1,Unnamed: 0,ID,start,finish,drinks,comfortable,stressed,down,calm,pressure,...,cosT.1,sinT.1,cos2T.1,sin2T.1,cosW.1,sinW.1,dayvar.1,beepvar.1,filter.1,consec.1
0,1,1,2018-02-06 16:20:00,2/6/2018 16:22,3,7.382609,-9.817391,10.843478,-37.791304,6.173913,...,1.0,0.0,1.0,0.0,1.0,0.0,1,4,0,1
31,2,1,2018-02-06 18:54:00,2/6/2018 18:58,0,14.382609,47.182609,7.843478,7.208696,10.173913,...,0.892979,0.450098,0.594823,0.803857,0.997777,0.066647,1,5,0,2
1,3,1,2018-02-06 20:08:00,2/6/2018 20:22,0,15.382609,12.182609,10.843478,20.208696,18.173913,...,0.41866,0.908143,-0.649448,0.760406,0.986795,0.161973,1,6,0,3
2,4,1,2018-02-06 22:29:00,2/6/2018 22:46,0,21.382609,-5.817391,-2.156522,8.208696,5.173913,...,0.108867,0.994056,-0.976296,0.21644,0.978277,0.207302,1,7,0,4
36,5,1,2018-02-07 10:52:00,2/7/2018 11:23,0,-11.617391,5.182609,0.843478,-24.791304,-4.826087,...,0.043619,-0.999048,-0.996195,-0.087156,0.77793,0.628351,2,1,0,7


In [261]:
# Reading in data and pre-processing
# The data is re-loaded here so that re-running this cell always starts from the raw
# frames (the loop below overwrites the entries of data_raw_list).
train_df, test_df, data_raw_list = load_data.load_alcohol()

for i in range(len(data_raw_list)):
    current = load_data.prepare_data_alcohol(data_raw_list[i])
    current['start'] = pd.to_datetime(current['start'])

    # Lag-1 copy of every column (suffix '_1'), placed next to the current observation.
    lagged = current.shift().add_suffix('_1')
    paired = pd.concat([current, lagged], axis=1)

    # Keep only rows where both the current and the previous survey were completed
    # and both report a drinks value.
    paired = paired.dropna(subset=['finish', 'finish_1', 'drinks', 'drinks_1'])

    # Keep only pairs recorded on the same calendar date. Comparing `.dt.day` alone
    # (day-of-month) would also accept e.g. 6 Feb vs 6 Mar after a gap in the data.
    same_date = paired['start'].dt.normalize() == paired['start_1'].dt.normalize()
    data_raw_list[i] = paired[same_date]

data_raw_list[0].head()

Unnamed: 0,start,finish,drinks,comfortable,stressed,down,calm,pressure,enthusiastic,happy,...,impulsive_1,pos_expect_1,peer_percent_1,want_drink_1,delay_grat_1,angry_1,drink_predict_1,restless_sleep_1,difficulty_sleep_1,hours_sleep_1
1,2018-02-06 16:20:00,2/6/2018 16:22,3.0,81.0,12.0,18.0,26.0,11.0,31.0,56.0,...,0.0,0.0,17.0,5.0,2.0,9.0,,,,
2,2018-02-06 18:54:00,2/6/2018 18:58,0.0,88.0,69.0,15.0,71.0,15.0,83.0,91.0,...,8.0,14.0,55.0,14.0,32.0,15.0,6.0,7.0,8.0,7.0
3,2018-02-06 20:08:00,2/6/2018 20:22,0.0,89.0,34.0,18.0,84.0,23.0,92.0,76.0,...,15.0,56.0,78.0,61.0,48.0,14.0,,,,
4,2018-02-06 22:29:00,2/6/2018 22:46,0.0,95.0,16.0,5.0,72.0,10.0,25.0,66.0,...,17.0,15.0,73.0,14.0,72.0,16.0,,,,
6,2018-02-07 10:52:00,2/7/2018 11:23,0.0,62.0,27.0,8.0,39.0,0.0,66.0,58.0,...,3.0,0.0,4.0,0.0,78.0,7.0,0.0,5.0,18.0,7.0


## 1. Idiographic Models Regression

In [262]:
# Predict craving

# Make own splits
def prepare_data_own(idx, combined_list, random_state):
    """Make a fresh random 70/30 split for one patient.

    Parameters
    ----------
    idx : int
        Position of the patient's frame in ``combined_list``.
    combined_list : sequence of DataFrame
        One frame per patient; must contain a ``'craving'`` column.
    random_state : int or None
        Forwarded to ``train_test_split``.

    Returns
    -------
    list
        ``[X_train, X_test, y_train, y_test]`` as returned by ``train_test_split``.
    """
    frame = combined_list[idx]

    # Features are everything after the first 24 columns, with gaps filled by 0.
    # NOTE(review): presumably the first 24 columns are metadata and raw targets - confirm
    # that 'craving' itself is among them, otherwise it is also present in the features.
    leading_cols = frame.columns[range(24)]
    features = frame.drop(columns=leading_cols).fillna(0)
    target = frame['craving']

    return train_test_split(features, target, test_size=0.3, random_state=random_state)


def prepare_data(idx, train_list, test_list):
    """Return the pre-made train/test split of one patient as features and target.

    Prints the patient's ID as a side effect.

    Parameters
    ----------
    idx : int
        Position of the patient in both lists.
    train_list, test_list : sequence of DataFrame
        Per-patient frames; each must contain ``'ID'`` and ``'craving'`` columns.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``. The feature frames are the columns after
        the first 61, with missing values replaced by 0; the targets are ``'craving'``.
    """
    train_frame = train_list[idx]
    test_frame = test_list[idx]

    # Positional access: the frames keep their original row labels, so a label lookup
    # such as ['ID'][0] raises KeyError whenever label 0 is not in this patient's split.
    print('Patient ID:', train_frame['ID'].iloc[0])

    X_train = train_frame.drop(train_frame.columns[range(0, 61)], axis=1).fillna(0)
    y_train = train_frame['craving']
    X_test = test_frame.drop(test_frame.columns[range(0, 61)], axis=1).fillna(0)
    y_test = test_frame['craving']

    return X_train, X_test, y_train, y_test


def standardize(data, reference=None):
    """Z-score every column: subtract the mean and divide by the population std.

    Parameters
    ----------
    data : DataFrame
        Numeric frame to scale. It is not modified.
    reference : DataFrame, optional
        Frame whose column means/stds are used for the scaling. Defaults to ``data``
        itself. Pass the training features here to scale a test set with training
        statistics; ``reference`` must have the same columns as ``data``.

    Returns
    -------
    DataFrame
        Scaled copy with the same index and columns. Columns whose std is exactly 0
        carry no information and are returned as 0.0 (instead of the NaN/inf a division
        by zero would give); missing input values stay NaN.
    """
    stats_source = data if reference is None else reference
    center = stats_source.mean()
    scale = stats_source.std(ddof=0)  # ddof=0 matches np.std

    scaled = (data - center) / scale

    zero_variance = scale == 0
    for col in zero_variance[zero_variance].index:
        column = data[col].astype(float)
        # Keep NaN where the input was missing, 0.0 everywhere else.
        scaled[col] = column.where(column.isna(), 0.0)

    return scaled


def eval_results(actual, predicted, show):
    """Score predictions against the observed values and optionally print a report.

    Parameters
    ----------
    actual : array-like
        Observed target values.
    predicted : array-like
        Model predictions, aligned with ``actual``.
    show : bool
        When true, print R-squared, MAPE, RMSE, MAE and the correlation between
        predictions and observations.

    Returns
    -------
    tuple
        ``(r2, rmse, mape)``.
    """
    r2 = metrics.r2_score(actual, predicted)
    # Take the root explicitly: the `squared=False` keyword of mean_squared_error was
    # deprecated in scikit-learn 1.4 and removed in 1.6; this form works on every version.
    rmse = np.sqrt(metrics.mean_squared_error(actual, predicted))
    mape = metrics.mean_absolute_percentage_error(actual, predicted)

    if show:
        print('R_squared:', r2)
        print('MAPE:', mape)
        print('RMSE:', rmse)
        print('MAE:', metrics.mean_absolute_error(actual, predicted))
        print('CORR:', np.corrcoef(predicted, actual)[0, 1])

    return r2, rmse, mape

### 1.1 Lasso Regression

In [263]:
# Pre-made split of the patient at list index 1; these four globals are the inputs of
# every single-patient model demo in the cells of section 1.
X_train, X_test, y_train, y_test = prepare_data(1, train_list=train_df, test_list=test_df)


def lasso_reg(train_x, train_y, test_x, test_y, vis):
    """Fit a cross-validated Lasso on one patient and score it on the test split.

    Parameters
    ----------
    train_x, test_x : DataFrame
        Feature frames.
    train_y, test_y : array-like
        Targets.
    vis : bool
        Forwarded to ``eval_results`` as ``show``.

    Returns
    -------
    tuple
        ``(r2, rmse, mape)`` on the test split.
    """
    # NOTE(review): the test features are z-scored with their own mean/std rather than
    # the training statistics - confirm this is intended.
    scaled_train = standardize(train_x).fillna(0)
    scaled_test = standardize(test_x).fillna(0)

    # 5-fold CV picks the penalty from a fixed grid of alphas.
    model = lm.LassoCV(alphas=np.arange(0.01, 20, 0.05), cv=5, max_iter=100000, fit_intercept=True)
    model.fit(scaled_train, train_y)

    return eval_results(actual=test_y, predicted=model.predict(scaled_test), show=vis)


lasso_reg(X_train, y_train, X_test, y_test, True)

Patient ID: 2
R_squared: 0.2336748819426997
MAPE: 0.8602057458801604
RMSE: 21.500863785473992
MAE: 16.922922828939296
CORR: 0.5125876082907408


(0.2336748819426997, 21.500863785473992, 0.8602057458801604)

### 1.2 Elastic-Net Regression

In [264]:
def elastic_net(train_x, train_y, test_x, test_y, vis):
    """Fit a cross-validated Elastic-Net on one patient and score it on the test split.

    Parameters
    ----------
    train_x, test_x : DataFrame
        Feature frames.
    train_y, test_y : array-like
        Targets.
    vis : bool
        Forwarded to ``eval_results`` as ``show``.

    Returns
    -------
    tuple
        ``(r2, rmse, mape)`` on the test split.
    """
    # NOTE(review): the test features are z-scored with their own mean/std rather than
    # the training statistics - confirm this is intended.
    scaled_train = standardize(train_x).fillna(0)
    scaled_test = standardize(test_x).fillna(0)

    # 5-fold CV over a grid of penalty strengths and L1/L2 mixing ratios.
    model = lm.ElasticNetCV(
        alphas=np.arange(0.01, 20, 0.05),
        l1_ratio=np.arange(0.01, 0.6, 0.05),
        cv=5,
        max_iter=100000,
        fit_intercept=True,
    )
    model.fit(scaled_train, train_y)

    return eval_results(actual=test_y, predicted=model.predict(scaled_test), show=vis)


elastic_net(X_train, y_train, X_test, y_test, True)

R_squared: 0.3105248292112299
MAPE: 0.868046768759073
RMSE: 20.394294809977133
MAE: 16.71060148405697
CORR: 0.5610027960444378


(0.3105248292112299, 20.394294809977133, 0.868046768759073)

### 1.3 Linear SVM Regression

In [265]:
def linear_svm(train_x, train_y, test_x, test_y, vis):
    """Grid-search a linear SVR on one patient and score the best model on the test split.

    Parameters
    ----------
    train_x, test_x : DataFrame
        Feature frames.
    train_y, test_y : array-like
        Targets.
    vis : bool
        Forwarded to ``eval_results`` as ``show``.

    Returns
    -------
    tuple
        ``(r2, rmse, mape)`` on the test split.
    """
    # NOTE(review): the test features are z-scored with their own mean/std rather than
    # the training statistics - confirm this is intended.
    scaled_train = standardize(train_x).fillna(0)
    scaled_test = standardize(test_x).fillna(0)

    search_space = {
        'C': np.arange(0.1, 4, 0.1),
        'epsilon': np.arange(6, 7, 0.1),
        'loss': ['epsilon_insensitive'],
        'fit_intercept': [True],
        'max_iter': [10000],
    }
    # 5-fold CV, candidates ranked by R-squared.
    search = GridSearchCV(estimator=LinearSVR(), param_grid=search_space, scoring='r2', cv=5)
    search.fit(scaled_train, train_y)

    return eval_results(actual=test_y, predicted=search.predict(scaled_test), show=vis)


linear_svm(X_train, y_train, X_test, y_test, True)

R_squared: 0.3356438834226828
MAPE: 0.8317165121227911
RMSE: 20.01934418628923
MAE: 16.08854279048022
CORR: 0.5836003576596637


(0.3356438834226828, 20.01934418628923, 0.8317165121227911)

### 1.4 K-NN Regression

In [266]:
def knn_reg(train_x, train_y, test_x, test_y, vis):
    """Grid-search a k-nearest-neighbours regressor and score it on the test split.

    Parameters
    ----------
    train_x, test_x : DataFrame
        Feature frames, used as given (no standardization is applied here).
    train_y, test_y : array-like
        Targets.
    vis : bool
        Forwarded to ``eval_results`` as ``show``.

    Returns
    -------
    tuple
        ``(r2, rmse, mape)`` on the test split.
    """
    search_space = {
        'weights': ['uniform', 'distance'],
        'n_neighbors': np.arange(2, 20),
    }
    # 2-fold CV, candidates ranked by (negated) mean squared error.
    search = GridSearchCV(
        estimator=KNeighborsRegressor(),
        param_grid=search_space,
        scoring='neg_mean_squared_error',
        cv=2,
    )
    search.fit(train_x, train_y)

    return eval_results(actual=test_y, predicted=search.predict(test_x), show=vis)


knn_reg(X_train, y_train, X_test, y_test, True)

R_squared: 0.14247416686246706
MAPE: 1.0065771554001917
RMSE: 22.744321386225067
MAE: 17.094135426343126
CORR: 0.49368573617049394


(0.14247416686246706, 22.744321386225067, 1.0065771554001917)

### 1.5 Symbolic Regression (genetic programming)

In [267]:
from gplearn.genetic import SymbolicRegressor

# Building blocks the evolved expressions may use.
function_set = ['add', 'sub', 'mul', 'div', 'sin', 'log']

# Evolution settings. random_state=None means the run is not seeded, so the printed
# numbers change from run to run.
gp_settings = {
    'population_size': 3000,
    'tournament_size': 5,
    'generations': 10,
    'stopping_criteria': 0.1,
    'metric': 'rmse',
    'p_crossover': 0.65,
    'p_subtree_mutation': 0.15,
    'p_hoist_mutation': 0.05,
    'p_point_mutation': 0.1,
    'verbose': 1,
    'random_state': None,
    'n_jobs': -1,
}
model = SymbolicRegressor(function_set=function_set, **gp_settings)
model.fit(X_train, y_train)
predicted = model.predict(X_test)

eval_results(actual=y_test, predicted=predicted, show=True)

    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left
   0    12.85          11005.5        3          20.6789              N/A     13.55s
   1     8.61          94.3492        5          18.0261              N/A     11.08s
   2     8.81          40.6437        3          17.9199              N/A     15.94s
   3     7.64          31.6061        3          17.9199              N/A     11.54s
   4     6.21          31.5061        7          16.4142              N/A      9.11s
   5     6.94          35.5869        5          16.1092              N/A      7.76s
   6    10.47          36.8222       12          15.3856              N/A      5.46s
   7    13.77          61.7009       13          15.0662              N/A      4.00s
   8    15.28          46.9933       13          15.0662              N/A  

(0.0022095356145428457, 24.53403903990671, 0.8716965965602719)

### 1.6 XGBoost Regression

In [268]:
def xgboost_reg(train_x, train_y, test_x, test_y, vis):
    """Grid-search a small XGBoost regressor and score it on the test split.

    Parameters
    ----------
    train_x, test_x : DataFrame
        Feature frames, used as given.
    train_y, test_y : array-like
        Targets.
    vis : bool
        Forwarded to ``eval_results`` as ``show``.

    Returns
    -------
    tuple
        ``(r2, rmse, mape)`` on the test split.
    """
    # Deliberately tiny ensembles (1-6 trees of depth 1-4): very simple models work
    # better here, since there are few datapoints.
    search_space = {
        'objective': ['reg:squarederror'],
        'n_estimators': np.arange(1, 7),
        'eval_metric': ['rmse'],
        'max_depth': np.arange(1, 5),
    }
    search = GridSearchCV(xgb.XGBRegressor(), search_space, n_jobs=5, cv=5, scoring='r2')
    search.fit(train_x, train_y)

    return eval_results(actual=test_y, predicted=search.predict(test_x), show=vis)


xgboost_reg(X_train, y_train, X_test, y_test, True)

R_squared: 0.24420213612928443
MAPE: 0.7930023802589711
RMSE: 21.352670947844835
MAE: 16.99780234873153
CORR: 0.5215126732652979


(0.24420213612928443, 21.352670947844835, 0.7930023802589711)

### 1.7 Random Forests

In [269]:
from sklearn.model_selection import RandomizedSearchCV


def random_forests(train_x, train_y, test_x, test_y, vis):
    """Grid-search a random-forest regressor and score it on the test split.

    Parameters
    ----------
    train_x, test_x : DataFrame
        Feature frames, used as given.
    train_y, test_y : array-like
        Targets.
    vis : bool
        Forwarded to ``eval_results`` as ``show``.

    Returns
    -------
    tuple
        ``(r2, rmse, mape)`` on the test split.
    """
    grid = [
        {'n_estimators': [50, 70, 100],
         # 1.0 (= use all features) is what 'auto' meant for regressors. The 'auto' alias
         # was deprecated in scikit-learn 1.1 and removed in 1.3, where every candidate
         # using it fails to fit inside the grid search.
         'max_features': [1.0, 'sqrt'],
         'max_depth': [5, 10, 15, 20],
         'min_samples_split': [2, 4, 6],
         'min_samples_leaf': [1],
         'bootstrap': [True]}]

    # 5-fold CV, candidates ranked by R-squared.
    rf = GridSearchCV(RandomForestRegressor(), param_grid=grid, cv=5, scoring='r2')
    rf.fit(train_x, train_y)
    y_predicted_test = rf.predict(test_x)

    r2, rmse, mape = eval_results(actual=test_y, predicted=y_predicted_test, show=vis)
    return r2, rmse, mape


random_forests(X_train, y_train, X_test, y_test, True)

R_squared: 0.285144301605625
MAPE: 0.8360945732237236
RMSE: 20.766273480952577
MAE: 17.195510204081636
CORR: 0.5455414619322225


(0.285144301605625, 20.766273480952577, 0.8360945732237236)

### 1.8 LSTM RNN

In [270]:
import keras.layers
from keras.models import Sequential


def lstm_rnn(train_x, train_y, test_x, test_y, vis):
    """Train a three-layer stacked LSTM on one patient and score it on the test split.

    Parameters
    ----------
    train_x, test_x : DataFrame
        Feature frames.
    train_y, test_y : Series
        Targets.
    vis : bool
        Forwarded to ``eval_results`` as ``show``.

    Returns
    -------
    tuple
        ``(r2, rmse, mape)`` on the test split.
    """
    # NOTE(review): the test features are z-scored with their own mean/std rather than
    # the training statistics - confirm this is intended.
    scaled_train = standardize(train_x).fillna(0)
    scaled_test = standardize(test_x).fillna(0)

    # Insert a length-1 middle axis: every row becomes a one-step sequence,
    # giving arrays of shape (rows, 1, columns).
    train_seq = scaled_train.values[:, np.newaxis, :]
    test_seq = scaled_test.values[:, np.newaxis, :]

    layers = [
        keras.layers.LSTM(100, return_sequences=True, input_shape=train_seq.shape[1:]),
        keras.layers.Dropout(0.25),
        keras.layers.LSTM(units=50, return_sequences=True),
        keras.layers.Dropout(0.20),
        keras.layers.LSTM(units=10, return_sequences=False),
        keras.layers.Dense(units=1, activation='linear'),
    ]
    network = Sequential(layers)
    network.compile(loss='mae', optimizer='adam')
    # shuffle=False: batches are drawn in the order the rows are given.
    network.fit(train_seq, train_y.values, epochs=15, batch_size=4, verbose=0, shuffle=False)

    predictions = network.predict(test_seq).flatten()

    return eval_results(actual=test_y, predicted=predictions, show=vis)


lstm_rnn(X_train, y_train, X_test, y_test, True)

R_squared: 0.07775874048200226
MAPE: 0.8882427120793301
RMSE: 23.586942665661006
MAE: 20.90448520245254
CORR: 0.5333149994152293


(0.07775874048200226, 23.586942665661006, 0.8882427120793301)

### 1.9 MTGNN

In [271]:
import torch
import torch.nn.functional as f
from torch_geometric_temporal.nn.recurrent.gconv_gru import GConvGRU


class RecurrentGCN(torch.nn.Module):
    """One GConvGRU layer followed by ReLU and a linear read-out to a single value.

    Parameters
    ----------
    node_features : int
        First argument passed to ``GConvGRU``.
    filters : int
        Second argument passed to ``GConvGRU``; also the input width of the read-out.
    """

    def __init__(self, node_features, filters):
        super().__init__()
        # Third positional argument fixed at 2 - presumably the filter size K of the
        # graph convolution; confirm against the torch_geometric_temporal docs.
        self.recurrent = GConvGRU(node_features, filters, 2)
        self.linear = torch.nn.Linear(filters, 1)

    def forward(self, x, edge_index, edge_weight):
        """Run recurrent layer -> ReLU -> linear read-out and return the result."""
        hidden = f.relu(self.recurrent(x, edge_index, edge_weight))
        return self.linear(hidden)

### 1.10 Evaluating Performance on the Entire Dataset

In [272]:
import warnings


def average_metrics(r2_list, rmse_list, mape_list):
    """Print the mean of each per-patient metric list (R-squared, RMSE, MAPE)."""
    report = (
        ('Average R_Squared:', r2_list),
        ('Average RMSE:', rmse_list),
        ('Average MAPE:', mape_list),
    )
    for label, values in report:
        print(label, np.mean(values))


def evaluate_models(train_list, test_list):
    """Fit every model on every patient and print the metrics averaged over patients.

    For each patient the pre-made split is taken from ``prepare_data`` and the models are
    fitted in the order Elastic-Net, Lasso, linear SVM, kNN, XGBoost, random forest,
    LSTM. Per-patient R-squared is clipped at 0 before averaging (``max(0, r2)``), so
    the reported average R-squared is an upper bound on the plain mean.

    Parameters
    ----------
    train_list, test_list : sequence of DataFrame
        Per-patient train and test frames, in the same order and of equal length.

    Returns
    -------
    None
        Results are printed: one averaged block per model, then the patient IDs.
    """
    assert len(train_list) == len(test_list)

    # (report title, fitting function), listed in the order the models are fitted.
    models = [
        ('Elastic-Net', elastic_net),
        ('Lasso Regression', lasso_reg),
        ('Linear SVM', linear_svm),
        ('kNN Regression', knn_reg),
        ('XGBoost', xgboost_reg),
        ('Random Forest', random_forests),
        ('LSTM', lstm_rnn),
    ]
    # The report lists Lasso before Elastic-Net; the remaining order matches `models`.
    report_order = ['Lasso Regression', 'Elastic-Net', 'Linear SVM', 'kNN Regression',
                    'XGBoost', 'Random Forest', 'LSTM']

    scores = {title: {'r2': [], 'rmse': [], 'mape': []} for title, _ in models}
    patient_ids = []

    for patient_idx in range(len(train_list)):
        # Build and evaluate a model for every single patient
        train_x, test_x, train_y, test_y = prepare_data(patient_idx, train_list=train_list,
                                                        test_list=test_list)
        # Positional access: a label lookup (['ID'][0]) fails when row label 0 is absent.
        patient_ids.append(train_list[patient_idx]['ID'].iloc[0])

        for title, fit_and_score in models:
            r2, rmse, mape = fit_and_score(train_x, train_y, test_x, test_y, False)
            scores[title]['r2'].append(max(0, r2))
            scores[title]['rmse'].append(rmse)
            scores[title]['mape'].append(mape)

    for title in report_order:
        print('---- ' + title + ' Results ----')
        average_metrics(scores[title]['r2'], scores[title]['rmse'], scores[title]['mape'])
        print('---------------------------------')

    print('Included patient list:')
    print(patient_ids)


# NOTE(review): with no category or module filter this hides every warning raised from
# here on (convergence, deprecation, failed grid-search fits, ...), not only the ones
# emitted during the evaluation below.
warnings.filterwarnings("ignore")
evaluate_models(train_df, test_df)

Patient ID: 1
Patient ID: 2
Patient ID: 3
Patient ID: 4
Patient ID: 5
Patient ID: 6
Patient ID: 7
Patient ID: 8
Patient ID: 9
Patient ID: 10
Patient ID: 12
Patient ID: 14
Patient ID: 15
Patient ID: 16
Patient ID: 17
Patient ID: 18
Patient ID: 19
Patient ID: 20
Patient ID: 21
Patient ID: 22
Patient ID: 23
Patient ID: 24
Patient ID: 25
Patient ID: 26
Patient ID: 27
Patient ID: 29
Patient ID: 30
Patient ID: 32
---- Lasso Regression Results ----
Average R_Squared: 0.1770519351340509
Average RMSE: 14.717961396686716
Average MAPE: 1.5472097567998122
---------------------------------
---- Elastic-Net Results ----
Average R_Squared: 0.194006559997254
Average RMSE: 14.771363649360774
Average MAPE: 1.4156218369947697
---------------------------------
---- Linear SVM Results ----
Average R_Squared: 0.1915070799352016
Average RMSE: 14.380320413866242
Average MAPE: 1.2752770423667938
---------------------------------
---- kNN Regression Results ----
Average R_Squared: 0.11997494708105814
Average RM

# 2. Nomothetic Models Regression

### In separate notebook