# Notebook for testing scikit-learn models

## Imports

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('display.float_format', lambda x: '%.4f' % x)
import seaborn as sns
sns.set_context("paper", font_scale=1.3)
sns.set_style('white')
import warnings
warnings.filterwarnings('ignore')
import datetime
import matplotlib.ticker as tkr
import math
import keras
import os.path
import sklearn as sk
import seaborn as sns
from scipy import stats
from statsmodels.tsa.stattools import adfuller
from sklearn import preprocessing
from statsmodels.tsa.stattools import pacf
%matplotlib inline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import label_binarize
from sklearn.metrics import multilabel_confusion_matrix, max_error
from sklearn.metrics import roc_curve, roc_auc_score, accuracy_score, precision_score, confusion_matrix, recall_score 
from sklearn.metrics import f1_score, auc, matthews_corrcoef, mean_absolute_error, mean_squared_error
from sklearn.metrics import classification_report
from sklearn.feature_selection import SelectPercentile, f_regression, mutual_info_regression, SelectKBest


from os import path
from time import time
from IPython.display import display
from IPython.display import SVG
from sklearn.model_selection import train_test_split

from numpy.random import seed
seed(20)
import tensorflow as tf
from keras import backend as K
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from scipy.stats import pearsonr
import lightgbm as lgb

from sklearn.linear_model import RidgeCV, LogisticRegression
from sklearn.svm import LinearSVR
from sklearn.ensemble import RandomForestRegressor
from xgboost.sklearn import XGBRegressor


from sklearn.ensemble import ExtraTreesRegressor

from sklearn.svm import SVR
from math import sqrt
from sklearn.metrics import r2_score, max_error

from sklearn.svm import LinearSVR

from sklearn.feature_selection import SelectFromModel
    
from sklearn.utils import shuffle

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import VotingRegressor
import xgboost as xgb
from sklearn.metrics.scorer import make_scorer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate, TimeSeriesSplit

pd.set_option('display.max_columns', None)
np.set_printoptions(suppress=True)

## Helper Methods (Function Definition)

In [None]:
import logging

class Logger:
    """Minimal wrapper that writes INFO-level messages to a log file.

    All instances share the module-level logger returned by
    ``logging.getLogger(__name__)``.  A file handler for ``filename`` is
    attached only if that logger does not already have one for the same
    file, so re-running the notebook cell or creating several ``Logger``
    objects for one file does not duplicate every log line.

    Parameters
    ----------
    filename : str or path-like
        Path of the log file; it is opened by ``logging.FileHandler``.
    """

    def __init__(self, filename):
        # Function-scope import so the class does not depend on another cell.
        import os

        self.logger = logging.getLogger(__name__)
        self.logger.setLevel(logging.INFO)

        # FileHandler stores the absolute path in ``baseFilename``; compare
        # against it to detect a handler that was attached earlier.
        target = os.path.abspath(os.fspath(filename))
        already_attached = any(
            isinstance(handler, logging.FileHandler)
            and handler.baseFilename == target
            for handler in self.logger.handlers
        )
        if already_attached:
            return

        file_handler = logging.FileHandler(filename)
        file_handler.setLevel(logging.INFO)

        formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
        file_handler.setFormatter(formatter)

        self.logger.addHandler(file_handler)

    def log(self, message):
        """Write ``message`` to the shared logger at INFO level."""
        self.logger.info(message)

import sys

def spcc(y_true, y_pred, **kwargs):
    """Return the Pearson correlation coefficient of ``y_true`` and ``y_pred``.

    Only the correlation statistic from ``scipy.stats.pearsonr`` is returned;
    the accompanying p-value is discarded.  Extra keyword arguments are
    accepted and ignored.
    """
    pearson_result = pearsonr(y_true, y_pred)
    return pearson_result[0]
    
    
def windowed_dataset(series, window_size, batch_size, shuffle_buffer):
    """Turn ``series`` into batches of (window, next value) pairs.

    Sliding windows of ``window_size + 1`` consecutive elements are taken
    with a shift of one, dropping incomplete windows.  Each window is split
    into its first ``window_size`` elements and its last element, and the
    pairs are batched by ``batch_size`` with a prefetch of 1.

    ``shuffle_buffer`` is accepted but not used: no shuffle step is applied,
    so the windows keep their original order.
    """
    span = window_size + 1

    def split_inputs_and_target(window):
        return window[:-1], window[-1]

    windows = tf.data.Dataset.from_tensor_slices(series).window(
        span, shift=1, drop_remainder=True
    )
    flat_windows = windows.flat_map(lambda window: window.batch(span))
    pairs = flat_windows.map(split_inputs_and_target)
    return pairs.batch(batch_size).prefetch(1)
    
#Takes a dataframe with the holiday field and returns encoded dataframe.
def onehotholiday(select):
    """One-hot encode the ``Holiday`` column and move ``2to5`` to the end.

    Parameters
    ----------
    select : pandas.DataFrame
        Frame containing at least the columns ``'Holiday'`` and ``'2to5'``.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame in which ``'Holiday'`` is replaced by one indicator
        column per distinct holiday value (named ``'A0'``, ``'A1'``, ...),
        appended after the remaining columns, with ``'2to5'`` as the last
        column.
    """
    holiday = select[['Holiday']]

    # Newer scikit-learn releases call the dense-output switch
    # ``sparse_output``; older ones only know ``sparse``.
    try:
        enc = preprocessing.OneHotEncoder(sparse_output=False)
    except TypeError:
        enc = preprocessing.OneHotEncoder(sparse=False)

    onehotlabels = enc.fit_transform(holiday)
    column_values = ['A' + str(i) for i in range(np.shape(onehotlabels)[1])]

    # Reuse the caller's index so the join below lines up row by row even
    # when ``select`` does not carry a default 0..n-1 index.
    encoded = pd.DataFrame(data=onehotlabels, columns=column_values,
                           index=select.index)

    dataset = select.drop(columns=['Holiday']).join(encoded)
    # Re-inserting the popped column places the target last.
    dataset['2to5'] = dataset.pop('2to5')
    return dataset
    

def add_lookback(dataset, look_back, df):
    """Fill the lag columns ``0 .. look_back-1`` of ``df`` from ``dataset['2to5']``.

    For every row label ``i`` in ``0 .. len(dataset) - look_back - 1`` and
    every lag column ``j``, ``df.loc[i, j]`` is set to the value at position
    ``i + j`` of ``dataset['2to5']``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Source frame; its ``'2to5'`` column is read by position.
    look_back : int
        Number of lag columns to fill.
    df : pandas.DataFrame
        Target frame, modified in place.  It is expected to already contain
        the integer-named columns ``0 .. look_back-1`` and the row labels
        ``0 .. len(dataset) - look_back - 1``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    n_rows = len(dataset) - look_back
    if n_rows <= 0 or look_back <= 0:
        # Nothing to fill: the source is not longer than the window.
        return df

    values = dataset['2to5'].values
    row_labels = list(range(n_rows))
    for j in range(look_back):
        # A single .loc assignment per column writes into ``df`` itself;
        # chained ``df[j][i] = ...`` may write to a temporary instead.
        df.loc[row_labels, j] = values[j:j + n_rows]
    return df

## Initialization of Tested Models

In [None]:
# This cell builds every candidate regressor evaluated later in the notebook.
# Most models are wrapped in MultiOutputRegressor; the `stack_*` variables and
# the first LR / rid / et are the bare estimators passed to the stacking model.
from sklearn.multioutput import MultiOutputRegressor
from sklearn.neural_network import MLPRegressor
# NOTE: `nn` is assigned again with the same arguments at the end of this cell.
nn = MultiOutputRegressor( MLPRegressor(solver='lbfgs', alpha=1e-5,
                     hidden_layer_sizes=(5, 5, 5, 5, 5), random_state=42) )
# Unwrapped MLP, used only as a base learner of `stack` below.
stack_nn =  MLPRegressor(solver='lbfgs', alpha=1e-5,
                     hidden_layer_sizes=(5, 5, 5, 5, 5), random_state=42)
from sklearn.neighbors import KNeighborsRegressor
neigh = MultiOutputRegressor(KNeighborsRegressor(n_neighbors=7))
# Unwrapped k-NN, used only as a base learner of `stack` below.
stack_neigh = KNeighborsRegressor(n_neighbors=7)

from mlxtend.regressor import StackingCVRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import Ridge
# Here LR is a plain LinearRegression; the name is rebound to a
# MultiOutputRegressor further down, after `stack` has captured this instance.
LR = LinearRegression()
rid = Ridge(alpha=1.0)
et = ExtraTreesRegressor(n_estimators=1000, random_state=0)

# Stacked ensemble: five base regressors with `rid` as the meta-regressor.
stack = MultiOutputRegressor(StackingCVRegressor(regressors=(LR, rid, et, stack_neigh, stack_nn),
                            meta_regressor=rid, 
                            random_state=42,
                            use_features_in_secondary=True))




# Gradient-boosted tree models (LightGBM and XGBoost).
lgbm = MultiOutputRegressor(lgb.LGBMRegressor(num_leaves=31,
                        learning_rate=0.05,
                        n_estimators=1000,n_jobs=-1))

XGBModel = MultiOutputRegressor(XGBRegressor(learning_rate =0.05,
 n_estimators=2000,
 max_depth=5,
 min_child_weight=1,
 gamma=0.1,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'reg:squarederror',
 nthread=5,
 scale_pos_weight=1,
 seed=21,
 eval_metric = ['mae']))

ETCModel  = MultiOutputRegressor(ExtraTreesRegressor(n_estimators=1000, random_state=0))

from sklearn.gaussian_process import GaussianProcessRegressor
gpr = MultiOutputRegressor(GaussianProcessRegressor())

# Linear models.
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import LassoCV
Lcv = MultiOutputRegressor(LassoCV(cv=5, random_state=0))
# Rebinds LR: from here on the name refers to the multi-output wrapper.
LR = MultiOutputRegressor(LinearRegression())
SGD = MultiOutputRegressor(SGDRegressor(max_iter=1000, tol=1e-3))



from sklearn.linear_model import Ridge
ridge = MultiOutputRegressor(Ridge(alpha=1.0))

from sklearn.linear_model import ElasticNetCV
els = MultiOutputRegressor(ElasticNetCV(cv=10, random_state=0,max_iter=100000) )  

from sklearn.tree import DecisionTreeRegressor   
# NOTE(review): newer scikit-learn releases appear to spell this criterion
# "absolute_error" - confirm against the installed version.
dt = MultiOutputRegressor(DecisionTreeRegressor(random_state=0,criterion="mae") )

from sklearn.kernel_ridge import KernelRidge
kridge = MultiOutputRegressor(KernelRidge())

from sklearn.linear_model import BayesianRidge
bridge = MultiOutputRegressor(BayesianRidge())

# Support-vector regressors with default hyper-parameters.
from sklearn.svm import SVR, NuSVR, LinearSVR
svr = MultiOutputRegressor(SVR())
nusvr = MultiOutputRegressor(NuSVR())
linsvr = MultiOutputRegressor(LinearSVR())

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import VotingRegressor
from sklearn.linear_model import SGDRegressor

# Base learners of the voting ensemble `VR` defined below.
reg1 = GradientBoostingRegressor(random_state=1, n_estimators=10)
reg2 = RandomForestRegressor(random_state=1, n_estimators=10)
reg3 = LinearRegression()
reg4 = DecisionTreeRegressor(random_state=0,criterion="mae")
reg5 = SGDRegressor(max_iter=1000, tol=1e-3)

VR = MultiOutputRegressor(
    VotingRegressor(estimators=[('gb', reg1), ('rf', reg2), ('lr', reg3), ('dt',reg4), ('sgd',reg5)]))

from sklearn.neural_network import MLPRegressor
# Second assignment of `nn`; same arguments as the one at the top of the cell.
nn = MultiOutputRegressor( MLPRegressor(solver='lbfgs', alpha=1e-5,
                     hidden_layer_sizes=(5, 5, 5, 5, 5), random_state=42) )

# 10-fold time-series splitter; not referenced in the visible part of the notebook.
tss = TimeSeriesSplit(n_splits = 10)

## Actual Test

In [None]:
# Experiment driver: for k = 1..MAX_K, keep the k best features, fit every
# model in `clflist` on the leading rows and record its MAE on the final
# TEST_SIZE rows in `recording` (one 'Run<j>' column per k).
np.random.seed(42)  # fix random seed for reproducibility

# Load the dataset and drop the columns that are not used below.
dataframe = pd.read_csv('Data/RestaurantDataVets_All_2to5.csv')
data = dataframe.drop(columns=['Index','Group','DMY','MissingPrevDays','DailyAvg','DailyBusyness'])

lookback=20
# Drop the rows labelled 0..lookback-1.
dataframe_removed_lookback = data.drop(list(range(lookback)))

# Placeholder lag columns 0..lookback-1; add_lookback() overwrites them.
for i in range(lookback):
    dataframe_removed_lookback[i] = 1.0
    
df = dataframe_removed_lookback[['Year', 'Day', 'January','February',
                         'March','April','May','June','July',
                         'August', 'September', 'October', 'November',
                         'December','Sunday', 'Monday', 'Tuesday',
                         'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Holiday', 'Carnival', 
                         'LentFasting','Ramadan','ChristmasSeason',
                         'WeeklyAvg','MinSales','MaxSales',
                         'WeeklyBusyness',
                         0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
                          14, 15, 16, 17, 18, 19,      
                         '2to5']]
df = df.reset_index(drop=True)

# Objects need to be converted to float due to missing values at load time.
for col in ("WeeklyAvg", "MinSales", "MaxSales", "WeeklyBusyness"):
    df[col] = df[col].astype(float)

lb_data = add_lookback(data, lookback, df)
lb_data = lb_data.reset_index(drop=True)

hotdata = onehotholiday(lb_data)
# Lag columns 14..19 are removed from the features; they are used as
# targets instead (see `y` below).
hotdata = hotdata.drop(columns=[14,15,16,17,18,19])

hot_numcols = len(hotdata.columns)
dataset = hotdata.values

lbset=lb_data.values
lb_numcols =  len(lb_data.columns)

print("train_df Shape:" ,lb_data.shape)
print("After encoding:", hotdata.shape)

# Features: every encoded column except the last one ('2to5').
X=dataset[:, 0:hot_numcols-1]
# Targets: the last seven columns of lb_data (lag columns 14..19 and '2to5').
y=lbset[:, lb_numcols-7:lb_numcols]

# NOTE(review): the scaler here and SelectKBest below are fit on all rows,
# including the final rows later used as the test split - confirm that this
# is intended, since it lets test-set statistics influence preprocessing.
scaler = preprocessing.RobustScaler()
X = scaler.fit_transform(X,y)

sys.stdout.flush()

# NOTE(review): LR appears twice in this list, so it is fitted and recorded
# twice per run - presumably unintentional; verify before removing.
clflist=[stack,LR,VR,linsvr,SGD , nn, svr, nusvr, bridge, dt, els,LR,ridge,kridge, 
         Lcv,  gpr, ETCModel, neigh, XGBModel, lgbm]  
recording = pd.DataFrame(clflist,columns=['Model'])

scoring = {
        'mae': 'neg_mean_absolute_error',
        'mse' : 'neg_mean_squared_error',
        'custom': make_scorer(spcc, greater_is_better=True)
    }

MAX_K = 71       # largest number of features to keep
TEST_SIZE = 221  # number of trailing rows held out for testing
n_models = len(clflist)
# SelectKBest rejects k larger than the number of available features.
max_k = min(MAX_K, X.shape[1])

j_arr = []
for j in range(max_k):
    k = j + 1
    print('Keeping k-best features where k = ', k)
    mae_arr = [0] * n_models
    max_arr = [0] * n_models
    rmse_arr = [0] * n_models
    corr_arr = [0] * n_models

    # Feature selection and the split do not depend on the model, so they
    # are computed once per k rather than once per classifier.
    feat_reduction = SelectKBest(f_regression, k=k)
    X_new = feat_reduction.fit_transform(X,y[:,0])
    X_train, X_test, y_train, y_test = train_test_split(
        X_new, y, test_size=TEST_SIZE, random_state=42, shuffle=False)

    for i, clf in enumerate(clflist):
        print(clf)

        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        # Errors are averaged over all targets by flattening both arrays.
        meansq = keras.metrics.mean_squared_error(y_test.flatten(), y_pred.flatten()).numpy()
        meanabs = keras.metrics.mean_absolute_error(y_test.flatten(), y_pred.flatten()).numpy()
        print("MSE = "+str(meansq))
        print("MAE = "+str(meanabs))
        mae_arr[i]=meanabs
        print('_______________________________________________________________________________')

    j_arr.append(k)
    recording['Run'+str(j)] = mae_arr

corr_arr = np.array(corr_arr)
print("COMPLETE")