### **Import Libraries**

In [2]:
import pandas as pd
import numpy as np
import datetime
import seaborn as sb
import matplotlib.pyplot as plt
from matplotlib.dates import YearLocator, MonthLocator, DateFormatter

import pycaret.time_series
from pycaret.regression import *
import xgboost

from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, MinMaxScaler
from sklearn.model_selection import TimeSeriesSplit
import sklearn

import json
import glob
import os

### **Load Data**

In [20]:
data = pd.read_csv('data/input/non_standardized_train.csv', index_col=0)
data_l = pd.read_csv('data/input/non_standardized_train_log_sales.csv', index_col=0)

data_s = pd.read_csv('data/input/standardized_train.csv', index_col=0)
data_sl = pd.read_csv('data/input/standardized_train_log_sales.csv', index_col=0)


### **Type 1: Not Standardized, Sales Unchanged**

In [65]:
from sklearn.model_selection import train_test_split, KFold
# train_data, test_data = train_test_split(data, test_size=0.2, random_state=42, shuffle=True)

kf = KFold(n_splits=5)

setup_1 = setup(data = data, test_data = data, target = 'sales', 
                fold_strategy = kf, transform_target = False, preprocess=False, data_split_shuffle = True,
                session_id = 0)

best_model_1 = compare_models(exclude = ["lar"], n_select = 3)

Unnamed: 0,Description,Value
0,Session id,0
1,Target,sales
2,Target type,Regression
3,Original data shape,"(4422, 166)"
4,Transformed data shape,"(4422, 166)"
5,Transformed train set shape,"(2211, 166)"
6,Transformed test set shape,"(2211, 166)"
7,Numeric features,165


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,6.2047,246.1775,15.3094,0.6832,0.4556,0.564,0.57
et,Extra Trees Regressor,6.4866,269.5358,16.0685,0.6534,0.4601,0.6195,0.268
rf,Random Forest Regressor,6.7824,276.1702,16.1372,0.6441,0.4852,0.6889,0.256
xgboost,Extreme Gradient Boosting,6.6715,277.7125,16.3602,0.6382,0.4814,0.6317,0.236
gbr,Gradient Boosting Regressor,6.9205,283.223,16.3955,0.6354,0.5194,0.7146,0.14
lightgbm,Light Gradient Boosting Machine,6.8525,290.3424,16.6841,0.6223,0.4777,0.6372,0.026
knn,K Neighbors Regressor,8.8327,365.1146,18.8511,0.5221,0.6044,0.9001,0.008
huber,Huber Regressor,9.0209,480.0521,21.2939,0.3416,0.6374,0.9385,0.066
dt,Decision Tree Regressor,9.7269,521.3529,22.7216,0.3061,0.6327,0.7646,0.012
ada,AdaBoost Regressor,19.041,601.6327,24.4009,0.1913,1.2465,3.7481,0.068


In [66]:
for i in best_model_1:
    print("Model Type: ", i)
    print("Hyperparamters: ", i.get_params())
    print("+++++++++++++++++++++++++++++++++")

Model Type:  <catboost.core.CatBoostRegressor object at 0x1538d0550>
Hyperparamters:  {'loss_function': 'RMSE', 'border_count': 254, 'verbose': False, 'task_type': 'CPU', 'random_state': 0}
+++++++++++++++++++++++++++++++++
Model Type:  ExtraTreesRegressor(n_jobs=-1, random_state=0)
Hyperparamters:  {'bootstrap': False, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': None, 'max_features': 1.0, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': -1, 'oob_score': False, 'random_state': 0, 'verbose': 0, 'warm_start': False}
+++++++++++++++++++++++++++++++++
Model Type:  RandomForestRegressor(n_jobs=-1, random_state=0)
Hyperparamters:  {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': None, 'max_features': 1.0, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_sa

### **Type 2: Not Standardized, Log Sales**

In [67]:
#train_data_l, test_data_l = train_test_split(data_l, test_size=0.2, random_state=42, shuffle=True)

kf = KFold(n_splits=5)

setup_2 = setup(data = data_l, test_data = data_l, target = 'sales', 
                fold_strategy = kf, transform_target = False, preprocess=False, data_split_shuffle = True,
                session_id = 0)

best_model_2 = compare_models(exclude = ["lar"], n_select = 3)

Unnamed: 0,Description,Value
0,Session id,0
1,Target,sales
2,Target type,Regression
3,Original data shape,"(4422, 166)"
4,Transformed data shape,"(4422, 166)"
5,Transformed train set shape,"(2211, 166)"
6,Transformed test set shape,"(2211, 166)"
7,Numeric features,165


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,0.307,0.1664,0.4065,0.8704,0.1599,0.1586,0.596
lightgbm,Light Gradient Boosting Machine,0.3215,0.1809,0.424,0.8593,0.1633,0.1684,0.028
et,Extra Trees Regressor,0.3305,0.1853,0.4298,0.8558,0.1668,0.1741,0.256
gbr,Gradient Boosting Regressor,0.3327,0.1945,0.44,0.8487,0.1685,0.173,0.136
xgboost,Extreme Gradient Boosting,0.3334,0.1965,0.4421,0.8472,0.1727,0.1757,0.232
rf,Random Forest Regressor,0.3493,0.2226,0.4701,0.8267,0.1853,0.1823,0.258
ada,AdaBoost Regressor,0.4266,0.2968,0.5441,0.769,0.2156,0.2186,0.074
dt,Decision Tree Regressor,0.5119,0.4719,0.686,0.6328,0.2657,0.2711,0.012
knn,K Neighbors Regressor,0.5191,0.4796,0.6915,0.6266,0.2699,0.2745,0.01
ridge,Ridge Regression,0.5994,0.8249,0.8881,0.3591,0.3029,0.3092,0.008


In [68]:
for i in best_model_2:
    print("Model Type: ", i)
    print("Hyperparamters: ", i.get_params())
    print("+++++++++++++++++++++++++++++++++")

Model Type:  <catboost.core.CatBoostRegressor object at 0x152dde0a0>
Hyperparamters:  {'loss_function': 'RMSE', 'border_count': 254, 'verbose': False, 'task_type': 'CPU', 'random_state': 0}
+++++++++++++++++++++++++++++++++
Model Type:  LGBMRegressor(random_state=0)
Hyperparamters:  {'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 1.0, 'importance_type': 'split', 'learning_rate': 0.1, 'max_depth': -1, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 100, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state': 0, 'reg_alpha': 0.0, 'reg_lambda': 0.0, 'silent': 'warn', 'subsample': 1.0, 'subsample_for_bin': 200000, 'subsample_freq': 0}
+++++++++++++++++++++++++++++++++
Model Type:  ExtraTreesRegressor(n_jobs=-1, random_state=0)
Hyperparamters:  {'bootstrap': False, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': None, 'max_features': 1.0, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease

### **Type 3: Standardized, Sales Unchanged**

In [69]:
#train_data_s, test_data_s = train_test_split(data_s, test_size=0.2, random_state=42, shuffle=True)

kf = KFold(n_splits=5)

setup_3 = setup(data = data_s, test_data = data_s, target = 'sales', 
                fold_strategy = kf, transform_target = False, preprocess=False, data_split_shuffle = True,
                session_id = 0)

best_model_3 = compare_models(exclude = ["lar"], n_select = 3)

Unnamed: 0,Description,Value
0,Session id,0
1,Target,sales
2,Target type,Regression
3,Original data shape,"(4422, 166)"
4,Transformed data shape,"(4422, 166)"
5,Transformed train set shape,"(2211, 166)"
6,Transformed test set shape,"(2211, 166)"
7,Numeric features,165


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,6.2046,246.1771,15.3094,0.6832,0.4556,0.564,0.556
et,Extra Trees Regressor,6.4866,269.5358,16.0685,0.6534,0.4601,0.6195,0.302
rf,Random Forest Regressor,6.7842,276.2141,16.1384,0.6441,0.4854,0.6894,0.302
xgboost,Extreme Gradient Boosting,6.6722,277.7497,16.3616,0.6381,0.4814,0.6317,0.254
gbr,Gradient Boosting Regressor,6.9208,283.2277,16.3957,0.6354,0.5194,0.7147,0.16
lightgbm,Light Gradient Boosting Machine,6.859,290.4653,16.6879,0.6222,0.4778,0.6374,0.05
en,Elastic Net,10.6874,463.687,21.4759,0.381,0.7866,1.4737,0.028
huber,Huber Regressor,8.5689,493.9272,21.289,0.3195,0.626,0.9514,0.094
dt,Decision Tree Regressor,9.726,521.3493,22.7215,0.3061,0.6327,0.7644,0.038
llar,Lasso Least Angle Regression,10.309,562.0998,23.0812,0.2326,0.749,1.3237,0.034


In [70]:
## Parameters
for i in best_model_3:
    print("Model Type: ", i)
    print("Hyperparamters: ", i.get_params())
    print("+++++++++++++++++++++++++++++++++")

Model Type:  <catboost.core.CatBoostRegressor object at 0x154587e50>
Hyperparamters:  {'loss_function': 'RMSE', 'border_count': 254, 'verbose': False, 'task_type': 'CPU', 'random_state': 0}
+++++++++++++++++++++++++++++++++
Model Type:  ExtraTreesRegressor(n_jobs=-1, random_state=0)
Hyperparamters:  {'bootstrap': False, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': None, 'max_features': 1.0, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': -1, 'oob_score': False, 'random_state': 0, 'verbose': 0, 'warm_start': False}
+++++++++++++++++++++++++++++++++
Model Type:  RandomForestRegressor(n_jobs=-1, random_state=0)
Hyperparamters:  {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': None, 'max_features': 1.0, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_sa

### **Type 4: Standardized, Log Sales**

In [71]:
#train_data_sl, test_data_sl = train_test_split(data_sl, test_size=0.2, random_state=42, shuffle=True)

kf = KFold(n_splits=5)

setup_4 = setup(data = data_sl, test_data = data_sl, target = 'sales', 
                fold_strategy = kf, transform_target = False, preprocess=False, data_split_shuffle = True,
                session_id = 0)

best_model_4 = compare_models(exclude = ["lar"], n_select = 3)

Unnamed: 0,Description,Value
0,Session id,0
1,Target,sales
2,Target type,Regression
3,Original data shape,"(4422, 166)"
4,Transformed data shape,"(4422, 166)"
5,Transformed train set shape,"(2211, 166)"
6,Transformed test set shape,"(2211, 166)"
7,Numeric features,165


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,0.307,0.1664,0.4064,0.8704,0.1599,0.1586,0.034
lightgbm,Light Gradient Boosting Machine,0.3215,0.181,0.4241,0.8592,0.1633,0.1684,0.05
et,Extra Trees Regressor,0.3305,0.1853,0.4298,0.8558,0.1668,0.1741,0.05
gbr,Gradient Boosting Regressor,0.3326,0.1945,0.44,0.8488,0.1684,0.173,0.034
xgboost,Extreme Gradient Boosting,0.3333,0.1963,0.4418,0.8474,0.1727,0.1757,0.03
rf,Random Forest Regressor,0.3495,0.2228,0.4703,0.8265,0.1854,0.1824,0.046
ada,AdaBoost Regressor,0.4264,0.2985,0.5451,0.7677,0.2161,0.217,0.038
dt,Decision Tree Regressor,0.5109,0.4704,0.685,0.6339,0.2645,0.27,0.034
ridge,Ridge Regression,0.6012,0.8284,0.8905,0.3563,0.3042,0.3112,0.036
br,Bayesian Ridge,0.6219,0.8441,0.9038,0.3439,0.3101,0.3196,0.032


In [72]:
## Parameters
for i in best_model_4:
    print("Model Type: ", i)
    print("Hyperparamters: ", i.get_params())
    print("+++++++++++++++++++++++++++++++++")

Model Type:  <catboost.core.CatBoostRegressor object at 0x15238cdf0>
Hyperparamters:  {'loss_function': 'RMSE', 'border_count': 254, 'verbose': False, 'task_type': 'CPU', 'random_state': 0}
+++++++++++++++++++++++++++++++++
Model Type:  LGBMRegressor(random_state=0)
Hyperparamters:  {'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 1.0, 'importance_type': 'split', 'learning_rate': 0.1, 'max_depth': -1, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 100, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state': 0, 'reg_alpha': 0.0, 'reg_lambda': 0.0, 'silent': 'warn', 'subsample': 1.0, 'subsample_for_bin': 200000, 'subsample_freq': 0}
+++++++++++++++++++++++++++++++++
Model Type:  ExtraTreesRegressor(n_jobs=-1, random_state=0)
Hyperparamters:  {'bootstrap': False, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': None, 'max_features': 1.0, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease

### **Conduct Feature Selection**

In [76]:
kf = KFold(n_splits=5)

setup_2a = setup(data = data_l, test_data = data_l, target = 'sales', 
                 fold_strategy = kf, transform_target = False, preprocess=False, data_split_shuffle = True,
                 feature_selection=True,
                 session_id = 0)

best_model_2a = compare_models(exclude = ["lar"], n_select = 3)

Unnamed: 0,Description,Value
0,Session id,0
1,Target,sales
2,Target type,Regression
3,Original data shape,"(4422, 166)"
4,Transformed data shape,"(4422, 166)"
5,Transformed train set shape,"(2211, 166)"
6,Transformed test set shape,"(2211, 166)"
7,Numeric features,165


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,0.307,0.1664,0.4065,0.8704,0.1599,0.1586,0.014
lightgbm,Light Gradient Boosting Machine,0.3215,0.1809,0.424,0.8593,0.1633,0.1684,0.026
et,Extra Trees Regressor,0.3305,0.1853,0.4298,0.8558,0.1668,0.1741,0.026
gbr,Gradient Boosting Regressor,0.3327,0.1945,0.44,0.8487,0.1685,0.173,0.01
xgboost,Extreme Gradient Boosting,0.3334,0.1965,0.4421,0.8472,0.1727,0.1757,0.008
rf,Random Forest Regressor,0.3493,0.2226,0.4701,0.8267,0.1853,0.1823,0.022
ada,AdaBoost Regressor,0.4266,0.2968,0.5441,0.769,0.2156,0.2186,0.012
dt,Decision Tree Regressor,0.5119,0.4719,0.686,0.6328,0.2657,0.2711,0.01
knn,K Neighbors Regressor,0.5191,0.4796,0.6915,0.6266,0.2699,0.2745,0.01
ridge,Ridge Regression,0.5994,0.8249,0.8881,0.3591,0.3029,0.3092,0.006
