In [2]:
import math
import numpy  as np
import pandas as pd
import random
import pickle
import warnings
import inflection
import seaborn as sns

from scipy                 import stats  as ss
from matplotlib            import pyplot as plt
from IPython.display       import Image
from IPython.core.display  import HTML


from sklearn.metrics       import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
from sklearn.ensemble      import RandomForestRegressor
from sklearn.linear_model  import LinearRegression, Lasso
from sklearn.preprocessing import RobustScaler, MinMaxScaler, LabelEncoder
from src.tools             import ml_error
from src.tools             import timeSeries_CV
import xgboost as xgb

warnings.filterwarnings( 'ignore' )

In [3]:
# Load the train/test split that was written to disk in step 6
X_train, y_train = pd.read_csv('data/X_train.csv'), pd.read_csv('data/y_train.csv')
X_test, y_test = pd.read_csv('data/X_test.csv'), pd.read_csv('data/y_test.csv')

## 6.3 Manual Feature Selection 

In [4]:
# Manually chosen feature columns; the order here becomes the column order of X.
cols_select = [
    # store / promotion columns
    'store', 'promo', 'store_type',
    # competition columns
    'competition_distance',
    'competition_open_since_month',
    'competition_open_since_year',
    # promo2 and elapsed-time columns
    'promo2_since_week', 'competition_time_month', 'promo_time_week',
    # sin/cos date columns
    'day_of_week_sin', 'day_of_week_cos',
    'month_cos', 'month_sin',
    'day_sin', 'day_cos',
]

## 07 - MACHINE LEARNING MODELLING

In [5]:
# Keep only the manually selected feature columns in both splits
X_train, X_test = X_train[cols_select], X_test[cols_select]

### 7.1 Average Model

In [6]:
# Baseline: predict, for every test row, the mean sales of its store.
# NOTE(review): the per-store means are computed from the test targets
# themselves and then scored against those same targets, so this number is
# optimistic. The next cell repeats the baseline using train-set means only.
aux1 = X_test.copy()
aux1['sales'] = y_test.copy()

# prediction: one mean per store, joined back onto every test row
aux2 = (
    aux1[['store', 'sales']]
    .groupby('store', as_index=False)
    .mean()
    .rename(columns={'sales': 'predictions'})
)
aux1 = aux1.merge(aux2, how='left', on='store')
yhat_baseline = aux1['predictions']

# performance: expm1 is applied to both targets and predictions before scoring
baseline_result = ml_error(' Average Model', np.expm1(y_test), np.expm1(yhat_baseline))
baseline_result

Unnamed: 0,Model Name,MAE,MAPE,RMSE
0,Average Model,1354.800353,0.2064,1835.135542


In [7]:
# Baseline using the past to predict the future: per-store means are taken
# from the training split and scored on the test split.
aux1 = X_train.copy()
aux1['sales'] = y_train.copy()

# prediction: train-set mean per store, joined onto the test rows
aux2 = (
    aux1[['store', 'sales']]
    .groupby('store', as_index=False)
    .mean()
    .rename(columns={'sales': 'predictions'})
)
aux1 = X_test.merge(aux2, how='left', on='store')
yhat_baseline = aux1['predictions']

# performance: expm1 is applied to both targets and predictions before scoring
baseline_result = ml_error(' Average Model', np.expm1(y_test), np.expm1(yhat_baseline))
baseline_result

Unnamed: 0,Model Name,MAE,MAPE,RMSE
0,Average Model,1429.763326,0.216814,1939.32873


In [32]:
# Stack train and test back together so cross-validation can use all rows
X = pd.concat([X_train, X_test])
y = pd.concat([y_train, y_test])

for frame, path in ((X, 'data/X.csv'), (y, 'data/y.csv')):
    frame.to_csv(path, index = False)

# Dataset tracker: one (name, description) pair per dataset used in the project
entries = [
    ('sample_submission', 'Dados para submissão no kaggle'),
    ('store', 'Tabela contendo informações sobre as lojas'),
    ('test', 'Dados para test'),
    ('train', 'Dados para train'),
    ('df5_transformado', ' Dados que sofreram transformações para o treino de ML (Robust Scaller, Encoding, transformacao de natureza e transformaçao da variavel target por log)'),
    ('X_train', "Dados filtrado para treino (X_train = df6[df6['date'] < '2015-06-19']), copia do df5_transformado"),
    ('X_test', "Dados filtrado para test (X_test = df6[df6['date'] >= '2015-06-19']), copia do df5_transformado"),
    ('y_train', 'Dados com valores de vendas para treino, copia do df5_transformado'),
    ('y_test', 'Dados com valores de vendas para test, copia do df5_transformado'),
    ('X', 'Dataset sendo a uniao dos datasets X_train e X_test, para aplicação do cross-validation'),
    ('y', 'Dataset sendo a uniao dos datasets y_train e y_test, para aplicação do cross-validation'),
]
datasets_dics = [{'nome': nome, 'Descricao': descricao} for nome, descricao in entries]

# Persist the tracker next to the data files
datasets_info = pd.DataFrame(datasets_dics)
datasets_info.to_csv('data/datasets_info.csv', index = False)

In [9]:
# Number of feature columns in the combined dataset
X.shape[1]

15

### 7.2. Linear Regression Model

In [10]:
# Ordinary least-squares model, scored through the project's timeSeries_CV helper
lr = LinearRegression()

lr_result = timeSeries_CV(
    X,
    y,
    model=lr,
    model_name='Linear Regression',
    kfold=5,
)
lr_result

Unnamed: 0,Model Name,MAE CV,MAPE CV,RMSE CV
0,Linear Regression,2045.67 +/- 107.91,0.31 +/- 0.02,2974.36 +/- 191.87


### 7.3. Linear Regression Regularized Model - Lasso


In [11]:
# L1-regularised linear model (alpha = 0.001), scored through timeSeries_CV
lrr = Lasso(alpha=1e-3)

lrr_result = timeSeries_CV(
    X,
    y,
    model=lrr,
    model_name='Linear Regression - Lasso',
    kfold=5,
)
lrr_result

Unnamed: 0,Model Name,MAE CV,MAPE CV,RMSE CV
0,Linear Regression - Lasso,2019.43 +/- 83.61,0.32 +/- 0.01,2880.73 +/- 153.68


### 7.4. Random Forest Regressor

In [12]:
# Random forest with 100 trees; fixed seed and all cores (n_jobs=-1)
rf = RandomForestRegressor(
    random_state=42,
    n_estimators=100,
    n_jobs=-1,
)

# Score through the project's timeSeries_CV helper
rf_result = timeSeries_CV(
    X,
    y,
    model=rf,
    model_name='Random Forest Regressor',
    kfold=5,
)
rf_result

Unnamed: 0,Model Name,MAE CV,MAPE CV,RMSE CV
0,Random Forest Regressor,957.11 +/- 219.69,0.15 +/- 0.03,1507.6 +/- 318.76


### 7.5. XGBoost Regressor

In [13]:
# XGBoost regressor with library-default hyperparameters
model_xgb = xgb.XGBRegressor()

# Score through the project's timeSeries_CV helper
xgb_result = timeSeries_CV(
    X,
    y,
    model=model_xgb,
    model_name='XGBoost Regressor',
    kfold=5,
)
xgb_result

Unnamed: 0,Model Name,MAE CV,MAPE CV,RMSE CV
0,XGBoost Regressor,1119.59 +/- 176.27,0.17 +/- 0.02,1603.97 +/- 277.82


### 7.6 Compare Models' Performance

In [33]:
# Stack the per-model cross-validation summaries into a single table.
modelling_result = pd.concat( [lr_result, lrr_result, rf_result, xgb_result] )

# Rank the models from lowest to highest RMSE and KEEP the result: the
# original call discarded the return value of sort_values, so neither the
# in-memory table nor the saved CSV was ever ordered.
# 'RMSE CV' is rendered as "mean +/- std" text, so the sort key is the leading
# mean parsed as a number; sorting the raw strings would be lexicographic.
modelling_result = modelling_result.sort_values(
    'RMSE CV',
    key=lambda col: pd.to_numeric( col.astype( str ).str.split().str[0] ),
)

# Same to_csv call as before, so the file layout (index column included) is unchanged.
modelling_result.to_csv( 'data/modelling_result.csv')

# Show the ranked comparison as the cell output.
modelling_result


In [34]:
# Register the model-performance table in the dataset tracker.
row = {'nome': 'modelling_result', 'Descricao': 'tabela com a performance dos modelos'}

# Remove any existing entry with the same name before appending, so that
# re-running this cell replaces the row instead of adding a duplicate.
datasets_info = pd.concat(
    [datasets_info[datasets_info['nome'] != row['nome']], pd.DataFrame([row])],
    ignore_index=True,
)
datasets_info.to_csv('data/datasets_info.csv', index= False)

In [35]:
# Display the dataset tracker after the modelling_result entry was added
datasets_info

Unnamed: 0,nome,Descricao
0,sample_submission,Dados para submissão no kaggle
1,store,Tabela contendo informações sobre as lojas
2,test,Dados para test
3,train,Dados para train
4,df5_transformado,Dados que sofreram transformações para o trei...
5,X_train,Dados filtrado para treino (X_train = df6[df6[...
6,X_test,Dados filtrado para test (X_test = df6[df6['da...
7,y_train,"Dados com valores de vendas para treino, copia..."
8,y_test,"Dados com valores de vendas para test, copia d..."
9,X,Dataset sendo a uniao dos datasets X_train e X...
