In [3]:
import math
import numpy  as np
import pandas as pd
import random
import pickle
import warnings
import inflection
import seaborn as sns

from scipy                 import stats  as ss
from matplotlib            import pyplot as plt
from IPython.display       import Image
from IPython.core.display  import HTML


from sklearn.metrics       import mean_absolute_error, mean_squared_error
from sklearn.ensemble      import RandomForestRegressor
from sklearn.linear_model  import LinearRegression, Lasso
from sklearn.preprocessing import RobustScaler, MinMaxScaler, LabelEncoder

warnings.filterwarnings( 'ignore' )

In [4]:
df6 = pd.read_csv('data/df5_transformado.csv')

## 06 - FEATURE SELECTION

### 6.1 Split dataframe into training and test dataset

In [5]:
cols_drop = ['week_of_year', 'day', 'month', 'day_of_week', 'promo_since', 'competition_since', 'year_week' ]
df6 = df6.drop( cols_drop, axis=1 )

In [7]:
# training dataset
X_train = df6[df6['date'] < '2015-06-19']
y_train = X_train['sales']

# test dataset
X_test = df6[df6['date'] >= '2015-06-19']
y_test = X_test['sales']

print( 'Training Min Date: {}'.format( X_train['date'].min() ) )
print( 'Training Max Date: {}'.format( X_train['date'].max() ) )

print( '\nTest Min Date: {}'.format( X_test['date'].min() ) )
print( 'Test Max Date: {}'.format( X_test['date'].max() ) )

X_train.to_csv('data/X_train.csv', index= False)
y_train.to_csv('data/y_train.csv', index= False)

X_test.to_csv('data/X_test.csv', index= False)
y_test.to_csv('data/y_test.csv', index= False)

Training Min Date: 2013-01-01
Training Max Date: 2015-06-18

Test Min Date: 2015-06-19
Test Max Date: 2015-07-31


### 6.2. RFE as Feature Selector

In [14]:
from sklearn.feature_selection import RFE

In [17]:
# training and test dataset for Boruta
X_train_n = X_train.drop( ['date', 'sales'], axis=1 ).values

y_train_n = y_train.values.ravel()

# define RandomForestRegressor
rf = RandomForestRegressor( n_jobs=-1 )

selector = RFE(rf, step=1, verbose = 1)
selector = selector.fit(X_train_n, y_train_n)

Fitting estimator with 27 features.
Fitting estimator with 26 features.
Fitting estimator with 25 features.
Fitting estimator with 24 features.
Fitting estimator with 23 features.
Fitting estimator with 22 features.
Fitting estimator with 21 features.
Fitting estimator with 20 features.
Fitting estimator with 19 features.
Fitting estimator with 18 features.
Fitting estimator with 17 features.
Fitting estimator with 16 features.
Fitting estimator with 15 features.
Fitting estimator with 14 features.


In [41]:
train = X_train.drop( ['date', 'sales'], axis=1 )
selector.get_feature_names_out()
selector.get_support()
train.columns[selector.get_support()]

Index(['store', 'promo', 'store_type', 'competition_distance',
       'competition_open_since_month', 'competition_open_since_year',
       'promo2_since_week', 'competition_time_month', 'promo_time_week',
       'day_of_week_sin', 'day_of_week_cos', 'month_cos', 'day_cos'],
      dtype='object')

## 6.3 Manual Feature Selection 

In [48]:
cols_select = ['store',
                'promo',
                'store_type',
                'competition_distance',
                'competition_open_since_month',
                'competition_open_since_year',
                'promo2_since_week',
                'competition_time_month',
                'promo_time_week',
                'day_of_week_sin',
                'day_of_week_cos',
                'month_cos',
                'month_sin',
                'day_sin'
                'day_cos']
len(cols_select)

14

In [47]:
# Comparando a seleção do Boruta com o RFE

# cols select do boruta
cols_selected_boruta = [
    'store',
    'promo',
    'store_type',
    'assortment',
    'competition_distance',
    'competition_open_since_month',
    'competition_open_since_year',
    'promo2',
    'promo2_since_week',
    'promo2_since_year',
    'competition_time_month',
    'promo_time_week',
    'day_of_week_sin',
    'day_of_week_cos',
    'month_sin',
    'month_cos',
    'day_sin',
    'day_cos',
    'week_of_year_sin',
    'week_of_year_cos']

iguais = list(set(cols_select) & set(cols_selected_boruta))
diferentes = list(set(cols_selected_boruta) - set(cols_select))
print('Quantidade de colunas iguais: {}'.format(len(iguais)))
print('Quantidade de colunas diferentes: {}'.format(len(diferentes)))


Quantidade de colunas iguais: 13
Quantidade de colunas diferentes: 7
