## 0.0 - IMPORTAÇÕES DE PACOTES E MÓDULOS

In [1]:
import pandas as pd
import numpy as np
import inflection

## 1.0 - DESCRIÇÃO DOS DADOS

### 1.1 - Carregando os dados

In [2]:
# Read the raw train and store CSV files.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids columns holding mixed types.
# NOTE(review): the warning left in this cell's output is presumably the
# mixed-type DtypeWarning from chunked parsing - confirm it is gone on re-run.
df_train_raw = pd.read_csv('data/train.csv', low_memory=False)
df_store_raw = pd.read_csv('data/store.csv', low_memory=False)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


### 1.2 - Juntando os dois arquivos em um único Dataframe

In [3]:
# Left-join the store table onto every training row, matching on 'Store'
df_raw = df_train_raw.merge(df_store_raw, on='Store', how='left')

In [4]:
# Work on an independent (deep) copy so df_raw itself is left untouched
df1 = df_raw.copy(deep=True)

In [5]:
# Preview the first two rows of the merged frame
df1.head(n=2)

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,5,2015-07-31,5263,555,1,1,0,1,c,a,1270.0,9.0,2008.0,0,,,
1,2,5,2015-07-31,6064,625,1,1,0,1,a,a,570.0,11.0,2007.0,1,13.0,2010.0,"Jan,Apr,Jul,Oct"


In [6]:
# Convert every column name to snake_case
cols_old = df1.columns

cols_new = [inflection.underscore(col) for col in cols_old]

df1.columns = cols_new

In [7]:
# Show the first row to confirm the renamed columns
df1.head(n=1)

Unnamed: 0,store,day_of_week,date,sales,customers,open,promo,state_holiday,school_holiday,store_type,assortment,competition_distance,competition_open_since_month,competition_open_since_year,promo2,promo2_since_week,promo2_since_year,promo_interval
0,1,5,2015-07-31,5263,555,1,1,0,1,c,a,1270.0,9.0,2008.0,0,,,


### 1.3 - Verificando as dimensões do Dataframe

In [8]:
# Report the DataFrame dimensions: shape is (rows, columns)
n_rows, n_cols = df1.shape
print(f'Número de linhas: {n_rows}')
print(f'Número de colunas: {n_cols}')

Número de linhas: 1017209
Número de colunas: 18


### 1.4 - Verificando valores missing

In [9]:
# Count the missing values in each column
df1.isnull().sum()

store                                0
day_of_week                          0
date                                 0
sales                                0
customers                            0
open                                 0
promo                                0
state_holiday                        0
school_holiday                       0
store_type                           0
assortment                           0
competition_distance              2642
competition_open_since_month    323348
competition_open_since_year     323348
promo2                               0
promo2_since_week               508031
promo2_since_year               508031
promo_interval                  508031
dtype: int64

### 1.5 - Alterando o tipo da variável 'date'

In [11]:
# Parse the 'date' column into pandas datetime values
df1['date'] = pd.to_datetime(df1['date'])

### 1.6 - Substituindo valores NA

In [16]:
# competition_distance: a missing value may mean that no competitor exists,
# so fill it with a very large placeholder distance.
df1['competition_distance'] = df1['competition_distance'].fillna(200000.0)

# competition_open_since_month: fall back to the month of the row's own 'date'
df1['competition_open_since_month'] = df1['competition_open_since_month'].fillna(df1['date'].dt.month)

# competition_open_since_year: fall back to the year of the row's own 'date'
df1['competition_open_since_year'] = df1['competition_open_since_year'].fillna(df1['date'].dt.year)

# promo2_since_week: fall back to the ISO week number of the row's own 'date'.
# Series.dt.week is deprecated and removed in pandas 2.0; dt.isocalendar().week
# is the replacement. Cast to int64 so the fill values are plain integers.
iso_week = df1['date'].dt.isocalendar().week.astype('int64')
df1['promo2_since_week'] = df1['promo2_since_week'].fillna(iso_week)

# promo2_since_year: fall back to the year of the row's own 'date'
df1['promo2_since_year'] = df1['promo2_since_year'].fillna(df1['date'].dt.year)

# promo_interval: 0 marks rows that have no promo interval
df1['promo_interval'] = df1['promo_interval'].fillna(0)

# month_map: three-letter month abbreviation of the row's 'date', spelled the
# same way as the entries of promo_interval (e.g. "Jan,Apr,Jul,Oct")
month_map = {1:'Jan',2:'Feb',3:'Mar',4:'Apr',5:'May',6:'Jun',7:'Jul',8:'Aug',9:'Sep',10:'Oct',11:'Nov',12:'Dec', }

df1['month_map'] = df1['date'].dt.month.map(month_map)

# is_promo: 1 when the row's month is listed in its promo_interval, else 0.
# Rows whose promo_interval is the 0 placeholder are never in promo.
# A zip over the two columns replaces the much slower row-wise apply(axis=1).
df1['is_promo'] = [
    1 if interval != 0 and month in interval.split(',') else 0
    for interval, month in zip(df1['promo_interval'], df1['month_map'])
]


  df1.promo2_since_week = df1.promo2_since_week.fillna(df1.date.dt.week)


In [17]:
# Preview the first five rows after filling the missing values
df1.head(n=5)

Unnamed: 0,store,day_of_week,date,sales,customers,open,promo,state_holiday,school_holiday,store_type,assortment,competition_distance,competition_open_since_month,competition_open_since_year,promo2,promo2_since_week,promo2_since_year,promo_interval,month_map,is_promo
0,1,5,2015-07-31,5263,555,1,1,0,1,c,a,1270.0,9.0,2008.0,0,31.0,2015.0,0,Jul,0
1,2,5,2015-07-31,6064,625,1,1,0,1,a,a,570.0,11.0,2007.0,1,13.0,2010.0,"Jan,Apr,Jul,Oct",Jul,1
2,3,5,2015-07-31,8314,821,1,1,0,1,a,a,14130.0,12.0,2006.0,1,14.0,2011.0,"Jan,Apr,Jul,Oct",Jul,1
3,4,5,2015-07-31,13995,1498,1,1,0,1,c,c,620.0,9.0,2009.0,0,31.0,2015.0,0,Jul,0
4,5,5,2015-07-31,4822,559,1,1,0,1,a,a,29910.0,4.0,2015.0,0,31.0,2015.0,0,Jul,0


### 1.7 - Verificando o Tipo dos dados

In [18]:
# Display the dtype of every column in df1
df1.dtypes

store                                    int64
day_of_week                              int64
date                            datetime64[ns]
sales                                    int64
customers                                int64
open                                     int64
promo                                    int64
state_holiday                           object
school_holiday                           int64
store_type                              object
assortment                              object
competition_distance                   float64
competition_open_since_month           float64
competition_open_since_year            float64
promo2                                   int64
promo2_since_week                      float64
promo2_since_year                      float64
promo_interval                          object
month_map                               object
is_promo                                 int64
dtype: object

### 1.8 - Alterando os tipos das variáveis

In [19]:
# Cast the competition / promo2 month, week and year columns to int64
int_columns = (
    'competition_open_since_month',
    'competition_open_since_year',
    'promo2_since_week',
    'promo2_since_year',
)
for column in int_columns:
    df1[column] = df1[column].astype('int64')

In [20]:
# Print a summary of df1: index range, columns, non-null counts and dtypes
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1017209 entries, 0 to 1017208
Data columns (total 20 columns):
 #   Column                        Non-Null Count    Dtype         
---  ------                        --------------    -----         
 0   store                         1017209 non-null  int64         
 1   day_of_week                   1017209 non-null  int64         
 2   date                          1017209 non-null  datetime64[ns]
 3   sales                         1017209 non-null  int64         
 4   customers                     1017209 non-null  int64         
 5   open                          1017209 non-null  int64         
 6   promo                         1017209 non-null  int64         
 7   state_holiday                 1017209 non-null  object        
 8   school_holiday                1017209 non-null  int64         
 9   store_type                    1017209 non-null  object        
 10  assortment                    1017209 non-null  object        
 11

## 2.0 - FEATURE ENGINEERING

In [21]:
# Start the feature-engineering stage from an independent copy of df1.
# Plain assignment would only bind a second name to the same object, so any
# change made through df2 would also change df1.
df2 = df1.copy()

## 3.0 - FILTRAGEM DE VARIÁVEIS

## 4.0 - ANÁLISE EXPLORATÓRIA DE DADOS (EDA)

## 5.0 - PREPARAÇÃO DOS DADOS

## 6.0 - SELEÇÃO DAS VARIÁVEIS COM ALGORITMO

## 7.0 - MODELO DE MACHINE LEARNING

## 8.0 - AJUSTE DE HIPERPARÂMETROS

## 9.0 - INTERPRETAÇÃO E TRADUÇÃO DO ERRO

## 10.0 - DEPLOY DO MODELO EM PRODUÇÃO