## 0.0 - IMPORTS

In [1]:
import pandas as pd
import numpy as np
import inflection

### 0.1 - Helper Functions

### 0.2 - Loading Data

In [2]:
df_train_raw = pd.read_csv('data/train.csv', low_memory=False)
df_store_raw = pd.read_csv('data/store.csv', low_memory=False)

# Merge dos dois Dataframes acima
df_raw = pd.merge(df_store_raw, df_train_raw, how='left', on='Store')

In [3]:
df_raw.sample()

Unnamed: 0,Store,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday
399859,439,a,a,1350.0,9.0,2009.0,0,,,,5,2014-03-07,8180,1012,1,1,0,0


## 1.0 - DATA DESCRIPTION

### 1.1 - Rename Columns

In [4]:
df1 = df_raw

In [5]:
cols_old = df1.columns

snakecase = lambda x : inflection.underscore(x)

cols_new = list(map(snakecase, cols_old))

# Rename
df1.columns = cols_new

In [6]:
df1.columns

Index(['store', 'store_type', 'assortment', 'competition_distance',
       'competition_open_since_month', 'competition_open_since_year', 'promo2',
       'promo2_since_week', 'promo2_since_year', 'promo_interval',
       'day_of_week', 'date', 'sales', 'customers', 'open', 'promo',
       'state_holiday', 'school_holiday'],
      dtype='object')

### 1.2 - Data Dimensions

In [7]:
print(f'Number of rows: {df1.shape[0]}')
print(f'Number of columns: {df1.shape[1]}') 

Number of rows: 1017209
Number of columns: 18


### 1.3 - Data Types

In [8]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1017209 entries, 0 to 1017208
Data columns (total 18 columns):
 #   Column                        Non-Null Count    Dtype  
---  ------                        --------------    -----  
 0   store                         1017209 non-null  int64  
 1   store_type                    1017209 non-null  object 
 2   assortment                    1017209 non-null  object 
 3   competition_distance          1014567 non-null  float64
 4   competition_open_since_month  693861 non-null   float64
 5   competition_open_since_year   693861 non-null   float64
 6   promo2                        1017209 non-null  int64  
 7   promo2_since_week             509178 non-null   float64
 8   promo2_since_year             509178 non-null   float64
 9   promo_interval                509178 non-null   object 
 10  day_of_week                   1017209 non-null  int64  
 11  date                          1017209 non-null  object 
 12  sales                       

In [9]:
df1.date = pd.to_datetime(df1.date) 

In [10]:
df1.dtypes

store                                    int64
store_type                              object
assortment                              object
competition_distance                   float64
competition_open_since_month           float64
competition_open_since_year            float64
promo2                                   int64
promo2_since_week                      float64
promo2_since_year                      float64
promo_interval                          object
day_of_week                              int64
date                            datetime64[ns]
sales                                    int64
customers                                int64
open                                     int64
promo                                    int64
state_holiday                           object
school_holiday                           int64
dtype: object

### 1.4 - Check NA

In [11]:
df1.isna().sum()

store                                0
store_type                           0
assortment                           0
competition_distance              2642
competition_open_since_month    323348
competition_open_since_year     323348
promo2                               0
promo2_since_week               508031
promo2_since_year               508031
promo_interval                  508031
day_of_week                          0
date                                 0
sales                                0
customers                            0
open                                 0
promo                                0
state_holiday                        0
school_holiday                       0
dtype: int64

### 1.5 - Fillout NA

In [12]:
#competition_distance 
# Tentando visualizar o problema na perspectiva do negócio, o fato desta coluna estar com NA, pode indicar que não
# exista competidores próximos. Talvez se aplicarmos uma distância acima da máxima podemos indicar a inexistência 
# de competidores. 
df1.competition_distance.fillna(100000., inplace=True)

#competition_open_since_month 
df1.competition_open_since_month.fillna(df1.date.dt.month, inplace=True)

#competition_open_since_year
df1.competition_open_since_year.fillna(df1.date.dt.year, inplace=True)

#promo2_since_week
df1.promo2_since_week.fillna(df1.date.dt.week, inplace=True)

#promo2_since_year
df1.promo2_since_year.fillna(df1.date.dt.year, inplace=True)

#promo_interval   
month_map = {1: 'Jan',  2: 'Fev',  3: 'Mar',  4: 'Apr',  5: 'May',  6: 'Jun',  7: 'Jul',  8: 'Aug',  9: 'Sep',  10: 'Oct', 11: 'Nov', 12: 'Dec'}

df1['promo_interval'].fillna(0, inplace=True )

df1['month_map'] = df1['date'].dt.month.map( month_map )

df1['is_promo'] = df1[['promo_interval', 'month_map']].apply( lambda x: 0 if x['promo_interval'] == 0 else 1 if x['month_map'] in x['promo_interval'].split( ',' ) else 0, axis=1 )


  df1.promo2_since_week.fillna(df1.date.dt.week, inplace=True)


In [15]:
df1.sample(2).T

Unnamed: 0,100035,165834
store,110,184
store_type,a,d
assortment,c,c
competition_distance,46590,3560
competition_open_since_month,4,5
competition_open_since_year,2013,2015
promo2,0,0
promo2_since_week,14,21
promo2_since_year,2014,2015
promo_interval,0,0


### 1.6 - Change Types

### 1.7 - Descriptive Statistical

### 1.7.1 - Numerical Attibutes

### 1.7.2 - Categorical Attibutes