# Etapa 3 - Feature Engineering


In [1]:
# Importando as bibliotecas
import pandas as pd
import seaborn as sns
# Configurando o matplotlib
%matplotlib inline                              
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
import numpy as np

In [2]:
# Load the sales dataset from the CSV file
df = pd.read_csv(r'../data/sales_db_02.csv')

In [3]:
# Inspect the dataframe's columns, dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98795 entries, 0 to 98794
Data columns (total 5 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   sale_date              98795 non-null  object 
 1   product_category_name  98795 non-null  object 
 2   order_units            98795 non-null  int64  
 3   unit_price             98795 non-null  float64
 4   customer_state         98795 non-null  object 
dtypes: float64(1), int64(1), object(3)
memory usage: 3.8+ MB


In [4]:
# Preview the first two rows
df.head(2)

Unnamed: 0,sale_date,product_category_name,order_units,unit_price,customer_state
0,2017-10-02,utilidades_domesticas,1,29.99,SP
1,2018-07-24,perfumaria,1,118.7,BA


In [5]:
# Keep only the first 10 characters of each sale_date string (the date part,
# e.g. '2017-10-02'), dropping any time-of-day suffix.
# The vectorised `.str` slice replaces a row-by-row Python loop and leaves
# missing values as NaN instead of raising on them.
df['sale_date'] = df['sale_date'].str[:10]

In [6]:
# Convert the sale_date column to datetime
df['sale_date'] = pd.to_datetime(df['sale_date'])

In [7]:
# Aggregation by date / state / category is currently disabled (the statement
# is kept commented out below), so the rows are left unaggregated.
# df = df.groupby(by=['sale_date', 'customer_state', 'product_category_name'], as_index = False).agg({'order_units' : 'sum', 'unit_price' : 'mean'})
# NOTE(review): with the aggregation disabled, 'mean_unit_price' holds the
# original per-row unit_price rather than a mean -- confirm the name is intended.
df = df.rename(columns={'unit_price':'mean_unit_price'})

# Preview the first five rows
df.head(5)

Unnamed: 0,sale_date,product_category_name,order_units,mean_unit_price,customer_state
0,2017-10-02,utilidades_domesticas,1,29.99,SP
1,2018-07-24,perfumaria,1,118.7,BA
2,2018-08-08,automotivo,1,159.9,GO
3,2017-11-18,pet_shop,1,45.0,RN
4,2018-02-13,papelaria,1,19.9,SP


In [8]:
# Count the distinct product categories
print(r'Número de categorias: {}'.format(df['product_category_name'].nunique()))

Número de categorias: 73


- Existem 73 categorias diferentes de produtos na base, e não é adequado carregar todas essas informações no modelo, já que isso aumentaria muito a dimensão da base de dados. Por isso, foram selecionadas as categorias que representam 80% dos registros de venda para serem mantidas, e as categorias restantes serão agrupadas em um único grupo.

In [9]:
# Number of rows per product category, most frequent first
df.product_category_name.value_counts()

product_category_name
cama_mesa_banho                                  10009
beleza_saude                                      8830
esporte_lazer                                     7666
informatica_acessorios                            6724
moveis_decoracao                                  6636
                                                 ...  
portateis_cozinha_e_preparadores_de_alimentos       13
cds_dvds_musicais                                   12
pc_gamer                                             7
fashion_roupa_infanto_juvenil                        7
seguros_e_servicos                                   2
Name: count, Length: 73, dtype: int64

In [10]:
# Select the most frequent categories that together cover 80% of the sale records.
# A single value_counts() pass replaces re-filtering the whole dataframe several
# times per category.
category_counts = df['product_category_name'].value_counts()   # most frequent first
categories = list(category_counts.index)

if category_counts.empty:
    raise ValueError("product_category_name has no values; cannot select the top categories")

# Share of all rows already covered *before* each category is added. A category is
# kept while that running share is still <= 0.8, so the category that crosses the
# 80% mark is included as well.
share_before = (category_counts.cumsum() - category_counts) / df.shape[0]
top_counts = category_counts[share_before <= 0.8]

# [category, number of records] pairs for the selected categories
cat_80_perc = [[cat, int(count)] for cat, count in top_counts.items()]

# Final cumulative share covered by the selected categories
j = float(top_counts.sum() / df.shape[0])

# Record count of the least frequent selected category; used later as the
# encoder's min_frequency threshold
min_freq = int(top_counts.iloc[-1])

In [11]:
# Display the selected categories with their record counts
cat_80_perc

[['cama_mesa_banho', 10009],
 ['beleza_saude', 8830],
 ['esporte_lazer', 7666],
 ['informatica_acessorios', 6724],
 ['moveis_decoracao', 6636],
 ['utilidades_domesticas', 5878],
 ['relogios_presentes', 5670],
 ['telefonia', 4178],
 ['automotivo', 3902],
 ['brinquedos', 3896],
 ['cool_stuff', 3584],
 ['ferramentas_jardim', 3537],
 ['perfumaria', 3165],
 ['bebes', 2856],
 ['eletronicos', 2532]]

- Para inserir a informação de unidade federativa (estado brasileiro) no modelo, é necessário transformar a coluna em formato numérico. Para isso, será aplicada a técnica de codificação frequencial na coluna de estado, que consiste em substituir a sigla do estado pela frequência relacionada à variável target — aqui, a participação do estado no total de unidades vendidas (`order_units`). Essa abordagem é relevante para capturar a correlação entre a feature e a variável target.

In [12]:
# Frequency-encode the state column: each state code is replaced by that
# state's share of the total order_units in the dataframe.
# NOTE(review): the shares are computed over the full dataframe, before the
# date-based split performed at the end of this notebook -- confirm that this
# leakage into the held-out rows is acceptable.

# Share of total order_units per state (one row per state)
df_freq = (df[['customer_state', 'order_units']].groupby('customer_state').sum()/df['order_units'].sum()).reset_index()


def cod_freq(uf):
    """Return the order_units share stored in ``df_freq`` for state ``uf``.

    Returns NaN when ``uf`` does not appear in ``df_freq``.
    """
    return df_freq.loc[df_freq['customer_state'] == uf, 'order_units'].max()


# Look up every row's share with one vectorised map instead of filtering
# df_freq once per row; states missing from df_freq map to NaN, as cod_freq does.
df['state_freq'] = df['customer_state'].map(df_freq.set_index('customer_state')['order_units'])

# Drop the original state column
df = df.drop('customer_state', axis = 1)

In [13]:
# One-hot encode the product category. min_frequency=min_freq makes the encoder
# pool categories seen fewer than min_freq times into a single "infrequent"
# column; handle_unknown='ignore' encodes categories unseen during fit as all zeros.
ohe = OneHotEncoder(handle_unknown='ignore', min_frequency=min_freq)
ohe = ohe.fit(df[['product_category_name']])

In [14]:
# Turn the encoded matrix into a dataframe, one column per encoder feature.
# The frame reuses df's index so that a later column-wise concat with df lines
# the rows up by label even when df's index is not the default 0..n-1.
ohe_df = pd.DataFrame(
    ohe.transform(df[['product_category_name']]).toarray(),
    columns=ohe.get_feature_names_out(),
    index=df.index,
)
ohe_df.head(2)

Unnamed: 0,product_category_name_automotivo,product_category_name_bebes,product_category_name_beleza_saude,product_category_name_brinquedos,product_category_name_cama_mesa_banho,product_category_name_cool_stuff,product_category_name_eletronicos,product_category_name_esporte_lazer,product_category_name_ferramentas_jardim,product_category_name_informatica_acessorios,product_category_name_moveis_decoracao,product_category_name_perfumaria,product_category_name_relogios_presentes,product_category_name_telefonia,product_category_name_utilidades_domesticas,product_category_name_infrequent_sklearn
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [15]:
# Append the one-hot columns to the right of df, then discard the raw
# category column they were derived from
df = pd.concat([df, ohe_df], axis=1).drop(columns=['product_category_name'])

In [16]:
# Inspect the dataframe's columns, dtypes and non-null counts after encoding
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98795 entries, 0 to 98794
Data columns (total 20 columns):
 #   Column                                        Non-Null Count  Dtype         
---  ------                                        --------------  -----         
 0   sale_date                                     98795 non-null  datetime64[ns]
 1   order_units                                   98795 non-null  int64         
 2   mean_unit_price                               98795 non-null  float64       
 3   state_freq                                    98795 non-null  float64       
 4   product_category_name_automotivo              98795 non-null  float64       
 5   product_category_name_bebes                   98795 non-null  float64       
 6   product_category_name_beleza_saude            98795 non-null  float64       
 7   product_category_name_brinquedos              98795 non-null  float64       
 8   product_category_name_cama_mesa_banho         98795 non-null  floa

In [17]:
# Calendar features derived from the sale date: month number and day of week
df = df.assign(
    month=df['sale_date'].dt.month,
    day_of_week=df['sale_date'].dt.dayofweek,
)

In [18]:
# Preview the first three rows
df.head(3)

Unnamed: 0,sale_date,order_units,mean_unit_price,state_freq,product_category_name_automotivo,product_category_name_bebes,product_category_name_beleza_saude,product_category_name_brinquedos,product_category_name_cama_mesa_banho,product_category_name_cool_stuff,...,product_category_name_ferramentas_jardim,product_category_name_informatica_acessorios,product_category_name_moveis_decoracao,product_category_name_perfumaria,product_category_name_relogios_presentes,product_category_name_telefonia,product_category_name_utilidades_domesticas,product_category_name_infrequent_sklearn,month,day_of_week
0,2017-10-02,1,29.99,0.421719,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,10,0
1,2018-07-24,1,118.7,0.033361,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,7,1
2,2018-08-08,1,159.9,0.020477,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8,2


In [19]:
# Lag features on order_units.
# The rows are not stored in date order, so the shifts are taken over a
# date-sorted view of the column. Assigning the result back aligns on the index,
# which leaves df's own row order untouched.
# NOTE(review): rows are not aggregated per day, so shift(7) means "7 records
# earlier", not "7 days earlier" -- confirm this is the intended feature.
units_by_date = df.sort_values('sale_date', kind='stable')['order_units']

# A shift leaves NaN only at the start of the series, so back-fill is the only
# fill with a value to copy (forward-fill has nothing earlier to propagate).
# Series.bfill() replaces the deprecated fillna(method=...).
df['lag_1'] = units_by_date.shift(1).bfill()
df['lag_7'] = units_by_date.shift(7).bfill()
# Mean of the 7 previous records (the current record is excluded by the shift)
df['rolling_mean_7'] = units_by_date.shift(1).rolling(7).mean().bfill()

In [20]:
# Column reordering -- currently disabled: the whole statement is wrapped in a
# string literal, so this cell changes nothing and only echoes the string.
# NOTE(review): the list looks stale -- it repeats 'month', 'day_of_week' and the
# lag columns, and it names category columns (fashion_bolsas_e_acessorios,
# moveis_escritorio, papelaria, pet_shop) that do not appear among the encoder's
# output columns shown earlier in this notebook. Update it before re-enabling.
'''cols = ['sale_date', 'month', 'day_of_week','lag_1', 'lag_7', 'rolling_mean_7',
         'mean_unit_price', 'state_freq',
        'product_category_name_automotivo', 'product_category_name_bebes',
        'product_category_name_beleza_saude',
        'product_category_name_brinquedos',
        'product_category_name_cama_mesa_banho',
        'product_category_name_cool_stuff', 'product_category_name_eletronicos',
        'product_category_name_esporte_lazer',
        'product_category_name_fashion_bolsas_e_acessorios',
        'product_category_name_ferramentas_jardim',
        'product_category_name_informatica_acessorios',
        'product_category_name_moveis_decoracao',
        'product_category_name_moveis_escritorio',
        'product_category_name_papelaria', 'product_category_name_perfumaria',
        'product_category_name_pet_shop',
        'product_category_name_relogios_presentes',
        'product_category_name_telefonia',
        'product_category_name_utilidades_domesticas',
        'product_category_name_infrequent_sklearn', 'month', 'day_of_week',
        'lag_1', 'lag_7', 'rolling_mean_7',
        'order_units']

df = df[cols]'''

"cols = ['sale_date', 'month', 'day_of_week','lag_1', 'lag_7', 'rolling_mean_7',\n         'mean_unit_price', 'state_freq',\n        'product_category_name_automotivo', 'product_category_name_bebes',\n        'product_category_name_beleza_saude',\n        'product_category_name_brinquedos',\n        'product_category_name_cama_mesa_banho',\n        'product_category_name_cool_stuff', 'product_category_name_eletronicos',\n        'product_category_name_esporte_lazer',\n        'product_category_name_fashion_bolsas_e_acessorios',\n        'product_category_name_ferramentas_jardim',\n        'product_category_name_informatica_acessorios',\n        'product_category_name_moveis_decoracao',\n        'product_category_name_moveis_escritorio',\n        'product_category_name_papelaria', 'product_category_name_perfumaria',\n        'product_category_name_pet_shop',\n        'product_category_name_relogios_presentes',\n        'product_category_name_telefonia',\n        'product_category_name_u

In [21]:
# Inspect the dataframe again after the calendar and lag columns were added
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98795 entries, 0 to 98794
Data columns (total 25 columns):
 #   Column                                        Non-Null Count  Dtype         
---  ------                                        --------------  -----         
 0   sale_date                                     98795 non-null  datetime64[ns]
 1   order_units                                   98795 non-null  int64         
 2   mean_unit_price                               98795 non-null  float64       
 3   state_freq                                    98795 non-null  float64       
 4   product_category_name_automotivo              98795 non-null  float64       
 5   product_category_name_bebes                   98795 non-null  float64       
 6   product_category_name_beleza_saude            98795 non-null  float64       
 7   product_category_name_brinquedos              98795 non-null  float64       
 8   product_category_name_cama_mesa_banho         98795 non-null  floa

In [22]:
# Hold out the most recent sales as unseen data.
# The cutoff is written in ISO format so it cannot be misread: a slash-separated
# literal such as '08/01/2018' is parsed month-first by pandas (1 August 2018),
# which is easy to mistake for 8 January.
SPLIT_DATE = pd.Timestamp('2018-08-01')

# Rows on or after the cutoff; copied so later edits do not act on a slice of df
df_valid = df.loc[df['sale_date'] >= SPLIT_DATE].copy()

# Rows strictly before the cutoff remain in df
df = df.loc[df['sale_date'] < SPLIT_DATE].copy()

In [23]:
# Save both splits as CSV files, without writing the index column
df.to_csv(r'../data/sales_db_03.csv', index = False)
df_valid.to_csv(r'../data/unseen_data.csv', index = False)