In [118]:
# Standard library
import os

#Computação científica
import numpy as np

#análise de dados
import pandas as pd

#machine learning
import sklearn

# feature engineering
from sklearn.impute import SimpleImputer
from feature_engine.imputation import (
    AddMissingIndicator,
    MeanMedianImputer,
    CategoricalImputer)
from feature_engine.transformation import YeoJohnsonTransformer
from feature_engine.encoding import  RareLabelEncoder, OrdinalEncoder

In [119]:
# Configure scikit-learn once: render estimators as diagrams and make
# transformers return pandas DataFrames instead of NumPy arrays.
sklearn.set_config(
    display='diagram',
    transform_output="pandas",
)

# Introdução

## Motivação


## Objetivos


# Download Dataset

In [120]:
# The dataset is available at the following URL:
# https://www.kaggle.com/competitions/porto-seguro-safe-driver-prediction/data

# The dataset directory can be overridden with the PORTO_SEGURO_DATA_DIR
# environment variable so the notebook is not tied to a single machine.
# The original location stays as the default for backward compatibility.
path = os.environ.get(
    'PORTO_SEGURO_DATA_DIR',
    '/home/rodolfo/Insync/rodolfopcruz2@gmail.com/Google Drive/Estudo/Python_Projects/datasets/porto-seguro-safe-driver-prediction/',
)
# Joining with '' guarantees a trailing separator, so any code that builds
# file names with `path + 'file.csv'` keeps working with an overridden value.
path = os.path.join(path, '')

train = pd.read_csv(os.path.join(path, 'train.csv'))
test = pd.read_csv(os.path.join(path, 'test.csv'))

# Feature Engineering

- Missing Values
- Distribuição não normal
- Remoção de labels raras nas variáveis categóricas
- Encoding
- Normalização

In [121]:
# Group the columns by the naming convention of the dataset:
# names containing 'cat' are categorical, names containing 'bin' are binary,
# and everything else (except the 'target' and 'id' columns) is numeric.
cat_features = [col for col in train.columns if 'cat' in col]
bin_features = [col for col in train.columns if 'bin' in col]
num_features = [
    col
    for col in train.columns
    if not ('cat' in col or 'bin' in col) and col not in ('target', 'id')
]


## Missing Values

In [122]:
# Missing values are encoded as -1 in the raw data.
# Swap -1 for NaN so that missing values are easy to identify.
train, test = (
    dataset.replace(to_replace=-1, value=np.nan)
    for dataset in (train, test)
)

In [123]:
# Separate the input data from the output data.

# Expected response. The original note describes it as whether or not the
# customer requested insurance — TODO confirm against the dataset description.
y_train = train['target']
train = train.drop('target', axis=1)

### Features Binárias

- Nas colunas com baixa proporção de missing values eles serão substituídos pelo valor mais comum;
- Nas colunas com elevada proporção de missing values eles serão substituídos por uma string para identificar o valor ausente;


In [124]:
# Data types of the binary features.
train.loc[:, bin_features].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 595212 entries, 0 to 595211
Data columns (total 17 columns):
 #   Column          Non-Null Count   Dtype
---  ------          --------------   -----
 0   ps_ind_06_bin   595212 non-null  int64
 1   ps_ind_07_bin   595212 non-null  int64
 2   ps_ind_08_bin   595212 non-null  int64
 3   ps_ind_09_bin   595212 non-null  int64
 4   ps_ind_10_bin   595212 non-null  int64
 5   ps_ind_11_bin   595212 non-null  int64
 6   ps_ind_12_bin   595212 non-null  int64
 7   ps_ind_13_bin   595212 non-null  int64
 8   ps_ind_16_bin   595212 non-null  int64
 9   ps_ind_17_bin   595212 non-null  int64
 10  ps_ind_18_bin   595212 non-null  int64
 11  ps_calc_15_bin  595212 non-null  int64
 12  ps_calc_16_bin  595212 non-null  int64
 13  ps_calc_17_bin  595212 non-null  int64
 14  ps_calc_18_bin  595212 non-null  int64
 15  ps_calc_19_bin  595212 non-null  int64
 16  ps_calc_20_bin  595212 non-null  int64
dtypes: int64(17)
memory usage: 77.2 MB


In [125]:
# Convert the binary features to the 'object' dtype in both datasets.
for _frame in (train, test):
    _frame[bin_features] = _frame[bin_features].astype('object')

# Show the resulting dtypes on the training data.
train[bin_features].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 595212 entries, 0 to 595211
Data columns (total 17 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   ps_ind_06_bin   595212 non-null  object
 1   ps_ind_07_bin   595212 non-null  object
 2   ps_ind_08_bin   595212 non-null  object
 3   ps_ind_09_bin   595212 non-null  object
 4   ps_ind_10_bin   595212 non-null  object
 5   ps_ind_11_bin   595212 non-null  object
 6   ps_ind_12_bin   595212 non-null  object
 7   ps_ind_13_bin   595212 non-null  object
 8   ps_ind_16_bin   595212 non-null  object
 9   ps_ind_17_bin   595212 non-null  object
 10  ps_ind_18_bin   595212 non-null  object
 11  ps_calc_15_bin  595212 non-null  object
 12  ps_calc_16_bin  595212 non-null  object
 13  ps_calc_17_bin  595212 non-null  object
 14  ps_calc_18_bin  595212 non-null  object
 15  ps_calc_19_bin  595212 non-null  object
 16  ps_calc_20_bin  595212 non-null  object
dtypes: object(17)
memory usage: 7

In [126]:
# Count the missing values in each binary feature.
train[bin_features].isna().sum()

ps_ind_06_bin     0
ps_ind_07_bin     0
ps_ind_08_bin     0
ps_ind_09_bin     0
ps_ind_10_bin     0
ps_ind_11_bin     0
ps_ind_12_bin     0
ps_ind_13_bin     0
ps_ind_16_bin     0
ps_ind_17_bin     0
ps_ind_18_bin     0
ps_calc_15_bin    0
ps_calc_16_bin    0
ps_calc_17_bin    0
ps_calc_18_bin    0
ps_calc_19_bin    0
ps_calc_20_bin    0
dtype: int64

In [127]:
# Missing values in the binary features are replaced by the most frequent
# value of each column.
bin_imputer_most_frequent = SimpleImputer(
    strategy='most_frequent',
)
# Fit on the training data only.
bin_imputer_most_frequent.fit(train.loc[:, bin_features])


In [128]:
# Apply the fitted imputer to both the training and the test data.
for _frame in (train, test):
    _frame[bin_features] = bin_imputer_most_frequent.transform(_frame[bin_features])

In [129]:
# Data types after the transformation.
train.loc[:, bin_features].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 595212 entries, 0 to 595211
Data columns (total 17 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   ps_ind_06_bin   595212 non-null  object
 1   ps_ind_07_bin   595212 non-null  object
 2   ps_ind_08_bin   595212 non-null  object
 3   ps_ind_09_bin   595212 non-null  object
 4   ps_ind_10_bin   595212 non-null  object
 5   ps_ind_11_bin   595212 non-null  object
 6   ps_ind_12_bin   595212 non-null  object
 7   ps_ind_13_bin   595212 non-null  object
 8   ps_ind_16_bin   595212 non-null  object
 9   ps_ind_17_bin   595212 non-null  object
 10  ps_ind_18_bin   595212 non-null  object
 11  ps_calc_15_bin  595212 non-null  object
 12  ps_calc_16_bin  595212 non-null  object
 13  ps_calc_17_bin  595212 non-null  object
 14  ps_calc_18_bin  595212 non-null  object
 15  ps_calc_19_bin  595212 non-null  object
 16  ps_calc_20_bin  595212 non-null  object
dtypes: object(17)
memory usage: 7

In [130]:
# Parameters and fitted attributes that the imputer will use.
vars(bin_imputer_most_frequent)

{'missing_values': nan,
 'add_indicator': False,
 'keep_empty_features': False,
 'strategy': 'most_frequent',
 'fill_value': None,
 'copy': True,
 'feature_names_in_': array(['ps_ind_06_bin', 'ps_ind_07_bin', 'ps_ind_08_bin', 'ps_ind_09_bin',
        'ps_ind_10_bin', 'ps_ind_11_bin', 'ps_ind_12_bin', 'ps_ind_13_bin',
        'ps_ind_16_bin', 'ps_ind_17_bin', 'ps_ind_18_bin',
        'ps_calc_15_bin', 'ps_calc_16_bin', 'ps_calc_17_bin',
        'ps_calc_18_bin', 'ps_calc_19_bin', 'ps_calc_20_bin'], dtype=object),
 'n_features_in_': 17,
 '_fit_dtype': dtype('O'),
 'indicator_': None,
 'statistics_': array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0], dtype=object)}

### Features Categóricas

Duas estratégias para substituição dos missing values:

1) Features com proporção de ausentes inferior ao threshold estipulado:

        Substituição por valor mais comum

2) Features com proporção de ausentes superior ao threshold estipulado:
        
        Substituição dos valores ausentes pela string missing

In [131]:
# Display the list of categorical feature names.
cat_features

['ps_ind_02_cat',
 'ps_ind_04_cat',
 'ps_ind_05_cat',
 'ps_car_01_cat',
 'ps_car_02_cat',
 'ps_car_03_cat',
 'ps_car_04_cat',
 'ps_car_05_cat',
 'ps_car_06_cat',
 'ps_car_07_cat',
 'ps_car_08_cat',
 'ps_car_09_cat',
 'ps_car_10_cat',
 'ps_car_11_cat']

In [132]:
# Proportion of missing values per categorical feature, largest first.
(
    train[cat_features]
    .isna()
    .mean()
    .sort_values(ascending=False)
)


ps_car_03_cat    0.690898
ps_car_05_cat    0.447825
ps_car_07_cat    0.019302
ps_ind_05_cat    0.009760
ps_car_09_cat    0.000956
ps_ind_02_cat    0.000363
ps_car_01_cat    0.000180
ps_ind_04_cat    0.000139
ps_car_02_cat    0.000008
ps_car_04_cat    0.000000
ps_car_06_cat    0.000000
ps_car_08_cat    0.000000
ps_car_10_cat    0.000000
ps_car_11_cat    0.000000
dtype: float64

In [133]:
# Data types of the categorical features.
train.loc[:, cat_features].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 595212 entries, 0 to 595211
Data columns (total 14 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   ps_ind_02_cat  594996 non-null  float64
 1   ps_ind_04_cat  595129 non-null  float64
 2   ps_ind_05_cat  589403 non-null  float64
 3   ps_car_01_cat  595105 non-null  float64
 4   ps_car_02_cat  595207 non-null  float64
 5   ps_car_03_cat  183981 non-null  float64
 6   ps_car_04_cat  595212 non-null  int64  
 7   ps_car_05_cat  328661 non-null  float64
 8   ps_car_06_cat  595212 non-null  int64  
 9   ps_car_07_cat  583723 non-null  float64
 10  ps_car_08_cat  595212 non-null  int64  
 11  ps_car_09_cat  594643 non-null  float64
 12  ps_car_10_cat  595212 non-null  int64  
 13  ps_car_11_cat  595212 non-null  int64  
dtypes: float64(9), int64(5)
memory usage: 63.6 MB


In [134]:
# Convert the categorical features to the 'object' dtype in both datasets.
for _frame in (train, test):
    _frame[cat_features] = _frame[cat_features].astype('object')

In [135]:
missing_threshold = 1/100

# Proportion of missing values per categorical feature, computed once on
# the training data and reused for both groups below.
cat_missing_ratio = train[cat_features].isna().mean()

# Categorical features whose missing proportion is above the threshold.
cat_features_muitos_na = cat_missing_ratio[cat_missing_ratio > missing_threshold].index.to_list()

# Categorical features whose missing proportion is at or below the threshold.
# The comparison is inclusive so that a feature sitting exactly on the
# threshold is not left out of both groups (and therefore never imputed).
cat_features_poucos_na = cat_missing_ratio[cat_missing_ratio <= missing_threshold].index.to_list()

In [136]:
# Imputer for the categorical features with a high proportion of missing
# values: they are filled with the constant string 'missing'.
cat_imputer_muitos_na = SimpleImputer(
    strategy='constant',
    fill_value='missing',
)
cat_imputer_muitos_na.fit(train.loc[:, cat_features_muitos_na])

In [137]:
# Imputer for the categorical features with a low proportion of missing
# values: they are filled with the most frequent value of each column.
cat_imputer_poucos_na = SimpleImputer(
    strategy='most_frequent',
)
cat_imputer_poucos_na.fit(train.loc[:, cat_features_poucos_na])

In [138]:
# Dimensions (rows, columns) of the test data.
test.shape

(892816, 58)

In [139]:
# Display the fitted imputer used for the features with many missing values.
cat_imputer_muitos_na

In [140]:
# Apply both fitted categorical imputers to the training and the test data.
for _frame in (train, test):
    # Features with many missing values: filled with the constant string.
    _frame[cat_features_muitos_na] = cat_imputer_muitos_na.transform(_frame[cat_features_muitos_na])
    # Features with few missing values: filled with the most frequent value.
    _frame[cat_features_poucos_na] = cat_imputer_poucos_na.transform(_frame[cat_features_poucos_na])

# Proportion of missing values left in the training data after imputation.
train[cat_features].isna().mean()

ps_ind_02_cat    0.0
ps_ind_04_cat    0.0
ps_ind_05_cat    0.0
ps_car_01_cat    0.0
ps_car_02_cat    0.0
ps_car_03_cat    0.0
ps_car_04_cat    0.0
ps_car_05_cat    0.0
ps_car_06_cat    0.0
ps_car_07_cat    0.0
ps_car_08_cat    0.0
ps_car_09_cat    0.0
ps_car_10_cat    0.0
ps_car_11_cat    0.0
dtype: float64

In [141]:
# Parameters and fitted attributes that the imputer will use.
vars(cat_imputer_muitos_na)

{'missing_values': nan,
 'add_indicator': False,
 'keep_empty_features': False,
 'strategy': 'constant',
 'fill_value': 'missing',
 'copy': True,
 'feature_names_in_': array(['ps_car_03_cat', 'ps_car_05_cat', 'ps_car_07_cat'], dtype=object),
 'n_features_in_': 3,
 '_fit_dtype': dtype('O'),
 'indicator_': None,
 'statistics_': array(['missing', 'missing', 'missing'], dtype=object)}

In [142]:
# Parameters and fitted attributes that the imputer will use.
vars(cat_imputer_poucos_na)

{'missing_values': nan,
 'add_indicator': False,
 'keep_empty_features': False,
 'strategy': 'most_frequent',
 'fill_value': None,
 'copy': True,
 'feature_names_in_': array(['ps_ind_02_cat', 'ps_ind_04_cat', 'ps_ind_05_cat', 'ps_car_01_cat',
        'ps_car_02_cat', 'ps_car_04_cat', 'ps_car_06_cat', 'ps_car_08_cat',
        'ps_car_09_cat', 'ps_car_10_cat', 'ps_car_11_cat'], dtype=object),
 'n_features_in_': 11,
 '_fit_dtype': dtype('O'),
 'indicator_': None,
 'statistics_': array([1.0, 0.0, 0.0, 11.0, 1.0, 0, 11, 1, 2.0, 1, 104], dtype=object)}

### Features Numéricas

In [144]:
# Proportion of missing values in each numeric feature.
train.loc[:, num_features].isna().mean()

ps_ind_01     0.000000
ps_ind_03     0.000000
ps_ind_14     0.000000
ps_ind_15     0.000000
ps_reg_01     0.000000
ps_reg_02     0.000000
ps_reg_03     0.181065
ps_car_11     0.000008
ps_car_12     0.000002
ps_car_13     0.000000
ps_car_14     0.071605
ps_car_15     0.000000
ps_calc_01    0.000000
ps_calc_02    0.000000
ps_calc_03    0.000000
ps_calc_04    0.000000
ps_calc_05    0.000000
ps_calc_06    0.000000
ps_calc_07    0.000000
ps_calc_08    0.000000
ps_calc_09    0.000000
ps_calc_10    0.000000
ps_calc_11    0.000000
ps_calc_12    0.000000
ps_calc_13    0.000000
ps_calc_14    0.000000
dtype: float64

In [145]:
missing_threshold = 1/100

# Proportion of missing values per numeric feature, computed once on the
# training data and reused for both groups below.
num_missing_ratio = train[num_features].isna().mean()

# Numeric features whose missing proportion is above the threshold.
num_features_muitos_na = num_missing_ratio[num_missing_ratio > missing_threshold].index.to_list()

# Numeric features whose missing proportion is at or below the threshold.
# The comparison is inclusive so that a feature sitting exactly on the
# threshold still belongs to one of the two groups.
num_features_poucos_na = num_missing_ratio[num_missing_ratio <= missing_threshold].index.to_list()

In [146]:
# Missing-value indicator for the numeric features with many missing values.
missing_indicator = AddMissingIndicator(
    variables=num_features_muitos_na,
)
# Fit on the training data only.
missing_indicator.fit(train)



In [147]:
# Apply the fitted missing-value indicator to the training and test data.
train, test = (
    missing_indicator.transform(dataset)
    for dataset in (train, test)
)

In [149]:
# Every missing value in the numeric features is replaced by the column mean.
num_imputer = SimpleImputer(strategy='mean')
# Fit on the training data only.
num_imputer.fit(train.loc[:, num_features])

In [150]:
# Parameters and fitted attributes that the imputer will use.
vars(num_imputer)

{'missing_values': nan,
 'add_indicator': False,
 'keep_empty_features': False,
 'strategy': 'mean',
 'fill_value': None,
 'copy': True,
 'feature_names_in_': array(['ps_ind_01', 'ps_ind_03', 'ps_ind_14', 'ps_ind_15', 'ps_reg_01',
        'ps_reg_02', 'ps_reg_03', 'ps_car_11', 'ps_car_12', 'ps_car_13',
        'ps_car_14', 'ps_car_15', 'ps_calc_01', 'ps_calc_02', 'ps_calc_03',
        'ps_calc_04', 'ps_calc_05', 'ps_calc_06', 'ps_calc_07',
        'ps_calc_08', 'ps_calc_09', 'ps_calc_10', 'ps_calc_11',
        'ps_calc_12', 'ps_calc_13', 'ps_calc_14'], dtype=object),
 'n_features_in_': 26,
 '_fit_dtype': dtype('float64'),
 'indicator_': None,
 'statistics_': array([1.90037835, 4.42331808, 0.01245103, 7.29992171, 0.61099138,
        0.43918436, 0.89404733, 2.34609976, 0.37994713, 0.81326468,
        0.37469064, 3.06589944, 0.44975639, 0.44958922, 0.44984879,
        2.37208087, 1.88588604, 7.68944511, 3.00582314, 9.22590438,
        2.33903382, 8.43359005, 5.44138223, 1.44191817, 2.8722

In [151]:
# Apply the fitted numeric imputer to the training and the test data.
for _frame in (train, test):
    _frame[num_features] = num_imputer.transform(_frame[num_features])

# Proportion of missing values left in the training data after imputation.
train[num_features].isna().mean()

ps_ind_01     0.0
ps_ind_03     0.0
ps_ind_14     0.0
ps_ind_15     0.0
ps_reg_01     0.0
ps_reg_02     0.0
ps_reg_03     0.0
ps_car_11     0.0
ps_car_12     0.0
ps_car_13     0.0
ps_car_14     0.0
ps_car_15     0.0
ps_calc_01    0.0
ps_calc_02    0.0
ps_calc_03    0.0
ps_calc_04    0.0
ps_calc_05    0.0
ps_calc_06    0.0
ps_calc_07    0.0
ps_calc_08    0.0
ps_calc_09    0.0
ps_calc_10    0.0
ps_calc_11    0.0
ps_calc_12    0.0
ps_calc_13    0.0
ps_calc_14    0.0
dtype: float64

## Transformações

- Aplicação da transformação de Yeo-Johnson às seguintes variáveis:

    - ps_reg_03
    - ps_car_12
    - ps_car_13
    - ps_car_14
    - ps_car_15
    - ps_reg_02


In [153]:
# Continuous numeric features to transform; per the original note they were
# selected during exploratory analysis.
continuous_numerical_variables = [
    'ps_reg_03',
    'ps_car_12',
    'ps_car_13',
    'ps_car_14',
    'ps_car_15',
    'ps_reg_02',
]
yeo_transformer = YeoJohnsonTransformer(
    variables=continuous_numerical_variables,
)

In [154]:
# Fit on the training data only, then transform both datasets with the
# fitted transformer.
yeo_transformer.fit(train)
train, test = (
    yeo_transformer.transform(dataset)
    for dataset in (train, test)
)

In [155]:
# Parameters that will be used by the transformer.
yeo_transformer.lambda_dict_

{'ps_reg_03': np.float64(-1.5645071507026156),
 'ps_car_12': np.float64(-5.9072831878455165),
 'ps_car_13': np.float64(-2.5269376303019384),
 'ps_car_14': np.float64(-3.162784028225431),
 'ps_car_15': np.float64(4.086997260528258),
 'ps_reg_02': np.float64(-1.3900957069229347)}

## Labels Raras

Agrupar labels que aparecem em pequena proporção em cada feature categórica

In [158]:
# Every label whose proportion does not exceed rare_threshold is meant to be
# grouped into a single label.
rare_threshold = 1/100

# Identify the categorical features that contain at least one rare label.
rare_labels = []
for feature in cat_features:
    # Share of training rows taken by each label of this feature.
    label_share = train[feature].value_counts() / len(train)
    if (label_share <= rare_threshold).any():
        rare_labels.append(feature)

In [159]:
# Display the categorical features that contain rare labels.
rare_labels

['ps_ind_05_cat',
 'ps_car_01_cat',
 'ps_car_04_cat',
 'ps_car_06_cat',
 'ps_car_09_cat',
 'ps_car_10_cat',
 'ps_car_11_cat']

In [160]:
# Encoder for the features listed in rare_labels, using rare_threshold as
# the tolerance.
rare_encoder = RareLabelEncoder(
    tol=rare_threshold,
    n_categories=1,
    variables=rare_labels,
)

# Fit on the training data only.
rare_encoder.fit(train)



In [161]:
#Transformar dados de treino e de teste
train=rare_encoder.transform(train)
test=rare_encoder.transform(test)

In [162]:
# Data types of the categorical features after rare-label encoding.
train.loc[:, cat_features].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 595212 entries, 0 to 595211
Data columns (total 14 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   ps_ind_02_cat  595212 non-null  object
 1   ps_ind_04_cat  595212 non-null  object
 2   ps_ind_05_cat  595212 non-null  object
 3   ps_car_01_cat  595212 non-null  object
 4   ps_car_02_cat  595212 non-null  object
 5   ps_car_03_cat  595212 non-null  object
 6   ps_car_04_cat  595212 non-null  object
 7   ps_car_05_cat  595212 non-null  object
 8   ps_car_06_cat  595212 non-null  object
 9   ps_car_07_cat  595212 non-null  object
 10  ps_car_08_cat  595212 non-null  object
 11  ps_car_09_cat  595212 non-null  object
 12  ps_car_10_cat  595212 non-null  object
 13  ps_car_11_cat  595212 non-null  object
dtypes: object(14)
memory usage: 63.6+ MB
