In [31]:
# Standard library
import os

#Computação científica
import numpy as np

#análise de dados
import pandas as pd

#visualização
import matplotlib.pyplot as plt

#machine learning
import sklearn

# feature engineering
from sklearn.impute import SimpleImputer
from feature_engine.imputation import (
    AddMissingIndicator)
from feature_engine.transformation import YeoJohnsonTransformer
from feature_engine.encoding import  RareLabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from feature_engine.wrappers import SklearnTransformerWrapper


#Pipelines
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer

In [3]:
# Global scikit-learn options: render estimators as diagrams and make
# transformers return pandas objects.
sklearn.set_config(display='diagram', transform_output="pandas")

# Introdução

## Motivação


## Objetivos


# Download Dataset

In [4]:
# The dataset is available at the following URL:
# https://www.kaggle.com/competitions/porto-seguro-safe-driver-prediction/data

# Directory that holds train.csv and test.csv. Set the PORTO_SEGURO_DATA_DIR
# environment variable to point at a different location; when it is unset the
# original machine-specific directory is used as the fallback.
path = os.environ.get(
    'PORTO_SEGURO_DATA_DIR',
    '/home/rodolfo/Insync/rodolfopcruz2@gmail.com/Google Drive/Estudo/Python_Projects/datasets/porto-seguro-safe-driver-prediction/',
)
# Joining with '' guarantees a trailing separator, so `path` stays a string
# that can still be concatenated directly with a file name.
path = os.path.join(path, '')

train = pd.read_csv(os.path.join(path, 'train.csv'))
test = pd.read_csv(os.path.join(path, 'test.csv'))

# Pipelines

- Serão criadas duas pipelines de dados:

    - Em uma delas serão tratadas todas as colunas; a seleção das features será feita mais à frente;
    - Na outra, a primeira etapa consistirá na remoção de algumas features, de acordo com os resultados obtidos com a correlação.

In [5]:
# Split the training columns by naming convention: names containing 'cat' are
# categorical, names containing 'bin' are binary, and everything else except
# the 'target' and 'id' columns is treated as numeric.
cat_features = [col for col in train.columns if 'cat' in col]
bin_features = [col for col in train.columns if 'bin' in col]

_excluded_from_num = set(cat_features) | set(bin_features) | {'target', 'id'}
num_features = [col for col in train.columns if col not in _excluded_from_num]


## Pipeline com todas as features

In [19]:
# In the data, missing values were replaced by the value -1.
# Helper that swaps -1 back to np.nan.

def impute_nan_missing_value(x):
    """Return a copy of ``x`` in which every -1 entry is replaced by ``np.nan``.

    Parameters
    ----------
    x : pandas object exposing ``.replace`` (e.g. a DataFrame)
        Data in which -1 marks a missing value.

    Returns
    -------
    Same kind of object as ``x`` with -1 turned into NaN; ``x`` itself is not
    modified.
    """
    return x.replace(to_replace=-1, value=np.nan)

# Wrap the helper so it can be used as a pipeline step.
impute_nan_missing_value_transformer = FunctionTransformer(
    func=impute_nan_missing_value
)

In [20]:
# Helper that converts the data type to object.

def converter_object(x):
    """Return ``x`` cast to the ``object`` dtype.

    Parameters
    ----------
    x : pandas object exposing ``.astype`` (e.g. a DataFrame)

    Returns
    -------
    A new object of the same kind as ``x`` whose values are stored with
    dtype ``object``.
    """
    return x.astype(dtype='object')

# Wrap the dtype conversion so it can be used as a pipeline step.
converter_object_transformer = FunctionTransformer(
    func=converter_object
)

### Features Binárias

In [24]:
# Imputer for the binary features, configured with the 'most_frequent' strategy.
imputer_binary=SimpleImputer(strategy='most_frequent')

Pipeline para features binárias:

1) Substituir -1 por np.nan;
2) Converter os valores para object;
3) Substituir valores ausentes pelo mais frequente.

In [25]:
# Binary-feature pipeline: -1 -> NaN, cast to object, then fill the missing
# entries with the most frequent value.
binary_pipeline = Pipeline(
    steps=[
        ('replace_-1_nan', impute_nan_missing_value_transformer),
        ('convert_to_object', converter_object_transformer),
        ('fill_missing_binary', imputer_binary),
    ]
)

In [26]:
# Fit the binary pipeline on the training binary features and keep the
# transformed result.
teste=binary_pipeline.fit_transform(train[bin_features])

### Features Numéricas

In [29]:
# Fraction of missing values above which a numeric feature is considered to
# have "many" missing values.
missing_threshold = 1/100

# Missing values are encoded as -1 in the raw data, so isna() alone reports
# nothing on the untouched CSV. Count both the -1 sentinel and real NaN so the
# split is the same whether or not -1 has already been replaced by NaN.
num_missing_fraction = (
    train[num_features].isna() | train[num_features].eq(-1)
).mean()

# Numeric features with many missing values (fraction above the threshold).
num_features_muitos_na = (
    num_missing_fraction[num_missing_fraction > missing_threshold].index.to_list()
)

# Numeric features with few missing values. `<=` is used so that a feature
# sitting exactly on the threshold is not dropped from both lists.
num_features_poucos_na = (
    num_missing_fraction[num_missing_fraction <= missing_threshold].index.to_list()
)

In [41]:
# Replace missing values with the mean.
num_imputer = SimpleImputer(strategy='mean')

# Create a new column flagging missing values for the numeric columns that
# have many of them.
missing_indicator = AddMissingIndicator(
    variables=num_features_muitos_na
)


In [43]:
# Apply the Yeo-Johnson transformation to the following features.
features_to_be_transformed = [
    'ps_reg_03',
    'ps_car_12',
    'ps_car_13',
    'ps_car_14',
    'ps_car_15',
    'ps_reg_02',
]

yeo_transformer = YeoJohnsonTransformer(
    variables=features_to_be_transformed
)

In [39]:
# Scaling: StandardScaler instance used as the last step of the numeric pipeline.

scaler = StandardScaler()

In [44]:
# Numeric-feature pipeline:
#   1) replace -1 with NaN;
#   2) add missing-value indicator columns;
#   3) impute the numeric columns with the mean;
#   4) apply the Yeo-Johnson transformation to the selected features;
#   5) standard-scale the numeric columns.
# The two sklearn transformers are wrapped so they act only on `num_features`.
num_features_pipeline = Pipeline(
    steps=[
        ('replace_-1_nan', impute_nan_missing_value_transformer),
        ('addin_missing_indicator', missing_indicator),
        (
            'imputer_mean',
            SklearnTransformerWrapper(transformer=num_imputer, variables=num_features),
        ),
        ('ajustar_ditribuicao', yeo_transformer),
        (
            'standard_scaler',
            SklearnTransformerWrapper(transformer=scaler, variables=num_features),
        ),
    ]
)

In [47]:
# Fit the numeric pipeline on the training numeric features and keep the
# transformed result.
x=num_features_pipeline.fit_transform(train[num_features])

In [46]:
# (rows, columns) of the numeric slice of the training data.
train[num_features].shape

(595212, 26)

In [50]:
# Column labels of the transformed numeric data.
x.columns

Index(['ps_ind_01', 'ps_ind_03', 'ps_ind_14', 'ps_ind_15', 'ps_reg_01',
       'ps_reg_02', 'ps_reg_03', 'ps_car_11', 'ps_car_12', 'ps_car_13',
       'ps_car_14', 'ps_car_15', 'ps_calc_01', 'ps_calc_02', 'ps_calc_03',
       'ps_calc_04', 'ps_calc_05', 'ps_calc_06', 'ps_calc_07', 'ps_calc_08',
       'ps_calc_09', 'ps_calc_10', 'ps_calc_11', 'ps_calc_12', 'ps_calc_13',
       'ps_calc_14', 'ps_reg_03_na', 'ps_car_14_na'],
      dtype='object')