# Encoding

Nesta etapa vamos realizar o encoding das variáveis não numéricas, para que os dados possam ser passados ao modelo. Como vamos utilizar um modelo baseado em árvores, usaremos um ordinal encoder: árvores separam os dados por limiares, de modo que a ordem arbitrária dos códigos inteiros não prejudica o modelo, que ainda consegue capturar relações não lineares.  
Além dos encoders vamos criar todos os artefatos binários necessários para montar a pipeline do modelo.    


In [20]:
import pandas as pd
from feature_engine.encoding import OrdinalEncoder
import yaml
import pickle

import sys
sys.path.append('../')
from src.utils.transformers import Selector, FeatureEngineering, FixFeaturesMissing, FixFeaturesType

# Data

In [3]:
# Load the training split from the enriched-data folder into a DataFrame.
df = pd.read_parquet(
    path='../data/enrich/fraud_dataset_v2_train.parquet.gzip',
)

In [5]:
# Feature columns handed to `Selector` and used to pick the columns that are
# candidates for ordinal encoding. Order is kept exactly as originally defined.
support_features = [
    'o', 'f', 'n', 'l', 'a', 'm', 's',
    'br_qty_cat_last7d',
    'd',
    'br_vl_cat_last7d',
    'qty_fraud_cat_last7d',
    'h',
    'vl_fraud_cat_last7d',
]

In [16]:
# Parse the feature configuration (YAML) into a plain dict.
# A context manager is used so the file handle is closed deterministically
# instead of being left to the garbage collector.
with open('../src/data/config/features.yml', 'r') as config_file:
    config_features = yaml.safe_load(config_file)

In [17]:
# Display the parsed configuration dict as the cell output for inspection.
config_features

{'temporal_feature': 'fecha',
 'target_feature': 'fraude',
 'fix_type_map': {'datetime[64]': ['fecha']},
 'fix_missing_map': {'d': -2, 'g': 'NULO', 'o': 'NULO', 'q': -1},
 'fix_missing_numeric_features': ['b', 'c', 'f', 'l', 'm'],
 'hard_remove_features': ['c',
  'p',
  'q',
  'fecha',
  'i',
  'score',
  'fraude',
  'ymd']}

# Encoding

In [6]:
# Among the supporting features, only the object/string-typed columns need
# to be encoded; numeric columns are left untouched.
categorical_columns = (
    df[support_features]
    .select_dtypes(include=['object', 'string'])
    .columns
    .tolist()
)

# 'arbitrary' assigns integer codes without using the target.
# NOTE(review): the encoder is fitted on the raw training frame; confirm how
# missing or unseen categories should be handled at inference time.
ord_enc = OrdinalEncoder(
    encoding_method='arbitrary',
    variables=categorical_columns,
).fit(df)

In [18]:
# Sanity check: apply the fitted encoder to the training frame and show the
# first two rows. Only the columns given to the encoder are converted; every
# other column is passed through unchanged.
(
    ord_enc
    .transform(df)
    .head(2)
)

Unnamed: 0,a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,fecha,monto,score,fraude,ymd,day_of_week,hour_of_day,period_of_day,vl_cat_last7d,qty_cat_last7d,vl_fraud_cat_last7d,qty_fraud_cat_last7d,br_vl_cat_last7d,br_qty_cat_last7d
0,4,0.7518,42638.24,50.0,0.19021,9.0,BR,20,Mochila Impermeable Mujer Importada Premiun,cat_0cd53cb,0.364921,4338.0,366.0,1,0,Y,0.98,260445,100,2020-03-09 14:33:56,22.18,74.0,0,20200309,0,14,afternoon,964.89,26.0,0.0,0.0,0.0,0.0
1,4,0.7576,34602.36,4.0,0.317697,33.0,BR,7,Placa De Rede ( Pci-e / Pci Express X1 / Mini ...,cat_41464e4,0.056129,4605.0,297.0,1,0,Y,0.06,34602,100,2020-03-16 15:07:38,10.95,28.04,0,20200316,0,15,afternoon,413.53,20.0,4.45,1.0,0.010761,0.05


In [19]:
# Fit every custom pipeline step on the training frame so each one can be
# serialized afterwards as a ready-to-use artifact.

# Column selection based on the supporting feature list.
selector = Selector(support_features).fit(df)

# Type fixing driven by the `fix_type_map` section of the YAML configuration.
# NOTE(review): the loaded config uses the key 'datetime[64]'; confirm that
# FixFeaturesType resolves it to the intended pandas dtype.
fix_type = FixFeaturesType(
    config_features['fix_type_map'],
).fit(df)

# Missing-value handling: the list of numeric features plus the explicit
# per-feature fill map, both taken from the configuration.
fix_missing = FixFeaturesMissing(
    config_features['fix_missing_numeric_features'],
    config_features['fix_missing_map'],
).fit(df)

# Feature engineering step; it takes no constructor arguments.
feature_engineering = FeatureEngineering().fit(df)

In [21]:
# Persist every fitted pipeline step as a pickle under ../model/encoders/.
# Each file is opened in a `with` block so the handle is flushed and closed
# as soon as the dump finishes, instead of relying on garbage collection.
artifacts = {
    'ordinal_encoder': ord_enc,
    'selector': selector,
    'fix_type': fix_type,
    'fix_missing': fix_missing,
    'feature_engineering': feature_engineering,
}

for artifact_name, artifact in artifacts.items():
    # The file name is the artifact name plus the .pkl extension.
    with open(f'../model/encoders/{artifact_name}.pkl', 'wb') as artifact_file:
        pickle.dump(artifact, artifact_file)