# Preparação dos Dados

In [1]:
import pandas as pd
import numpy as np
import sys
import yaml
sys.path.append('../src/')

from utils.transformers import FixFeaturesType, FixFeaturesMissing

# Notebook display settings: show every column of wide frames and up to 100 rows.
pd.options.display.max_columns = None
pd.options.display.max_rows = 100

# Data

In [4]:
# Load the raw fraud dataset; the first CSV column is used as the row index.
df = pd.read_csv(
    filepath_or_buffer='../data/raw/fraud_dataset_v2.csv',
    index_col=0,
)

In [5]:
# Preview the first rows of the raw data.
df.head()

Unnamed: 0,a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,fecha,monto,score,fraude
0,4,0.7685,94436.24,20.0,0.444828,1.0,BR,5,Máquininha Corta Barba Cabelo Peito Perna Pelo...,cat_8d714cd,0.883598,240.0,102.0,1,,N,0.4,94436,0,2020-03-27 11:51:16,5.64,66.0,0
1,4,0.755,9258.5,1.0,0.0,33.0,BR,0,Avental Descartavel Manga Longa - 50 Un. Tnt ...,cat_64b574b,0.376019,4008.0,0.0,1,Y,N,0.02,9258,0,2020-04-15 19:58:08,124.71,72.0,0
2,4,0.7455,242549.09,3.0,0.0,19.0,AR,23,Bicicleta Mountain Fire Bird Rodado 29 Alumini...,cat_e9110c5,0.516368,1779.0,77.0,1,,N,0.06,242549,0,2020-03-25 18:13:38,339.32,95.0,0
3,4,0.7631,18923.9,50.0,0.482385,18.0,BR,23,Caneta Delineador Carimbo Olho Gatinho Longo 2...,cat_d06e653,0.154036,1704.0,1147.0,1,,Y,0.98,18923,100,2020-04-16 16:03:10,3.54,2.0,0
4,2,0.7315,5728.68,15.0,0.0,1.0,BR,2,Resident Evil Operation Raccoon City Ps3,cat_6c4cfdc,0.855798,1025.0,150.0,1,,N,0.28,5728,0,2020-04-02 10:24:45,3.53,76.0,1


In [35]:
# Per-column overview: describe() statistics plus dtype, distinct-value count
# and a cardinality-based type label.
df_info = df.describe(include='all').transpose()
df_info['tipo_coluna'] = df.dtypes
df_info['qtd_unicos'] = df.nunique()
# Columns with at most 70 distinct values are labelled 'categorica', the rest
# 'numerica'. The rule looks only at cardinality, so high-cardinality text
# columns (e.g. 'i', 'j') are also labelled 'numerica'.
df_info['tipo_final'] = (
    df_info['qtd_unicos']
    .le(70)
    .map({True: 'categorica', False: 'numerica'})
)
df_info

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max,tipo_coluna,qtd_unicos,tipo_final
a,250000.0,,,,3.710736,0.908846,0.0,3.0,4.0,4.0,5.0,int64,6,categorica
b,228526.0,,,,0.727647,0.132455,0.0,0.6778,0.7552,0.8061,1.0,float64,7672,numerica
c,228526.0,,,,257801.060924,840623.502209,0.16,9714.785,43578.31,144911.6,13878743.71,float64,135090,numerica
d,249406.0,,,,21.828208,20.088081,-1.0,3.0,15.0,49.0,51.0,float64,53,categorica
e,250000.0,,,,0.222844,2.650651,0.0,0.0,0.106154,0.283668,833.333333,float64,43208,numerica
f,249985.0,,,,51.288385,681.83091,-5.0,1.0,9.0,33.0,145274.0,float64,1559,numerica
g,249676.0,51.0,BR,185579.0,,,,,,,,object,51,categorica
h,250000.0,,,,14.259116,14.196752,-1.0,3.0,9.0,21.0,59.0,int64,61,categorica
i,250000.0,127804.0,Fone De Ouvido Sem Fio Xiaomi Airdots Preto,137.0,,,,,,,,object,127804,numerica
j,250000.0,8324.0,cat_43b9c10,3697.0,,,,,,,,object,8324,numerica


Avaliando as tabelas acima, será necessário realizar alguns ajustes de tipo e de nulos.  
Ajustes de Tipo:  
- Ajustar coluna de data 'fecha' para tipo timestamp  

Ajuste de Nulos:  
- Ajustar colunas ['d', 'g', 'o', 'q']: para as colunas categóricas, vou criar uma nova categoria indicando o nulo.  
- Ajustar colunas ['b', 'c', 'f', 'l', 'm']: para as colunas numéricas, vou substituir os nulos pela mediana; para isso vou utilizar o conjunto de validação.  

Colunas com até 70 valores únicos foram definidas como colunas do tipo categórico e colunas com mais foram definidas como numéricas.  


# Ajuste de Tipos

In [3]:
# Load the feature configuration (the type-fix and missing-value sections used
# by the transformers in later cells) from YAML.
# The context manager closes the file handle as soon as parsing finishes.
with open('../src/data/config/features.yml', 'r') as config_file:
    config_features = yaml.safe_load(config_file)

In [47]:
# Instantiate the project transformer FixFeaturesType with the 'fix_type_map'
# section of the feature config.
# NOTE(review): presumably this map drives the dtype corrections listed in the
# notebook notes (e.g. 'fecha' -> timestamp); confirm in utils.transformers.
fix_features = FixFeaturesType(config_features['fix_type_map'])

In [48]:
# Apply the type fixes to the raw frame and keep the result in df_tmp.
# NOTE(review): transform() is called without a prior fit() -- assumes
# FixFeaturesType needs no fitting and does not modify df in place; confirm.
df_tmp = fix_features.transform(df)

# Ajuste de Missings

In [49]:
# Instantiate the project transformer FixFeaturesMissing with the
# 'fix_missing_numeric_features' and 'fix_missing_map' sections of the config.
# NOTE(review): presumably the first lists the numeric columns to impute and
# the second maps the remaining columns to a fill value; confirm in utils.transformers.
fix_missing = FixFeaturesMissing(config_features['fix_missing_numeric_features'],
                                 config_features['fix_missing_map'])

In [50]:
# Fit the missing-value transformer on the type-fixed frame and apply it in one step.
# NOTE(review): df_tmp is both input and output here, so re-running this cell
# alone feeds it its own previous result. The fit also uses the whole dataset,
# while the notebook notes mention the validation set -- confirm the intent.
df_tmp = fix_missing.fit_transform(df_tmp)

In [51]:
# Inspect dtypes and non-null counts after the missing-value fix.
df_tmp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250000 entries, 0 to 249999
Data columns (total 23 columns):
 #   Column  Non-Null Count   Dtype        
---  ------  --------------   -----        
 0   a       250000 non-null  int64        
 1   b       250000 non-null  float64      
 2   c       250000 non-null  float64      
 3   d       250000 non-null  float64      
 4   e       250000 non-null  float64      
 5   f       250000 non-null  float64      
 6   g       250000 non-null  object       
 7   h       250000 non-null  int64        
 8   i       250000 non-null  object       
 9   j       250000 non-null  object       
 10  k       250000 non-null  float64      
 11  l       250000 non-null  float64      
 12  m       250000 non-null  float64      
 13  n       250000 non-null  int64        
 14  o       250000 non-null  object       
 15  p       250000 non-null  object       
 16  q       250000 non-null  float64      
 17  r       250000 non-null  int64        
 18  s   