In [1]:
import pandas as pd
from datetime import datetime
import numpy as np
import random as python_random
import joblib

from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
import tensorflow as tf

from utils import *



In [2]:
seed = 41
np.random.seed(seed)
python_random.seed(seed)
tf.random.set_seed(seed)

### Ler dados brutos do bd

In [3]:
df = fetch_data_from_db(const.consulta_sql)

In [4]:
df.head()

Unnamed: 0,profissao,tempoprofissao,renda,tiporesidencia,escolaridade,score,idade,dependentes,estadocivil,produto,valorsolicitado,valortotalbem,classe
0,Cientista de Dados,24,58660.0,Outros,Ens.Médio,MuitoBom,57.0,0,Solteiro,VoyageRoamer,84623.0,350000.0,bom
1,Empresário,21,46557.0,Outros,Ens.Médio,MuitoBom,36.0,2,Víuvo,EcoPrestige,126855.0,500000.0,bom
2,Dentista,13,43939.0,Própria,Ens.Médio,Bom,22.0,0,Casado,DoubleDuty,127151.0,320000.0,ruim
3,Engenheiro,10,37262.0,Própria,Superior,Baixo,34.0,0,Divorciado,AgileXplorer,28767.0,250000.0,bom
4,Contador,6,52606.0,Própria,PósouMais,Justo,26.0,0,Casado,TrailConqueror,199564.0,400000.0,ruim


### Conversão de Tipos

In [5]:
df['idade'] = df['idade'].astype(int)
df['valorsolicitado'] = df['valorsolicitado'].astype(float)
df['valortotalbem'] = df['valortotalbem'].astype(float)

### Tratamento de Nulos

In [None]:
substitui_nulos(df)

In [7]:
df.isnull().values.any()

False

### Trata Erros de Digitação

In [8]:
# aplicar para profissao
profissoes_validas = ['Advogado', 'Arquiteto', 'Cientista de Dados', 'Contador','Dentista','Empresário',
                 'Engenheiro','Médico','Programador']
corrigir_erros_digitacao(df, 'profissao', profissoes_validas)

In [9]:
df['profissao'].unique()

array(['Cientista de Dados', 'Empresário', 'Dentista', 'Engenheiro',
       'Contador', 'Arquiteto', 'Programador', 'Advogado', 'Médico'],
      dtype=object)

### Trata Outliers

In [10]:
df = tratar_outliers(df, 'tempoprofissao', 0, 70)
df = tratar_outliers(df, 'idade', 0, 110)

In [11]:
df.describe()

Unnamed: 0,tempoprofissao,renda,idade,dependentes,valorsolicitado,valortotalbem
count,150.0,150.0,150.0,150.0,150.0,150.0
mean,22.9,36406.813333,45.82,0.98,144512.68,375161.993333
std,11.114867,12974.282533,13.776777,0.993063,113913.175165,178933.034924
min,0.0,7814.0,21.0,0.0,28290.0,31170.0
25%,13.0,24271.75,35.0,0.0,69172.0,280000.0
50%,24.0,35795.0,46.0,1.0,123258.0,320000.0
75%,32.0,46361.0,57.0,2.0,170513.75,400000.0
max,40.0,59976.0,70.0,4.0,800000.0,800000.0


### Feature Engineering: Criar Novos Atributos

In [12]:
# Feature Engineering: criar novos atributos
df['proporcaosolicitadototal'] = df['valorsolicitado'] / df['valortotalbem']
df['proporcaosolicitadototal'] = df['proporcaosolicitadototal'].astype(float)

### Dividindo Dados

In [13]:
# Preparando os dados de entrada e saída, incluindo o novo atributo
X = df.drop('classe', axis=1)
y = df['classe']

# Dividindo os dados em conjuntos de treino e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

In [14]:
X_train

Unnamed: 0,profissao,tempoprofissao,renda,tiporesidencia,escolaridade,score,idade,dependentes,estadocivil,produto,valorsolicitado,valortotalbem,proporcaosolicitadototal
79,Cientista de Dados,40.0,23561.0,Outros,PósouMais,MuitoBom,43.0,0,Víuvo,VoyageRoamer,143697.0,350000.0,0.410563
54,Engenheiro,19.0,37568.0,Alugada,Ens.Fundamental,MuitoBom,69.0,0,Divorciado,AgileXplorer,84435.0,250000.0,0.337740
106,Programador,34.0,28792.0,Própria,Ens.Médio,MuitoBom,62.0,2,Víuvo,AgileXplorer,49694.0,250000.0,0.198776
90,Engenheiro,31.0,31284.0,Alugada,Ens.Fundamental,MuitoBom,46.0,1,Solteiro,ElegantCruise,42544.0,300000.0,0.141813
145,Médico,36.0,47480.0,Própria,Superior,Bom,63.0,0,Divorciado,SpeedFury,217011.0,800000.0,0.271264
...,...,...,...,...,...,...,...,...,...,...,...,...,...
26,Engenheiro,5.0,21685.0,Alugada,Ens.Fundamental,MuitoBom,52.0,0,Divorciado,SpeedFury,162661.0,800000.0,0.203326
89,Programador,24.0,19999.0,Outros,Ens.Médio,MuitoBom,33.0,1,Solteiro,DoubleDuty,320000.0,33471.0,9.560515
65,Dentista,22.0,31837.0,Outros,Superior,MuitoBom,21.0,0,Víuvo,WorkMaster,87972.0,280000.0,0.314186
80,Arquiteto,26.0,31394.0,Própria,PósouMais,MuitoBom,52.0,1,Divorciado,ElegantCruise,107035.0,300000.0,0.356783


In [15]:
y_train

79     ruim
54     ruim
106     bom
90      bom
145     bom
       ... 
26      bom
89     ruim
65     ruim
80     ruim
140    ruim
Name: classe, Length: 120, dtype: object

### Normalização

In [16]:
X_test = save_scalers(X_test, ['tempoprofissao','renda','idade','dependentes','valorsolicitado','valortotalbem','proporcaosolicitadototal'])
X_train = save_scalers(X_train, ['tempoprofissao','renda','idade','dependentes','valorsolicitado','valortotalbem','proporcaosolicitadototal'])

In [17]:
X_test.head()

Unnamed: 0,profissao,tempoprofissao,renda,tiporesidencia,escolaridade,score,idade,dependentes,estadocivil,produto,valorsolicitado,valortotalbem,proporcaosolicitadototal
119,Cientista de Dados,1.185897,1.825807,Própria,Ens.Fundamental,Baixo,0.220071,-0.166206,Casado,SpeedFury,0.91949,3.461002,-0.264215
128,Dentista,0.847874,-0.254147,Alugada,Superior,Justo,0.783667,0.831028,Víuvo,SpeedFury,4.597073,-0.767453,3.556824
135,Médico,-0.757734,-0.769935,Outros,Ens.Fundamental,MuitoBom,1.186236,-0.166206,Divorciado,TrailConqueror,-0.613239,0.412066,-0.525845
91,Programador,-0.588723,0.571378,Própria,PósouMais,Baixo,0.783667,0.831028,Víuvo,EcoPrestige,0.153154,1.1743,-0.278261
112,Programador,1.185897,-0.721346,Outros,Superior,Bom,-0.182498,-1.163439,Casado,DoubleDuty,-0.555359,-0.197721,-0.444277


### Codificação

In [18]:
mapeamento = {'ruim': 0, 'bom': 1}
# Aplica o mapeamento
y_train = np.array([mapeamento[item] for item in y_train])
y_test = np.array([mapeamento[item] for item in y_test])

In [19]:
y_train

array([0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1,
       1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0,
       0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 0, 0, 0, 0])

In [20]:
X_train = save_encoders(X_train, ['profissao', 'tiporesidencia', 'escolaridade','score','estadocivil','produto'])
X_test = save_encoders(X_test, ['profissao', 'tiporesidencia', 'escolaridade','score','estadocivil','produto'])

In [21]:
X_train

Unnamed: 0,profissao,tempoprofissao,renda,tiporesidencia,escolaridade,score,idade,dependentes,estadocivil,produto,valorsolicitado,valortotalbem,proporcaosolicitadototal
79,2,1.628292,-1.007150,1,2,3,-0.210769,-0.951479,3,6,-0.004945,-0.173090,-0.176873
54,6,-0.312718,0.062261,0,0,3,1.641623,-0.951479,1,0,-0.568777,-0.706208,-0.226440
106,8,1.073718,-0.607772,2,1,3,1.142902,1.087404,3,0,-0.899311,-0.706208,-0.321025
90,6,0.796431,-0.417512,0,0,3,0.002969,0.067963,2,3,-0.967338,-0.439649,-0.359797
145,7,1.258576,0.819025,2,3,1,1.214148,-0.951479,1,4,0.692582,2.225939,-0.271687
...,...,...,...,...,...,...,...,...,...,...,...,...,...
26,6,-1.606725,-1.150379,0,0,3,0.430444,-0.951479,1,4,0.175483,2.225939,-0.317928
89,8,0.149427,-1.279103,1,1,3,-0.923228,0.067963,2,1,1.672443,-1.860562,6.051030
65,4,-0.035431,-0.375291,1,3,3,-1.778178,-0.951479,3,7,-0.535125,-0.546273,-0.242472
80,1,0.334285,-0.409114,2,2,3,0.430444,0.067963,1,3,-0.353756,-0.439649,-0.213478


### Seleção de Atributos

In [22]:
# Instancia o modelo que você deseja usar
model = RandomForestClassifier()

# Instancia o RFE
selector = RFE(model, n_features_to_select=10, step=1)
selector = selector.fit(X_train, y_train)

# Transforma os dados
X_train = selector.transform(X_train)
X_test = selector.transform(X_test)


joblib.dump(selector, 'selector.joblib')

['selector.joblib']

In [23]:
print(selector.support_)
print(selector.ranking_)

[ True  True  True False False  True  True False  True  True  True  True
  True]
[1 1 1 4 2 1 1 3 1 1 1 1 1]
