In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics import confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler

from pycaret.datasets import get_data
from pycaret.classification import setup, compare_models, create_model, tune_model, evaluate_model, interpret_model, predict_model, save_model, load_model, plot_model, pull, save_model

### Carregando a base de dados

In [2]:
# Load the credit-scoring dataset from a Feather file (path is relative to the
# notebook's working directory) and preview the first rows.
df = pd.read_feather('credit_scoring.ftr')
df.head()

Unnamed: 0,data_ref,index,sexo,posse_de_veiculo,posse_de_imovel,qtd_filhos,tipo_renda,educacao,estado_civil,tipo_residencia,idade,tempo_emprego,qt_pessoas_residencia,renda,mau
0,2015-01-01,5733,F,N,N,0,Empresário,Médio,Solteiro,Casa,43,6.873973,1.0,2515.39,False
1,2015-01-01,727,F,S,S,0,Assalariado,Médio,Casado,Casa,35,4.526027,2.0,3180.19,False
2,2015-01-01,6374,F,N,N,2,Assalariado,Médio,Casado,Casa,31,0.243836,4.0,1582.29,False
3,2015-01-01,9566,F,N,N,0,Assalariado,Médio,Casado,Casa,54,12.772603,2.0,13721.17,False
4,2015-01-01,9502,F,S,N,0,Assalariado,Superior incompleto,Solteiro,Casa,31,8.432877,1.0,2891.08,False


### Separando os tipos de variáveis

In [3]:
# One list per variable type.
# Columns treated as categorical: these are the ones passed to pd.get_dummies
# in one_hot_encoding(). Note that the two count columns (qtd_filhos,
# qt_pessoas_residencia) are deliberately listed here rather than as numeric.
qualitativas = ['sexo', 'posse_de_veiculo', 'posse_de_imovel', 'qtd_filhos', 'tipo_renda', 'educacao',
               'estado_civil', 'tipo_residencia', 'qt_pessoas_residencia']
# Columns treated as numeric: these are the ones rescaled in min_max_scaler().
quantitativas = ['idade', 'tempo_emprego', 'renda']

### Funções para o pipeline

In [4]:
# Drop the bookkeeping columns that are not used as model features
def remover_colunas(df):
    """Return a new DataFrame without the ``index`` and ``data_ref`` columns.

    The input frame is left untouched. pandas raises ``KeyError`` if either
    column is absent.
    """
    indesejadas = ['index', 'data_ref']
    return df.drop(indesejadas, axis=1)

# Rebalance the number of non-defaulters relative to defaulters
def balanceamento(df):
    """Undersample non-defaulters to at most 3x the number of defaulters.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``mau`` column where 1/True marks a defaulter and
        0/False a non-defaulter.

    Returns
    -------
    pandas.DataFrame
        Every defaulter row plus a random sample of non-defaulter rows
        (roughly 25% / 75% when enough non-defaulters exist), shuffled and
        re-indexed from 0.
    """
    df_1 = df[df['mau'] == 1]  # defaulters: always kept in full
    df_0 = df[df['mau'] == 0]  # non-defaulters: sampled below
    # Target is 3 non-defaulters per defaulter. Cap at what is available:
    # sampling without replacement raises ValueError when n exceeds the
    # population size.
    n = min(df_1.shape[0] * 3, df_0.shape[0])
    df_0 = df_0.sample(n=n, random_state=0)
    # frac=1 shuffles the concatenated rows so the classes are not stored in
    # two contiguous runs.
    return pd.concat([df_1, df_0]).sample(frac=1, random_state=0).reset_index(drop=True)

# Fill missing values with the column mean
def preencher_missings(df):
    """Return a copy of ``df`` with missing ``tempo_emprego`` set to the mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a numeric ``tempo_emprego`` column.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns in the same order; the caller's frame
        is not modified.
    """
    media = df['tempo_emprego'].mean()
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # the selected column: the chained in-place form does not update the
    # parent frame when pandas Copy-on-Write is active.
    return df.assign(tempo_emprego=df['tempo_emprego'].fillna(media))

def remove_outliers_renda(df, z_thresh=3):
    """Drop rows whose ``renda`` z-score magnitude exceeds ``z_thresh``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a numeric ``renda`` column.
    z_thresh : float, default 3
        Largest absolute z-score that is still kept.

    Returns
    -------
    pandas.DataFrame
        New frame with only the retained rows (original index labels kept).
        Rows with a missing ``renda`` are always dropped, since their z-score
        cannot be compared with the threshold.
    """
    col = 'renda'
    desvio = df[col].std()
    # With a constant column (std == 0) or fewer than two values (std is NaN)
    # every z-score would be NaN and all rows would be discarded. There are no
    # outliers in that situation, so keep every row that has a value.
    if pd.isna(desvio) or desvio == 0:
        return df[df[col].notna()]
    col_zscore = (df[col] - df[col].mean()) / desvio
    return df[col_zscore.abs() <= z_thresh]

# Merge sparse categories into broader groups
def agrupar_categorias(df):
    """Return a copy of ``df`` with several categorical columns regrouped.

    Regrouping rules:

    * ``qt_pessoas_residencia``: cast to int; values of 6 or more become ``'6+'``.
    * ``qtd_filhos``: values of 5 or more become ``'5+'``.
    * ``tipo_renda``: ``'Bolsista'`` is merged into ``'Assalariado'``.
    * ``educacao``: collapsed to ``'Básico'`` / ``'Avançado'``.
    * ``tipo_residencia``: ``'Estúdio'``, ``'Comunitário'`` and
      ``'Governamental'`` become ``'Outros'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the five columns above. ``qt_pessoas_residencia`` must be
        free of missing values because it is cast to ``int``.

    Returns
    -------
    pandas.DataFrame
        New frame; the caller's frame is not modified.
    """
    # Work on a copy so the input (possibly a filtered slice of another frame)
    # is never mutated.
    df = df.copy()

    # Thresholds instead of an explicit list of observed values, so counts
    # that were not enumerated (e.g. 8 people) also land in the open-ended
    # bucket.
    pessoas = df['qt_pessoas_residencia'].astype(int)
    df['qt_pessoas_residencia'] = pessoas.mask(pessoas >= 6, '6+')

    filhos = df['qtd_filhos']
    df['qtd_filhos'] = filhos.mask(filhos >= 5, '5+')

    # Assign each replace() result back to the frame: calling
    # replace(inplace=True) on a selected column does not update the parent
    # frame when pandas Copy-on-Write is active.
    df['tipo_renda'] = df['tipo_renda'].replace({'Bolsista': 'Assalariado'})
    df['educacao'] = df['educacao'].replace({
        'Fundamental': 'Básico',
        'Médio': 'Básico',
        'Superior incompleto': 'Avançado',
        'Superior completo': 'Avançado',
        'Pós graduação': 'Avançado',
    })
    df['tipo_residencia'] = df['tipo_residencia'].replace({
        'Estúdio': 'Outros',
        'Comunitário': 'Outros',
        'Governamental': 'Outros',
    })
    return df


def one_hot_encoding(df):
    """One-hot encode the columns named in the module-level ``qualitativas`` list.

    The first level of each encoded column is dropped (``drop_first=True``).
    Returns the new frame produced by ``pd.get_dummies``.
    """
    return pd.get_dummies(df, drop_first=True, columns=qualitativas)


def min_max_scaler(df):
    """Min-max scale the columns named in the module-level ``quantitativas`` list.

    A fresh ``MinMaxScaler`` is fitted on the frame it receives and the scaled
    values are written back into ``df`` itself, which is then returned.
    """
    escalonadas = MinMaxScaler().fit_transform(df[quantitativas])
    df[quantitativas] = escalonadas
    return df
    


### Criando pipeline com as funções anteriores

In [5]:
# Each step is named after the function it wraps; the order below is the
# order in which the transformations are applied.
pipeline = Pipeline(steps=[
    (etapa.__name__, FunctionTransformer(etapa))
    for etapa in (
        remover_colunas,
        balanceamento,
        preencher_missings,
        remove_outliers_renda,
        agrupar_categorias,
        one_hot_encoding,
        min_max_scaler,
    )
])

### Transformando o DF com o pipeline

In [6]:
# Apply the preprocessing pipeline to the full dataset and preview the result.
# NOTE(review): balancing, outlier removal and scaling all run here, before
# the train/test split performed later by PyCaret's setup() — confirm this is
# intended, since the hold-out rows are preprocessed together with the
# training rows.
df_transformed = pipeline.fit_transform(df)
df_transformed.head()

Unnamed: 0,idade,tempo_emprego,renda,mau,sexo_M,posse_de_veiculo_S,posse_de_imovel_S,qtd_filhos_1,qtd_filhos_2,qtd_filhos_3,...,estado_civil_União,estado_civil_Viúvo,tipo_residencia_Casa,tipo_residencia_Com os pais,tipo_residencia_Outros,qt_pessoas_residencia_2,qt_pessoas_residencia_3,qt_pessoas_residencia_4,qt_pessoas_residencia_5,qt_pessoas_residencia_6+
0,0.304348,0.155206,0.042897,False,1,1,0,1,0,0,...,0,0,1,0,0,0,1,0,0,0
1,0.282609,0.215136,0.018997,False,1,1,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,0.065217,0.109361,0.018048,False,0,0,1,0,0,0,...,1,0,1,0,0,1,0,0,0,0
3,0.217391,0.073505,0.022863,False,0,0,0,1,0,0,...,0,0,1,0,0,0,1,0,0,0
4,0.108696,0.127609,0.011494,False,1,1,1,0,0,0,...,0,0,1,0,0,1,0,0,0,0


### Verificando qual o modelo ideal com foco em recall

In [7]:
# Initial configuration of the PyCaret classification experiment
clf = setup(data=df_transformed, 
            target='mau', 
            session_id=123, # Fixed session id so runs are repeatable
            fold_strategy='stratifiedkfold', 
            fold=5, # Number of cross-validation folds
            use_gpu=True) # Ask PyCaret to use the GPU for training


# Compare the candidate models, ranked by Recall
best_model = compare_models(sort='Recall')
best_model

[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: Ellesmere, Vendor: Advanced Micro Devices, Inc.
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: Ellesmere, Vendor: Advanced Micro Devices, Inc.
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: Ellesmere, Vendor: Advanced Micro Devices, Inc.
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...


[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: Ellesmere, Vendor: Advanced Micro Devices, Inc.
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: Ellesmere, Vendor: Advanced Micro Devices, Inc.
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: Ellesmere, Vendor: Advanced Micro Devices, Inc.
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...


[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


Unnamed: 0,Description,Value
0,Session id,123
1,Target,mau
2,Target type,Binary
3,Original data shape,"(232133, 28)"
4,Transformed data shape,"(232133, 28)"
5,Transformed train set shape,"(162493, 28)"
6,Transformed test set shape,"(69640, 28)"
7,Numeric features,27
8,Preprocess,True
9,Imputation type,simple


[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: Ellesmere, Vendor: Advanced Micro Devices, Inc.
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: Ellesmere, Vendor: Advanced Micro Devices, Inc.
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...


[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
nb,Naive Bayes,0.5809,0.6755,0.7651,0.3498,0.4797,0.2038,0.2485,0.298
qda,Quadratic Discriminant Analysis,0.5581,0.6762,0.48,0.3269,0.2824,0.0596,0.0883,0.374
rf,Random Forest Classifier,0.7209,0.6963,0.3756,0.4387,0.4047,0.2239,0.225,5.338
knn,K Neighbors Classifier,0.7429,0.6923,0.3442,0.4875,0.4035,0.2458,0.2519,10.992
et,Extra Trees Classifier,0.7161,0.6332,0.3442,0.4237,0.3798,0.1983,0.2001,5.774
dt,Decision Tree Classifier,0.7104,0.6175,0.3345,0.41,0.3684,0.183,0.1846,1.556
lightgbm,Light Gradient Boosting Machine,0.7783,0.7783,0.3104,0.6227,0.4143,0.296,0.3233,1.44
gbc,Gradient Boosting Classifier,0.7786,0.7791,0.3066,0.6262,0.4116,0.2945,0.323,17.136
ada,Ada Boost Classifier,0.7782,0.778,0.2805,0.6398,0.3894,0.2785,0.3141,4.828
lr,Logistic Regression,0.7485,0.7707,0.0063,0.7599,0.0125,0.0084,0.0525,0.784


### Tuning de hiperparâmetros
<br>

#### Foram escolhidos: <br>
#### * Naive Bayes: alto Recall e F1-Score
#### * Random Forest Classifier: terceiro melhor Recall, mantendo uma boa taxa nas demais medidas
#### * Gradient Boosting Classifier: melhor Accuracy e AUC, bom F1-Score

#### Tentei otimizar o NB por Recall, Precision e F1: com Recall e F1 o resultado quase não mudou, e com Precision o Recall caiu muito; portanto, o tuning não trouxe ganho para o NB.

In [8]:
# Tune the Naive Bayes hyperparameters, optimising for F1
nb_model = create_model('nb')
tuned_nb = tune_model(nb_model, optimize='F1')
evaluate_model(tuned_nb)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.5772,0.6787,0.7954,0.3512,0.4873,0.2107,0.2625
1,0.5763,0.6717,0.7679,0.347,0.478,0.1994,0.2446
2,0.5539,0.6783,0.807,0.339,0.4775,0.189,0.2443
3,0.6027,0.6779,0.7275,0.3587,0.4805,0.2149,0.2503
4,0.5943,0.6709,0.7275,0.353,0.4753,0.2049,0.2408
Mean,0.5809,0.6755,0.7651,0.3498,0.4797,0.2038,0.2485
Std,0.0169,0.0035,0.0332,0.0066,0.0041,0.0091,0.0076


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.5772,0.6787,0.7954,0.3512,0.4873,0.2107,0.2625
1,0.5763,0.6717,0.7679,0.347,0.478,0.1994,0.2446
2,0.5539,0.6783,0.807,0.339,0.4775,0.189,0.2443
3,0.6027,0.6779,0.7275,0.3587,0.4805,0.2149,0.2503
4,0.5943,0.6709,0.7275,0.353,0.4753,0.2049,0.2408
Mean,0.5809,0.6755,0.7651,0.3498,0.4797,0.2038,0.2485
Std,0.0169,0.0035,0.0332,0.0066,0.0041,0.0091,0.0076


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [9]:
# Generate predictions with the tuned Naive Bayes model
predictions = predict_model(tuned_nb)
# Extract the actual and predicted labels
y_true = predictions['mau']  # Actual target
y_pred = predictions['prediction_label']  # Labels predicted by the model

# Confusion matrix kept for the profit calculation further down
cm_nb = confusion_matrix(y_true, y_pred)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Naive Bayes,0.5774,0.673,0.7657,0.3473,0.4779,0.1998,0.2444


#### Tentei otimizar o RF por Recall, porém o modelo simplesmente classificou todos como True, o que é péssimo. Em seguida, otimizei com foco em F1 para ver se o Recall melhorava sem sacrificar o Precision.

In [10]:
# Tune the Random Forest hyperparameters, optimising for F1
rf_model = create_model('rf')
tuned_rf = tune_model(rf_model, optimize='F1')
evaluate_model(tuned_rf)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7186,0.6956,0.375,0.4339,0.4023,0.2195,0.2205
1,0.7215,0.6955,0.3776,0.4402,0.4065,0.226,0.2271
2,0.7193,0.696,0.3747,0.4353,0.4027,0.2206,0.2216
3,0.723,0.6968,0.3738,0.4426,0.4053,0.2264,0.2278
4,0.7223,0.6973,0.3771,0.4416,0.4068,0.227,0.2282
Mean,0.7209,0.6963,0.3756,0.4387,0.4047,0.2239,0.225
Std,0.0017,0.0007,0.0015,0.0035,0.0019,0.0032,0.0033


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7071,0.7611,0.71,0.4494,0.5505,0.3491,0.3692
1,0.6962,0.7618,0.7292,0.439,0.548,0.3399,0.3647
2,0.7058,0.7633,0.7091,0.448,0.5491,0.3469,0.3671
3,0.6975,0.7641,0.7361,0.4408,0.5514,0.3442,0.37
4,0.7028,0.7568,0.71,0.4447,0.5469,0.3427,0.3635
Mean,0.7019,0.7614,0.7189,0.4444,0.5492,0.3446,0.3669
Std,0.0044,0.0025,0.0114,0.004,0.0016,0.0032,0.0025


Fitting 5 folds for each of 10 candidates, totalling 50 fits


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [11]:
# Generate predictions with the tuned Random Forest model
predictions = predict_model(tuned_rf)
# Extract the actual and predicted labels
y_true = predictions['mau']  # Actual target
y_pred = predictions['prediction_label']  # Labels predicted by the model

# Confusion matrix kept for the profit calculation further down
cm_rf = confusion_matrix(y_true, y_pred)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Random Forest Classifier,0.6935,0.7597,0.7293,0.4362,0.5459,0.336,0.3613


In [12]:
# Tune the Gradient Boosting Classifier hyperparameters, optimising for Recall
gbc_model = create_model('gbc')
tuned_gbc = tune_model(gbc_model, optimize='Recall')
evaluate_model(tuned_gbc)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.777,0.7779,0.304,0.6194,0.4078,0.2897,0.3176
1,0.7805,0.7797,0.308,0.6352,0.4148,0.2992,0.3289
2,0.7817,0.7813,0.3189,0.635,0.4246,0.3076,0.3355
3,0.7792,0.7805,0.3062,0.6294,0.4119,0.2955,0.3246
4,0.7748,0.776,0.2959,0.6118,0.3989,0.2804,0.3085
Mean,0.7786,0.7791,0.3066,0.6262,0.4116,0.2945,0.323
Std,0.0025,0.0019,0.0074,0.0092,0.0084,0.0091,0.0093


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7656,0.7595,0.3265,0.562,0.413,0.2792,0.2953
1,0.7669,0.7594,0.3268,0.5669,0.4146,0.282,0.2986
2,0.7675,0.7618,0.3338,0.5678,0.4204,0.2871,0.3029
3,0.7683,0.7608,0.3252,0.5726,0.4148,0.2838,0.3014
4,0.7645,0.7581,0.313,0.5605,0.4017,0.2695,0.2872
Mean,0.7666,0.7599,0.3251,0.566,0.4129,0.2803,0.2971
Std,0.0013,0.0013,0.0067,0.0043,0.0061,0.006,0.0056


Fitting 5 folds for each of 10 candidates, totalling 50 fits


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [13]:
# Generate predictions with the tuned Gradient Boosting model
predictions = predict_model(tuned_gbc)
# Extract the actual and predicted labels
y_true = predictions['mau']  # Actual target
y_pred = predictions['prediction_label']  # Labels predicted by the model

# Confusion matrix kept for the profit calculation further down
cm_gbc = confusion_matrix(y_true, y_pred)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Gradient Boosting Classifier,0.7695,0.761,0.3233,0.5781,0.4147,0.2852,0.3038


### Calculando o lucro (ganhos e perdas) para identificar a melhor métrica e o melhor modelo

#### Obs.: Não foi incluído o True Positive (inadimplente corretamente identificado), porque ele apenas evita uma perda: não gera ganho nem prejuízo.

In [14]:
# Gain and loss assigned to each confusion-matrix outcome
ganho_TN = 1   # Gain per True Negative (non-defaulter correctly identified)
perda_FP = 1    # Loss per False Positive (non-defaulter wrongly flagged as defaulter)
perda_FN = 10  # Loss per False Negative (defaulter not identified)

def calcular_lucro_total(cm, ganho_TN, perda_FP, perda_FN):
    """Turn a confusion matrix into a profit figure and a missed-profit figure.

    Parameters
    ----------
    cm : 2-D array indexable as ``cm[row, col]``
        Confusion matrix with actual classes on rows and predicted classes on
        columns; only ``cm[0, 0]``, ``cm[0, 1]`` and ``cm[1, 0]`` are read.
    ganho_TN : number
        Gain credited for each true negative.
    perda_FP : number
        Amount counted as missed profit for each false positive.
    perda_FN : number
        Loss charged for each false negative.

    Returns
    -------
    tuple
        ``(lucro, lucro_perdido)`` where ``lucro`` is the true-negative gain
        minus the false-negative loss, and ``lucro_perdido`` is the
        false-positive amount, reported separately and not subtracted from
        ``lucro``.
    """
    # True positives (cm[1, 1]) are intentionally not part of either figure.
    verdadeiros_neg, falsos_pos, falsos_neg = cm[0, 0], cm[0, 1], cm[1, 0]

    ganho_liquido = (verdadeiros_neg * ganho_TN) - (falsos_neg * perda_FN)
    return ganho_liquido, falsos_pos * perda_FP

In [15]:
# Compute the profit figures for each model from the confusion matrices
# obtained in the prediction cells above
lucro_nb, lucro_perdido_nb = calcular_lucro_total(cm_nb, ganho_TN, perda_FP, perda_FN)
lucro_rf, lucro_perdido_rf = calcular_lucro_total(cm_rf, ganho_TN, perda_FP, perda_FN)
lucro_gbc, lucro_perdido_gbc = calcular_lucro_total(cm_gbc, ganho_TN, perda_FP, perda_FN)

# Report, per model, the profit and the profit missed because of false positives
print(f"Lucro total para Naive Bayes: {lucro_nb}")
print(f"Lucro perdido para Naive Bayes (devido a FP): {lucro_perdido_nb} \n")

print(f"Lucro total para Random Forest: {lucro_rf}")
print(f"Lucro perdido para Random Forest (devido a FP): {lucro_perdido_rf} \n")

print(f"Lucro total para Gradient Boosting: {lucro_gbc}")
print(f"Lucro perdido para Gradient Boosting (devido a FP): {lucro_perdido_gbc}")

Lucro total para Naive Bayes: -14479
Lucro perdido para Naive Bayes (devido a FP): 25309 

Lucro total para Random Forest: -12141
Lucro perdido para Random Forest (devido a FP): 16581 

Lucro total para Gradient Boosting: -71139
Lucro perdido para Gradient Boosting (devido a FP): 4149


In [16]:
# Persist the tuned Random Forest (together with PyCaret's transformation
# pipeline) under the name 'tuned_rf_model'
save_model(tuned_rf, 'tuned_rf_model')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['idade', 'tempo_emprego', 'renda',
                                              'sexo_M', 'posse_de_veiculo_S',
                                              'posse_de_imovel_S',
                                              'qtd_filhos_1', 'qtd_filhos_2',
                                              'qtd_filhos_3', 'qtd_filhos_4',
                                              'qtd_filhos_5+',
                                              'tipo_renda_Empresário',
                                              'tipo_renda_Pensionista',
                                              'tipo_renda_Servidor público',
                                              'educac...
                  RandomForestClassifier(bootstrap=False, ccp_alpha=0.0,
                                         class_weight='balanced_subsample',

In [17]:
# Show which Python interpreter the kernel is running on.
# NOTE(review): this looks like an environment-debugging leftover; its output
# exposes a local user path — consider removing the cell or clearing the
# output before sharing.
import sys
print(sys.executable)


C:\Users\Bill_\anaconda3\python.exe
