In [20]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss , ClusterCentroids
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import precision_score, recall_score, f1_score,accuracy_score ,make_scorer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate, StratifiedKFold
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
dados_limpos = pd.read_csv('/content/dados_apos_limpeza.csv')
dados_limpos.head()

Unnamed: 0,idade_pessoa,renda_pessoa,propriedade_casa_pessoa,tempo_emprego_pessoa,intencao_emprestimo,grau_emprestimo,valor_emprestimo,taxa_juros_emprestimo,status_emprestimo,percentual_renda_emprestimo,inadimplente_pessoa,grau_emprestimo_encoded
0,26.0,84996.0,Rent,0.0,Medical,B,9000.0,11.26,0.0,0.11,N,1.0
1,24.0,55782.0,Rent,3.0,Homeimprovement,A,17000.0,7.51,0.0,0.3,N,0.0
2,25.0,125000.0,Own,9.0,Personal,B,25000.0,10.59,0.0,0.2,N,1.0
3,23.0,44340.0,Own,0.0,Venture,C,3000.0,12.98,0.0,0.07,Y,2.0
4,23.0,48000.0,Mortgage,5.0,Education,C,5000.0,13.99,0.0,0.1,Y,2.0


In [14]:
# nomalizar colunas numericas

# Selecionar apenas as colunas numéricas
colunas_numericas = dados_limpos.select_dtypes(include=np.number)

# Criar um objeto MinMaxScaler
scaler = MinMaxScaler()

# Ajustar e transformar os dados numéricos
dados_normalizados = pd.DataFrame(scaler.fit_transform(colunas_numericas), columns=colunas_numericas.columns)

# Substituir as colunas originais pelas colunas normalizadas no DataFrame principal
dados_limpos[colunas_numericas.columns] = dados_normalizados


In [15]:
# Selecionar as colunas do tipo objeto
colunas_objeto = dados_limpos.select_dtypes(include=['object']).columns

# Aplicar o One-Hot Encoding com drop='first'
dados_encoded = pd.get_dummies(dados_limpos, columns=colunas_objeto, drop_first=True)

dados_encoded = dados_encoded.replace({True: 1, False: 0})

# Visualizar o DataFrame com as colunas codificadas
dados_encoded.head()

Unnamed: 0,idade_pessoa,renda_pessoa,tempo_emprego_pessoa,valor_emprestimo,taxa_juros_emprestimo,status_emprestimo,percentual_renda_emprestimo,grau_emprestimo_encoded,propriedade_casa_pessoa_Other,propriedade_casa_pessoa_Own,...,intencao_emprestimo_Medical,intencao_emprestimo_Personal,intencao_emprestimo_Venture,grau_emprestimo_B,grau_emprestimo_C,grau_emprestimo_D,grau_emprestimo_E,grau_emprestimo_F,grau_emprestimo_G,inadimplente_pessoa_Y
0,0.081081,0.013508,0.0,0.246377,0.32809,0.0,0.13253,0.166667,0,0,...,1,0,0,1,0,0,0,0,0,0
1,0.054054,0.008636,0.073171,0.478261,0.117416,0.0,0.361446,0.0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0.067568,0.02018,0.219512,0.710145,0.290449,0.0,0.240964,0.166667,0,1,...,0,1,0,1,0,0,0,0,0,0
3,0.040541,0.006728,0.0,0.072464,0.424719,0.0,0.084337,0.333333,0,1,...,0,0,1,0,1,0,0,0,0,1
4,0.040541,0.007338,0.121951,0.130435,0.481461,0.0,0.120482,0.333333,0,0,...,0,0,0,0,1,0,0,0,0,1


In [16]:
#train test split

# Separar as variáveis independentes (X) e a variável dependente (y)
X = dados_encoded.drop('inadimplente_pessoa_Y', axis=1)  # Remova a coluna 'inadimplente_pessoa_Y'
y = dados_encoded['inadimplente_pessoa_Y']

# Dividir os dados em conjuntos de treinamento e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [17]:
# Aplicar SMOTE para oversampling
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Aplicar NearMiss para undersampling
nm = ClusterCentroids(random_state=42)
X_train_nm, y_train_nm = nm.fit_resample(X_train, y_train)


In [18]:
# Função para avaliar o modelo
def evaluate_model(y_test, y_pred, model_name, method_name):
    print(f"\n{model_name} - {method_name}")
    print("Precisão:", precision_score(y_test, y_pred))
    print("Recall:", recall_score(y_test, y_pred))
    print("F1-score:", f1_score(y_test, y_pred))
    print("Acurácia:", accuracy_score(y_test, y_pred))
# Modelos
logistic_model = LogisticRegression(random_state=42)
random_forest_model = RandomForestClassifier(random_state=42)
xgb_model = XGBClassifier(random_state=42)



# Regressão Logística com SMOTE
logistic_model.fit(X_train_smote, y_train_smote)
y_pred_log_smote = logistic_model.predict(X_test)
evaluate_model(y_test, y_pred_log_smote, "Regressão Logística", "SMOTE")

# Regressão Logística com NearMiss
logistic_model.fit(X_train_nm, y_train_nm)
y_pred_log_nm = logistic_model.predict(X_test)
evaluate_model(y_test, y_pred_log_nm, "Regressão Logística", "NearMiss")

# Random Forest com SMOTE
random_forest_model.fit(X_train_smote, y_train_smote)
y_pred_rf_smote = random_forest_model.predict(X_test)
evaluate_model(y_test, y_pred_rf_smote, "Random Forest", "SMOTE")

# Random Forest com NearMiss
random_forest_model.fit(X_train_nm, y_train_nm)
y_pred_rf_nm = random_forest_model.predict(X_test)
evaluate_model(y_test, y_pred_rf_nm, "Random Forest", "NearMiss")

# XGBoost com SMOTE
xgb_model.fit(X_train_smote, y_train_smote)
y_pred_xgb_smote = xgb_model.predict(X_test)
evaluate_model(y_test, y_pred_xgb_smote, "XGBoost", "SMOTE")


# XGBoost com NearMiss
xgb_model.fit(X_train_nm, y_train_nm)
y_pred_xgb_nm = xgb_model.predict(X_test)
evaluate_model(y_test, y_pred_xgb_nm, "XGBoost", "NearMiss")


Regressão Logística - SMOTE
Precisão: 0.5172724541904475
Recall: 1.0
F1-score: 0.6818451791724411
Acurácia: 0.8310555088309504

Regressão Logística - NearMiss
Precisão: 0.5141788719227174
Recall: 0.9581881533101045
F1-score: 0.6692354491989454
Acurácia: 0.8285323801513877

Random Forest - SMOTE
Precisão: 0.5221436546334202
Recall: 0.8147502903600464
F1-score: 0.6364254933091404
Acurácia: 0.8314760302775441

Random Forest - NearMiss
Precisão: 0.5160175879396985
Recall: 0.9541231126596981
F1-score: 0.669792091316755
Acurácia: 0.8296888141295206

XGBoost - SMOTE
Precisão: 0.5151374010246856
Recall: 0.6422764227642277
F1-score: 0.5717239596795037
Acurácia: 0.8257989907485281

XGBoost - NearMiss
Precisão: 0.5116650687120485
Recall: 0.929732868757259
F1-score: 0.6600700886415172
Acurácia: 0.8266400336417157


In [21]:
# Separar as variáveis independentes (X) e a variável dependente (y)
X = dados_encoded.drop('inadimplente_pessoa_Y', axis=1)  # Remova a coluna 'inadimplente_pessoa_Y'
y = dados_encoded['inadimplente_pessoa_Y']

# Configurar a validação cruzada
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Aplicar SMOTE para oversampling
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X, y)

# Aplicar NearMiss (ClusterCentroids) para undersampling
nm = ClusterCentroids(random_state=42)
X_nm, y_nm = nm.fit_resample(X, y)

# Modelos
logistic_model = LogisticRegression(random_state=42)
random_forest_model = RandomForestClassifier(random_state=42)
xgb_model = XGBClassifier(random_state=42)

# Definir métricas para validação cruzada
scoring = {
    'f1': make_scorer(f1_score),
    'precision': make_scorer(precision_score),
    'recall': make_scorer(recall_score),
    'accuracy': make_scorer(accuracy_score)
}

# Regressão Logística com SMOTE
logistic_scores_smote = cross_validate(logistic_model, X_smote, y_smote, cv=cv, scoring=scoring)
print("F1-score médio - Regressão Logística com SMOTE:", logistic_scores_smote['test_f1'].mean())
print("Precisão média - Regressão Logística com SMOTE:", logistic_scores_smote['test_precision'].mean())
print("Recall médio - Regressão Logística com SMOTE:", logistic_scores_smote['test_recall'].mean())
print("Acurácia média - Regressão Logística com SMOTE:", logistic_scores_smote['test_accuracy'].mean())
print("---------------------------------------")
# Regressão Logística com NearMiss
logistic_scores_nm = cross_validate(logistic_model, X_nm, y_nm, cv=cv, scoring=scoring)
print("F1-score médio - Regressão Logística com NearMiss:", logistic_scores_nm['test_f1'].mean())
print("Precisão média - Regressão Logística com NearMiss:", logistic_scores_nm['test_precision'].mean())
print("Recall médio - Regressão Logística com NearMiss:", logistic_scores_nm['test_recall'].mean())
print("Acurácia média - Regressão Logística com NearMiss:", logistic_scores_nm['test_accuracy'].mean())
print("---------------------------------------")
# Random Forest com SMOTE
rf_scores_smote = cross_validate(random_forest_model, X_smote, y_smote, cv=cv, scoring=scoring)
print("F1-score médio - Random Forest com SMOTE:", rf_scores_smote['test_f1'].mean())
print("Precisão média - Random Forest com SMOTE:", rf_scores_smote['test_precision'].mean())
print("Recall médio - Random Forest com SMOTE:", rf_scores_smote['test_recall'].mean())
print("Acurácia média - Random Forest com SMOTE:", rf_scores_smote['test_accuracy'].mean())
print("---------------------------------------")
# Random Forest com NearMiss
rf_scores_nm = cross_validate(random_forest_model, X_nm, y_nm, cv=cv, scoring=scoring)
print("F1-score médio - Random Forest com NearMiss:", rf_scores_nm['test_f1'].mean())
print("Precisão média - Random Forest com NearMiss:", rf_scores_nm['test_precision'].mean())
print("Recall médio - Random Forest com NearMiss:", rf_scores_nm['test_recall'].mean())
print("Acurácia média - Random Forest com NearMiss:", rf_scores_nm['test_accuracy'].mean())
print("---------------------------------------")
# XGBoost com SMOTE
xgb_scores_smote = cross_validate(xgb_model, X_smote, y_smote, cv=cv, scoring=scoring)
print("F1-score médio - XGBoost com SMOTE:", xgb_scores_smote['test_f1'].mean())
print("Precisão média - XGBoost com SMOTE:", xgb_scores_smote['test_precision'].mean())
print("Recall médio - XGBoost com SMOTE:", xgb_scores_smote['test_recall'].mean())
print("Acurácia média - XGBoost com SMOTE:", xgb_scores_smote['test_accuracy'].mean())
print("---------------------------------------")
# XGBoost com NearMiss
xgb_scores_nm = cross_validate(xgb_model, X_nm, y_nm, cv=cv, scoring=scoring)
print("F1-score médio - XGBoost com NearMiss:", xgb_scores_nm['test_f1'].mean())
print("Precisão média - XGBoost com NearMiss:", xgb_scores_nm['test_precision'].mean())
print("Recall médio - XGBoost com NearMiss:", xgb_scores_nm['test_recall'].mean())
print("Acurácia média - XGBoost com NearMiss:", xgb_scores_nm['test_accuracy'].mean())
print("---------------------------------------")

F1-score médio - Regressão Logística com SMOTE: 0.9040505909391824
Precisão média - Regressão Logística com SMOTE: 0.8255135786169981
Recall médio - Regressão Logística com SMOTE: 0.9991178628524647
Acurácia média - Regressão Logística com SMOTE: 0.8939515163806361
---------------------------------------
F1-score médio - Regressão Logística com NearMiss: 0.8489812322344822
Precisão média - Regressão Logística com NearMiss: 0.7559165129659371
Recall médio - Regressão Logística com NearMiss: 0.9682189626178681
Acurácia média - Regressão Logística com NearMiss: 0.8277697626768019
---------------------------------------
F1-score médio - Random Forest com SMOTE: 0.9099152116515423
Precisão média - Random Forest com SMOTE: 0.8465842102675423
Recall médio - Random Forest com SMOTE: 0.9835077770189485
Acurácia média - Random Forest com SMOTE: 0.9026195006971764
---------------------------------------
F1-score médio - Random Forest com NearMiss: 0.8538504151408031
Precisão média - Random Forest