In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE

# Cargar datos y preprocesamiento
data = pd.read_csv('../parcial1/Clasificacion_banco.csv')

# Convertir variable categórica a dummies
data = pd.get_dummies(data, columns=['purpose'], drop_first=True)

# División de datos en características (X) y objetivo (y)
X = data.drop('not.fully.paid', axis=1)
y = data['not.fully.paid']

# División en conjunto de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Aplicar escalado a las variables numéricas
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Balanceo de clases con SMOTE
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

# Función para entrenamiento y evaluación de modelos
def train_and_evaluate(model, params, X_train, y_train, X_test, y_test):
    grid_search = GridSearchCV(model, params, scoring='f1', cv=5)
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    
    # Matriz de confusión y reporte
    print("Mejor Modelo:", best_model)
    print("Matriz de Confusión:")
    print(confusion_matrix(y_test, y_pred))
    print("\nReporte de Clasificación:")
    print(classification_report(y_test, y_pred, target_names=["Pagadores", "No Pagadores"]))

# Parámetros para Random Forest y XGBoost
params_rf = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20],
    'min_samples_split': [10, 20]
}

params_xgb = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5],
    'learning_rate': [0.01, 0.1]
}

print("Random Forest:")
train_and_evaluate(RandomForestClassifier(random_state=42), params_rf, X_train_balanced, y_train_balanced, X_test, y_test)



Random Forest:
Mejor Modelo: RandomForestClassifier(max_depth=20, min_samples_split=10, n_estimators=200,
                       random_state=42)
Matriz de Confusión:
[[2188  220]
 [ 367   99]]

Reporte de Clasificación:
              precision    recall  f1-score   support

   Pagadores       0.86      0.91      0.88      2408
No Pagadores       0.31      0.21      0.25       466

    accuracy                           0.80      2874
   macro avg       0.58      0.56      0.57      2874
weighted avg       0.77      0.80      0.78      2874



In [3]:
# Función para entrenamiento y evaluación de modelos
def train_and_evaluate(model, params, X_train, y_train, X_test, y_test, model_name):
    grid_search = GridSearchCV(model, params, scoring='f1', cv=5)
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    
    # Imprimir resultados para el modelo actual
    print(f"\nResultados para {model_name}:")
    print("Mejor Modelo:", best_model)
    print("Matriz de Confusión:")
    print(confusion_matrix(y_test, y_pred))
    print("\nReporte de Clasificación:")
    print(classification_report(y_test, y_pred, target_names=["Pagadores", "No Pagadores"]))

# Parámetros para Random Forest y XGBoost
params_rf = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20],
    'min_samples_split': [10, 20]
}

params_xgb = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5],
    'learning_rate': [0.01, 0.1]
}

# Ejecución y evaluación de cada modelo
print("Random Forest:")
train_and_evaluate(RandomForestClassifier(random_state=42), params_rf, X_train_balanced, y_train_balanced, X_test, y_test, model_name="Random Forest")

print("\nXGBoost:")
train_and_evaluate(XGBClassifier(random_state=42, eval_metric='logloss'), params_xgb, X_train_balanced, y_train_balanced, X_test, y_test, model_name="XGBoost")

Random Forest:

Resultados para Random Forest:
Mejor Modelo: RandomForestClassifier(max_depth=20, min_samples_split=10, n_estimators=200,
                       random_state=42)
Matriz de Confusión:
[[2188  220]
 [ 367   99]]

Reporte de Clasificación:
              precision    recall  f1-score   support

   Pagadores       0.86      0.91      0.88      2408
No Pagadores       0.31      0.21      0.25       466

    accuracy                           0.80      2874
   macro avg       0.58      0.56      0.57      2874
weighted avg       0.77      0.80      0.78      2874


XGBoost:


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode


Resultados para XGBoost:
Mejor Modelo: XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='logloss',
              feature_types=None, gamma=None, grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=0.1, max_bin=None, max_cat_threshold=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=5,
              max_leaves=None, min_child_weight=None, missing=nan,
              monotone_constraints=None, multi_strategy=None, n_estimators=200,
              n_jobs=None, num_parallel_tree=None, random_state=42, ...)
Matriz de Confusión:
[[2274  134]
 [ 404   62]]

Reporte de Clasificación:
              precision    recall  f1-score   support

   Pagadores       0.85      0.94      0.89      2408
No Pagadores       0

Parameters: { "use_label_encoder" } are not used.

