In [1]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [3]:
# Read Default.csv (path is relative to the working directory) and preview
# the first five rows as the cell's displayed value.
data = pd.read_csv(filepath_or_buffer="Default.csv")
data.head(n=5)

Unnamed: 0,default,student,balance,income
0,No,No,729.526495,44361.625074
1,No,Yes,817.180407,12106.1347
2,No,No,1073.549164,31767.13895
3,No,No,529.250605,35704.49394
4,No,No,785.655883,38463.49588


In [4]:
# Print column names, non-null counts and dtypes to check for missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   default  10000 non-null  object 
 1   student  10000 non-null  object 
 2   balance  10000 non-null  float64
 3   income   10000 non-null  float64
dtypes: float64(2), object(2)
memory usage: 312.6+ KB


In [5]:
# Keep a fixed-seed random subsample of 500 rows and renumber the index from 0.
# NOTE: `data` is overwritten here, so the full table is no longer reachable
# under this name; re-run the loading cell before re-running this one.
data = (
    data
    .sample(n=500, random_state=42)
    .reset_index(drop=True)
)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   default  500 non-null    object 
 1   student  500 non-null    object 
 2   balance  500 non-null    float64
 3   income   500 non-null    float64
dtypes: float64(2), object(2)
memory usage: 15.8+ KB


## Columnas explicativas y salida

In [6]:
# Predictors used by every model below; "student_bin" is derived on the next line.
features = ["balance", "income", "student_bin"]

# Encode the Yes/No student flag as 1/0 so it can be used as a numeric predictor.
data["student_bin"] = data["student"].eq("Yes").astype(int)

# Design matrix (all columns as float) and binary target (1 = default "Yes").
X = data[features].astype(float)
y = data["default"].eq("Yes").astype(int)


## Train y test 

In [7]:
# Hold out 30% of the rows for testing. Stratifying on y keeps the proportion
# of defaults approximately equal in both partitions; the fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.3,
    random_state=42,
)

## Regresión logística

In [8]:
# Fit a logistic regression on the training partition (fit returns the estimator).
logreg = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Column 1 of predict_proba is the probability of class 1 (default == "Yes").
y_proba_logreg = logreg.predict_proba(X_test)[:, 1]
# Hard labels: predict a default when the estimated probability is at least 0.5.
y_pred_logreg = (y_proba_logreg >= 0.5).astype(int)

# AUC is computed from the probabilities, accuracy from the thresholded labels.
acc_logreg = accuracy_score(y_test, y_pred_logreg)
auc_logreg = roc_auc_score(y_test, y_proba_logreg)

print(
    f"AUC Logistic Regression:      {auc_logreg:.4f}",
    f"Accuracy Logistic Regression: {acc_logreg:.4f}",
    sep="\n",
)

AUC Logistic Regression:      0.9884
Accuracy Logistic Regression: 0.9867


## Bagging 

In [9]:
# Hand-rolled bagging ensemble: each tree is trained on a bootstrap sample of
# the training rows, restricted to a random pair of the available features.
n_trees = 500
n_train = X_train.shape[0]

# Dedicated, seeded generator: without it the bootstrap rows and feature
# subsets (and therefore every metric computed from the ensemble) would
# change on each run of the notebook.
rng = np.random.default_rng(42)

trees = []
selected_features_list = []  # feature pair used by the tree at the same position

for i in range(n_trees):
    # Random subspace: two distinct feature names, kept as a plain list of str
    # so the same columns can be selected again at prediction time.
    cols = rng.choice(features, size=2, replace=False).tolist()
    selected_features_list.append(cols)

    # Bootstrap: n_train row positions drawn with replacement.
    idx = rng.choice(n_train, size=n_train, replace=True)
    Xb = X_train[cols].iloc[idx]
    yb = y_train.iloc[idx]

    # random_state=i seeds each tree's own internal randomness.
    tree = DecisionTreeClassifier(random_state=i)
    tree.fit(Xb, yb)

    trees.append(tree)


## AUC y Accuracy del Bagging

In [10]:
# Score the ensemble: average the per-tree probability of class 1 on the test
# set, then derive AUC (from the probabilities) and accuracy (threshold 0.5).
proba_all = []

for tree, cols in zip(trees, selected_features_list):
    proba = tree.predict_proba(X_test[cols])
    # predict_proba returns one column per class seen during fit, ordered as
    # tree.classes_. A bootstrap sample of a rare-event target may contain a
    # single class, in which case there is no column 1 and a blind `[:, 1]`
    # would raise IndexError. Look the positive column up explicitly instead.
    positive_col = np.flatnonzero(tree.classes_ == 1)
    if positive_col.size:
        p = proba[:, positive_col[0]]
    else:
        # The tree never saw a positive example, so it assigns probability 0.
        p = np.zeros(len(X_test))
    proba_all.append(p)

# Shape (n_trees, n_test); the mean over axis 0 is the ensemble probability.
proba_all = np.array(proba_all)
bagging_prob = proba_all.mean(axis=0)

# Convert probabilities to class labels
y_pred_bagging = (bagging_prob >= 0.5).astype(int)

# Bagging AUC and accuracy
auc_bagging = roc_auc_score(y_test, bagging_prob)
acc_bagging = accuracy_score(y_test, y_pred_bagging)

print(f"\nAUC Bagging (500 árboles):    {auc_bagging:.4f}")
print(f"Accuracy Bagging:             {acc_bagging:.4f}")


AUC Bagging (500 árboles):    0.9792
Accuracy Bagging:             0.9667


## Comparar

In [12]:

# Side-by-side recap of the metrics computed in the two model sections.
print(
    f"AUC Logistic Regression:      {auc_logreg:.4f}",
    f"AUC Bagging:                  {auc_bagging:.4f}",
    f"Accuracy Logistic Regression: {acc_logreg:.4f}",
    f"Accuracy Bagging:             {acc_bagging:.4f}",
    sep="\n",
)

AUC Logistic Regression:      0.9884
AUC Bagging:                  0.9792
Accuracy Logistic Regression: 0.9867
Accuracy Bagging:             0.9667


Después de ver las comparaciones, la Regresión Logística le gana al Bagging en este dataset. Tiene mejores números en todo:

- AUC más alto

- Accuracy más alto

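La Regresión Logística llega a casi 0.99 de AUC (0.9884) y 0.9867 de accuracy, mientras que Bagging se queda un poco atrás (0.9792 de AUC y 0.9667 de accuracy). Para estos datos, la regresión clasifica con mayor exactitud y no hizo falta un modelo más complejo: la regresión logística simple fue la que mejor rindió. Conviene tomar la diferencia con cautela: el conjunto de prueba tiene solo 150 observaciones (30 % de 500), así que la brecha en accuracy equivale a 3 observaciones (148 frente a 145 aciertos).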