In [8]:
# Question 10: You are working as a data scientist at a financial institution to predict loan default. You have access to customer demographic and transaction history data. You decide to use ensemble techniques to increase model performance.
# Explain your step-by-step approach to:
# ● Choose between Bagging or Boosting
# ● Handle overfitting
# ● Select base models
# ● Evaluate performance using cross-validation
# ● Justify how ensemble learning improves decision-making in this real-world context.
# (Include your Python code and output in the code box below.)


import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score
import lightgbm as lgb

# Synthetic dataset simulating loan default (imbalanced)
X, y = make_classification(
    n_samples=5000,
    n_features=20,
    n_informative=10,
    n_redundant=5,
    n_clusters_per_class=2,
    weights=[0.85, 0.15],  # Imbalanced: 15% defaults
    random_state=42
)

# Prepare Stratified K-Fold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Store metrics
auc_scores = []
precision_scores = []
recall_scores = []
f1_scores = []

for train_idx, val_idx in skf.split(X, y):
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    # Create LightGBM dataset
    train_data = lgb.Dataset(X_train, label=y_train)
    val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

    # Parameters for LightGBM (Boosting)
    params = {
        'objective': 'binary',
        'metric': 'auc',
        'boosting_type': 'gbdt',
        'learning_rate': 0.05,
        'num_leaves': 31,
        'max_depth': 6,
        'verbose': -1,
        'seed': 42,
        'is_unbalance': True,  # Handle class imbalance
    }

    # Train with early stopping
    model = lgb.train(
        params,
        train_data,
        num_boost_round=500,
        valid_sets=[val_data],
        callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)]
    )

    # Predict probabilities
    y_pred_proba = model.predict(X_val, num_iteration=model.best_iteration)
    y_pred = (y_pred_proba > 0.5).astype(int)

    # Evaluate
    auc_scores.append(roc_auc_score(y_val, y_pred_proba))
    precision_scores.append(precision_score(y_val, y_pred))
    recall_scores.append(recall_score(y_val, y_pred))
    f1_scores.append(f1_score(y_val, y_pred))

# Print average scores across folds
print(f"Average ROC AUC: {np.mean(auc_scores):.4f}")
print(f"Average Precision: {np.mean(precision_scores):.4f}")
print(f"Average Recall: {np.mean(recall_scores):.4f}")
print(f"Average F1-Score: {np.mean(f1_scores):.4f}")

Average ROC AUC: 0.9708
Average Precision: 0.9077
Average Recall: 0.8474
Average F1-Score: 0.8762
