In [11]:
"""
Task 2: Model 1 - The Predictive Deep Learning Model

Loads the preprocessed data from Task 1.
Builds, trains, and evaluates a Multi-Layer Perceptron (MLP) using
TensorFlow/Keras to predict the probability of default.

Saves the following files:
- models/mlp_model.keras
- models/mlp_test_pred_probs.npy
"""

'\nTask 2: Model 1 - The Predictive Deep Learning Model\n\nLoads the preprocessed data from Task 1.\nBuilds, trains, and evaluates a Multi-Layer Perceptron (MLP) using\nTensorFlow/Keras to predict the probability of default.\n\nSaves the following files:\n- models/mlp_model.keras\n- models/mlp_test_pred_probs.npy\n'

In [12]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.metrics import roc_auc_score, f1_score, classification_report
from sklearn.utils import class_weight
import matplotlib.pyplot as plt
import joblib
import os

# --- Configuration ---
RANDOM_SEED = 42      # single seed used for every random number generator
DATA_DIR = 'data'     # folder the preprocessed *.pkl splits are read from
MODEL_DIR = 'models'  # folder the trained model and predictions are written to

# Set Random Seeds for Reproducibility
# keras.utils.set_random_seed seeds Python's `random`, NumPy and TensorFlow in
# one call. Seeding only NumPy and TensorFlow leaves Python's `random` module
# unseeded, which can make weight initialisation / dropout differ between runs.
keras.utils.set_random_seed(RANDOM_SEED)

In [13]:
# --- Helper Function ---
def build_model(input_shape, hidden_units=(64, 32), dropout_rate=0.3):
    """Build the (uncompiled) Keras MLP used to predict default probability.

    The network is a stack of ``Dense(relu) -> Dropout`` pairs, one pair per
    entry in ``hidden_units``, followed by a single sigmoid output unit.
    With the default arguments the architecture is
    ``Dense(64) -> Dropout(0.3) -> Dense(32) -> Dropout(0.3) -> Dense(1)``.

    Args:
        input_shape: Number of input features (an int); the model's input
            shape is ``(input_shape,)``.
        hidden_units: Widths of the hidden Dense layers, in order.
        dropout_rate: Dropout rate applied after every hidden layer.

    Returns:
        A ``keras.Sequential`` model. It is not compiled; the caller chooses
        the optimizer, loss and metrics.
    """
    stack = [layers.Input(shape=(input_shape,))]
    for units in hidden_units:
        stack.append(layers.Dense(units, activation='relu'))
        stack.append(layers.Dropout(dropout_rate))
    # Single sigmoid unit: the output lies in (0, 1) and is read as a probability.
    stack.append(layers.Dense(1, activation='sigmoid'))
    return keras.Sequential(stack)

In [15]:
def main(epochs=20, batch_size=128, validation_split=0.2, threshold=0.5):
    """Train, evaluate and persist the MLP default-prediction model.

    Loads the preprocessed train/test splits from ``DATA_DIR``, fits the
    network returned by ``build_model`` using balanced class weights, prints
    AUC, F1 and a classification report for the test set, and writes the
    trained model and the test-set probabilities into ``MODEL_DIR``.

    Args:
        epochs: Number of training epochs passed to ``model.fit``.
        batch_size: Mini-batch size passed to ``model.fit``.
        validation_split: Fraction of the training data Keras holds out for
            validation during fitting.
        threshold: Probability cut-off; predictions strictly above it are
            labelled as class 1.

    Returns:
        The Keras ``History`` object returned by ``model.fit``, or ``None``
        when the preprocessed data files are missing.
    """

    # Create model directory if it doesn't exist
    if not os.path.exists(MODEL_DIR):
        # exist_ok guards against the directory appearing between the check
        # above and this call.
        os.makedirs(MODEL_DIR, exist_ok=True)
        print(f"Created directory: {MODEL_DIR}")

    # --- 2. Load Preprocessed Data ---
    print(f"Loading data from '{DATA_DIR}'...")
    # NOTE(security): joblib.load unpickles its input, which can execute
    # arbitrary code. Only load files produced by a trusted pipeline.
    try:
        X_train_final = joblib.load(os.path.join(DATA_DIR, 'X_train_final.pkl'))
        y_train = joblib.load(os.path.join(DATA_DIR, 'y_train.pkl'))
        X_test_final = joblib.load(os.path.join(DATA_DIR, 'X_test_final.pkl'))
        y_test = joblib.load(os.path.join(DATA_DIR, 'y_test.pkl'))
    except FileNotFoundError as exc:
        print("Error: Processed data files not found.")
        # Name the file that is missing so the problem can be fixed directly.
        print(f"Missing file: {exc.filename}")
        return None

    # --- Prepare Data for TensorFlow ---
    X_train_np = X_train_final.to_numpy()
    X_test_np = X_test_final.to_numpy()
    # ravel() keeps the labels 1-D even if they were stored as a single-column
    # frame; it is a no-op for labels that are already 1-D.
    y_train_np = y_train.to_numpy().ravel()
    y_test_np = y_test.to_numpy().ravel()
    n_features = X_train_np.shape[1]
    print(f"Data converted to NumPy. Training with {n_features} features.")

    # --- Handle Class Imbalance ---
    print("Calculating class weights...")
    classes = np.unique(y_train_np)
    weights = class_weight.compute_class_weight('balanced', classes=classes, y=y_train_np)
    # Key the weights by the labels actually present instead of assuming the
    # classes are exactly 0 and 1; plain int/float keep the printout readable.
    class_weights = {int(label): float(weight) for label, weight in zip(classes, weights)}
    print(f"Calculated weights: {class_weights}")

    # --- Build and Compile MLP Model ---
    print("Building and compiling Keras MLP model...")
    model = build_model(n_features)
    model.summary()

    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=[tf.keras.metrics.AUC(name='auc')]
    )

    # --- Train Model ---
    print("Training MLP model...")
    # NOTE: per the Keras docs, validation_split holds out the *last* fraction
    # of the arrays (taken before shuffling), so the validation set is only
    # representative if the rows are not ordered.
    history = model.fit(
        X_train_np,
        y_train_np,
        epochs=epochs,
        batch_size=batch_size,
        validation_split=validation_split,
        class_weight=class_weights,
        verbose=1
    )
    print("MLP Training complete.")

    # --- Evaluate MLP Model ---
    print("\nEvaluating MLP model on the test set...")
    # predict() returns one column per output unit; flatten to a 1-D vector.
    y_pred_probs_mlp = model.predict(X_test_np).flatten()
    y_pred_classes_mlp = (y_pred_probs_mlp > threshold).astype(int)

    # AUC is computed from the raw probabilities, F1 from the thresholded labels.
    auc_mlp = roc_auc_score(y_test_np, y_pred_probs_mlp)
    f1_mlp = f1_score(y_test_np, y_pred_classes_mlp)

    print("\n--- MLP Evaluation Results ---")
    print(f"Test Set AUC: {auc_mlp:.4f}")
    print(f"Test Set F1-Score (threshold {threshold}): {f1_mlp:.4f}")
    print(f"\nClassification Report (threshold {threshold}):")
    print(classification_report(y_test_np, y_pred_classes_mlp, target_names=['Fully Paid (0)', 'Defaulted (1)']))

    # --- Save Model and Predictions ---
    model_path = os.path.join(MODEL_DIR, 'mlp_model.keras')
    preds_path = os.path.join(MODEL_DIR, 'mlp_test_pred_probs.npy')

    model.save(model_path)
    np.save(preds_path, y_pred_probs_mlp)
    print(f"MLP model saved to {model_path}")
    print(f"MLP test predictions saved to {preds_path}")

    print("--- Task 2: MLP Training Complete ---")
    # Hand the training curves back so a later cell can inspect or plot them.
    return history

# Entry point: run the whole pipeline when this code executes as the main
# module. In this notebook the guard is true, so running the cell trains the
# model and produces the output shown below it.
if __name__ == "__main__":
    main()

Loading data from 'data'...
Data converted to NumPy. Training with 132 features.
Calculating class weights...
Calculated weights: {0: np.float64(0.6021198205894916), 1: np.float64(2.9481045751633985)}
Building and compiling Keras MLP model...


Training MLP model...
Epoch 1/20
[1m2820/2820[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 4ms/step - auc: 0.6777 - loss: 0.6469 - val_auc: 0.6793 - val_loss: 0.6278
Epoch 2/20
[1m2820/2820[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 4ms/step - auc: 0.7010 - loss: 0.6328 - val_auc: 0.6824 - val_loss: 0.6352
Epoch 3/20
[1m2820/2820[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 4ms/step - auc: 0.7040 - loss: 0.6305 - val_auc: 0.6832 - val_loss: 0.6268
Epoch 4/20
[1m2820/2820[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 4ms/step - auc: 0.7057 - loss: 0.6295 - val_auc: 0.6839 - val_loss: 0.6197
Epoch 5/20
[1m2820/2820[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 4ms/step - auc: 0.7066 - loss: 0.6283 - val_auc: 0.6841 - val_loss: 0.6182
Epoch 6/20
[1m2820/2820[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 4ms/step - auc: 0.7088 - loss: 0.6269 - val_auc: 0.6842 - val_loss: 0.6220
Epoch 7/20
[1m2820/2820[0m [32m━━━━━━━━━━━━━━