In [1]:
import torch

In [2]:
# === Task 2, Cell 1: Imports & Load Processed Data ===
# We are in notebooks/02_supervised.ipynb

import pandas as pd
import numpy as np

# PyTorch - our deep learning library
import torch
import torch.nn as nn  # nn = Neural Network
import torch.optim as optim # optim = Optimizers (like Adam)
from torch.utils.data import Dataset, DataLoader

# Scikit-learn - for metrics and utilities
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report

print("All libraries imported successfully.")

# --- Read the processed splits ---
# Both parquet files are the ones produced in Task 1; paths are relative to
# the notebooks/ directory this notebook lives in.
train_path, test_path = (
    '../data/processed/train.parquet',
    '../data/processed/test.parquet',
)
train_data, test_data = pd.read_parquet(train_path), pd.read_parquet(test_path)

# Split every frame into the feature matrix (X) and the 'target' label column (y).
X_train, y_train = train_data.drop(columns=['target']), train_data['target']
X_test, y_test = test_data.drop(columns=['target']), test_data['target']

# Report the shapes of both splits as a quick sanity check.
print(f"Training data loaded: X shape {X_train.shape}, y shape {y_train.shape}")
print(f"Test data loaded:     X shape {X_test.shape}, y shape {y_test.shape}")

# Preview the first rows of the training features.
print("\nFirst 5 rows of scaled training features:")
display(X_train.head())

All libraries imported successfully.
Training data loaded: X shape (312934, 12), y shape (312934,)
Test data loaded:     X shape (78234, 12), y shape (78234,)

First 5 rows of scaled training features:


Unnamed: 0,loan_amnt,term,int_rate,annual_inc,dti,emp_length,pub_rec,revol_util,total_acc,open_acc,mort_acc,fico_range_low
41094,0.737034,1.744741,0.04016,0.31369,-0.970261,1.117687,-0.365734,-1.578159,0.791257,0.736024,0.169932,0.028159
91912,-1.078352,-0.573151,-0.02852,-0.50855,1.983979,1.117687,-0.365734,-0.549676,-0.112613,0.558369,-0.332658,0.676154
201663,0.684667,1.744741,1.33593,-0.182306,1.805061,1.117687,-0.365734,0.495395,0.051727,-0.152254,-0.835247,-0.295838
491903,-0.025195,-0.573151,1.292433,-0.150478,1.575626,-1.660889,-0.365734,-0.375498,-0.030443,0.203057,0.672522,0.676154
154509,1.784372,1.744741,1.734276,0.101499,0.048515,-0.271601,-0.365734,-0.354762,0.380407,1.091335,-0.835247,-0.619836


In [3]:
# === Task 2, Cell 2: Create a PyTorch Dataset ===

# pandas objects cannot be fed to the network directly, so every split is
# copied into a torch Tensor (PyTorch's n-dimensional array type).
# float32 is used throughout for both features and targets.

def _as_float32_tensor(frame):
    """Return the values of a pandas DataFrame/Series as a float32 tensor."""
    return torch.tensor(frame.values, dtype=torch.float32)


X_train_tensor, y_train_tensor = _as_float32_tensor(X_train), _as_float32_tensor(y_train)

X_test_tensor, y_test_tensor = _as_float32_tensor(X_test), _as_float32_tensor(y_test)

print("--- Data Converted to Tensors ---")
print(f"X_train_tensor shape: {X_train_tensor.shape}, dtype: {X_train_tensor.dtype}")
print(f"y_train_tensor shape: {y_train_tensor.shape}, dtype: {y_train_tensor.dtype}")


# --- Create a custom Dataset class ---
# This is a standard PyTorch pattern.
# It tells PyTorch how to get one item ( __getitem__ ) and
# how many items there are ( __len__ ).

class LoanDataset(Dataset):
    """Map-style dataset that pairs one feature row with its target.

    Args:
        features: Indexable collection of per-sample feature rows. The
            notebook passes a 2-D float32 tensor.
        targets: Indexable collection of per-sample labels whose items
            support ``unsqueeze``. The notebook passes a 1-D float32 tensor.

    Raises:
        ValueError: If ``features`` and ``targets`` do not contain the same
            number of samples.
    """

    def __init__(self, features, targets):
        # Fail fast: __len__ is driven by `features`, so a shorter `targets`
        # would otherwise only surface as an IndexError in the middle of an
        # epoch (and a longer one would be silently truncated).
        if len(features) != len(targets):
            raise ValueError(
                f"features and targets must have the same length, "
                f"got {len(features)} and {len(targets)}"
            )
        self.features = features
        self.targets = targets

    def __len__(self):
        """Return the number of samples (rows of ``features``)."""
        return len(self.features)

    def __getitem__(self, idx):
        """Return ``(features[idx], targets[idx])`` with the target unsqueezed.

        The target gets a leading dimension of size 1 so that, once batched,
        it lines up with a model output that has one column per sample.
        """
        x = self.features[idx]
        y = self.targets[idx]
        return x, y.unsqueeze(0)

# Wrap each (features, targets) tensor pair in a LoanDataset so that
# individual rows can be looked up by index.
train_dataset, test_dataset = (
    LoanDataset(X_train_tensor, y_train_tensor),
    LoanDataset(X_test_tensor, y_test_tensor),
)

print("\n--- Datasets Created ---")
print(f"Length of train_dataset: {len(train_dataset)}")
print(f"Length of test_dataset:  {len(test_dataset)}")

# --- Batch iterators ---
# A DataLoader serves its dataset in chunks of BATCH_SIZE rows, i.e. the
# model sees up to 1024 loans per step.
BATCH_SIZE = 1024

# Training rows are reshuffled at the start of every epoch.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

# Test rows keep their original order; shuffling is not needed for evaluation.
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

print(f"\n--- DataLoaders Created ---")
print(f"Number of batches in train_loader: {len(train_loader)}")
print(f"Number of batches in test_loader:  {len(test_loader)}")

--- Data Converted to Tensors ---
X_train_tensor shape: torch.Size([312934, 12]), dtype: torch.float32
y_train_tensor shape: torch.Size([312934]), dtype: torch.float32

--- Datasets Created ---
Length of train_dataset: 312934
Length of test_dataset:  78234

--- DataLoaders Created ---
Number of batches in train_loader: 306
Number of batches in test_loader:  77


In [4]:
# === Task 2, Cell 3: Define the Neural Network ===

# Width of the input layer: one unit per feature column of X_train.
_, INPUT_FEATURES = X_train.shape
# Width of the output layer: a single value per loan.
OUTPUT_FEATURES = 1

class MLP(nn.Module):
    """Feed-forward classifier: ``[Linear -> ReLU -> Dropout] * k -> Linear -> Sigmoid``.

    Args:
        input_dim: Number of input features per sample.
        output_dim: Number of output units.
        hidden_dims: Width of each hidden layer, in order. The default
            ``(64, 32)`` gives two hidden layers of 64 and 32 units. An
            empty sequence yields a single Linear + Sigmoid.
        dropout: Dropout probability applied after every hidden activation.
            Defaults to ``0.3``.

    The final Sigmoid squashes every output into (0, 1), so the model emits
    probabilities rather than logits.
    """

    def __init__(self, input_dim, output_dim, hidden_dims=(64, 32), dropout=0.3):
        super().__init__()

        # Build the stack in order so the sub-module indices inside
        # `self.layers` (0: Linear, 1: ReLU, 2: Dropout, ...) stay stable for
        # the default configuration.
        modules = []
        in_width = input_dim
        for out_width in hidden_dims:
            modules.append(nn.Linear(in_width, out_width))
            modules.append(nn.ReLU())
            modules.append(nn.Dropout(dropout))
            in_width = out_width

        # Output head: project to `output_dim` and map to (0, 1).
        modules.append(nn.Linear(in_width, output_dim))
        modules.append(nn.Sigmoid())

        self.layers = nn.Sequential(*modules)

    def forward(self, x):
        """Pass ``x`` through every layer in sequence and return the result."""
        return self.layers(x)

# --- Instantiate the network ---
model = MLP(input_dim=INPUT_FEATURES, output_dim=OUTPUT_FEATURES)

# Show the layer stack that was just built.
print("--- Model Architecture Created ---")
print(model)

# --- Loss and optimizer ---

# Binary cross-entropy scores probabilities against 0/1 labels; it pairs
# with the Sigmoid at the end of the model.
criterion = nn.BCELoss()

# Adam updates all model parameters; the learning rate (step size) is 1e-3.
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

print("\n--- Loss Function and Optimizer Defined ---")
print(f"Loss Function: {criterion}")
print(f"Optimizer: {optimizer.__class__.__name__}")

--- Model Architecture Created ---
MLP(
  (layers): Sequential(
    (0): Linear(in_features=12, out_features=64, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.3, inplace=False)
    (3): Linear(in_features=64, out_features=32, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.3, inplace=False)
    (6): Linear(in_features=32, out_features=1, bias=True)
    (7): Sigmoid()
  )
)

--- Loss Function and Optimizer Defined ---
Loss Function: BCELoss()
Optimizer: Adam


In [5]:
# === Task 2, Cell 4: Define the Training Function ===
# This function will run one full pass over the training data

def train_epoch(model, data_loader, criterion, optimizer):
    """Run one optimisation pass over ``data_loader`` and report training metrics.

    Args:
        model: Network to train; it is switched to training mode.
        data_loader: Iterable yielding ``(features, targets)`` tensor batches.
        criterion: Loss callable invoked as ``criterion(predictions, targets)``.
            It is assumed to return the *mean* loss of the batch (the default
            reduction of ``nn.BCELoss``); the epoch loss is weighted on that
            assumption.
        optimizer: Optimizer that owns the model parameters.

    Returns:
        Tuple ``(avg_loss, accuracy, f1)`` where ``avg_loss`` is the
        per-sample mean loss over the epoch and the two scores are computed
        on predictions rounded to 0/1.

    Raises:
        ValueError: If ``data_loader`` yields no samples.
    """
    # Training mode keeps layers such as Dropout active.
    model.train()

    loss_sum = 0.0        # running sum of per-sample losses
    n_samples = 0
    target_batches = []   # one 1-D numpy array per batch
    pred_batches = []

    for features, targets in data_loader:
        # Gradients accumulate across backward() calls, so reset them first.
        optimizer.zero_grad()

        # Forward pass, loss, backward pass, parameter update.
        predictions = model(features)
        loss = criterion(predictions, targets)
        loss.backward()
        optimizer.step()

        # Weight the batch-mean loss by the batch size so that a short final
        # batch does not count as much as a full one.
        batch_size = targets.size(0)
        loss_sum += loss.item() * batch_size
        n_samples += batch_size

        # Round probabilities to hard 0/1 predictions. detach() drops the
        # autograd graph, cpu() makes the numpy conversion work on any
        # device, ravel() gives the metrics flat 1-D label arrays.
        pred_batches.append(torch.round(predictions).detach().cpu().numpy().ravel())
        target_batches.append(targets.detach().cpu().numpy().ravel())

    if n_samples == 0:
        raise ValueError("data_loader yielded no samples; cannot compute epoch metrics")

    # --- Metrics over the whole epoch ---
    avg_loss = loss_sum / n_samples
    all_targets = np.concatenate(target_batches)
    all_preds = np.concatenate(pred_batches)
    accuracy = accuracy_score(all_targets, all_preds)
    f1 = f1_score(all_targets, all_preds)

    return avg_loss, accuracy, f1

In [6]:
# === Task 2, Cell 5: Define the Evaluation Function ===
# This function will run one full pass over the *test* data

def evaluate_epoch(model, data_loader, criterion):
    """Score ``model`` on every batch of ``data_loader`` without updating it.

    Args:
        model: Network to evaluate; it is switched to evaluation mode.
        data_loader: Iterable yielding ``(features, targets)`` tensor batches.
        criterion: Loss callable invoked as ``criterion(predictions, targets)``.
            It is assumed to return the *mean* loss of the batch (the default
            reduction of ``nn.BCELoss``); the epoch loss is weighted on that
            assumption.

    Returns:
        Tuple ``(avg_loss, accuracy, f1, auc)``. ``avg_loss`` is the
        per-sample mean loss, ``accuracy`` and ``f1`` use predictions rounded
        to 0/1, and ``auc`` uses the raw probabilities. If ``roc_auc_score``
        raises ``ValueError`` (e.g. only one class is present in the
        targets), ``auc`` falls back to 0.5 and a ``RuntimeWarning`` is
        emitted.

    Raises:
        ValueError: If ``data_loader`` yields no samples.
    """
    import warnings  # local import: only needed for the AUC fallback below

    # Evaluation mode switches layers such as Dropout off.
    model.eval()

    loss_sum = 0.0        # running sum of per-sample losses
    n_samples = 0
    target_batches = []   # one 1-D numpy array per batch
    prob_batches = []     # raw probabilities, kept un-rounded for AUC

    # No gradients are needed for scoring; this saves memory and compute.
    with torch.no_grad():
        for features, targets in data_loader:
            predictions = model(features)
            loss = criterion(predictions, targets)

            # Weight the batch-mean loss by the batch size so that a short
            # final batch does not count as much as a full one.
            batch_size = targets.size(0)
            loss_sum += loss.item() * batch_size
            n_samples += batch_size

            # cpu() makes the numpy conversion work on any device; ravel()
            # gives the metrics flat 1-D arrays.
            prob_batches.append(predictions.cpu().numpy().ravel())
            target_batches.append(targets.cpu().numpy().ravel())

    if n_samples == 0:
        raise ValueError("data_loader yielded no samples; cannot compute epoch metrics")

    # --- Metrics over the whole pass ---
    avg_loss = loss_sum / n_samples
    all_targets = np.concatenate(target_batches)
    all_preds_probs = np.concatenate(prob_batches)

    # Hard 0/1 predictions for the threshold-based metrics.
    all_preds_binary = np.round(all_preds_probs)
    accuracy = accuracy_score(all_targets, all_preds_binary)
    f1 = f1_score(all_targets, all_preds_binary)

    # AUC is computed on the raw probabilities, not the rounded predictions.
    try:
        auc = roc_auc_score(all_targets, all_preds_probs)
    except ValueError as exc:
        # Keep the best-effort fallback, but do not hide that the reported
        # value is a placeholder rather than a measured score.
        warnings.warn(
            f"ROC AUC could not be computed ({exc}); reporting 0.5 instead",
            RuntimeWarning,
        )
        auc = 0.5

    return avg_loss, accuracy, f1, auc

In [7]:
# === Task 2, Cell 6: Run the Training Loop ===
import time

# Number of complete passes over train_loader.
NUM_EPOCHS = 5

print("--- Starting Model Training ---")

for epoch in range(1, NUM_EPOCHS + 1):
    start_time = time.time()

    # One optimisation pass over the training batches ...
    train_metrics = train_epoch(model, train_loader, criterion, optimizer)
    # ... followed by a scoring pass over the test batches (logged as "Val.").
    val_metrics = evaluate_epoch(model, test_loader, criterion)

    end_time = time.time()
    # Wall-clock duration of train + evaluate, converted from seconds to minutes.
    epoch_mins = (end_time - start_time) / 60

    train_loss, train_acc, train_f1 = train_metrics
    val_loss, val_acc, val_f1, val_auc = val_metrics

    # Per-epoch summary.
    print(f"\nEpoch: {epoch:02}/{NUM_EPOCHS}")
    print(f"\tTime: {epoch_mins:.2f} minutes")
    print(f"\tTrain Loss: {train_loss:.4f} | Train Acc: {train_acc*100:.2f}% | Train F1: {train_f1:.4f}")
    print(f"\tVal. Loss:  {val_loss:.4f} | Val. Acc:  {val_acc*100:.2f}% | Val. F1:  {val_f1:.4f}")
    print(f"\t*** Val. AUC: {val_auc:.4f} ***")

print("\n--- Training Complete ---")

--- Starting Model Training ---

Epoch: 01/5
	Time: 0.14 minutes
	Train Loss: 0.4740 | Train Acc: 79.87% | Train F1: 0.1006
	Val. Loss:  0.4471 | Val. Acc:  80.24% | Val. F1:  0.1658
	*** Val. AUC: 0.7297 ***

Epoch: 02/5
	Time: 0.15 minutes
	Train Loss: 0.4544 | Train Acc: 80.22% | Train F1: 0.1685
	Val. Loss:  0.4458 | Val. Acc:  80.37% | Val. F1:  0.1773
	*** Val. AUC: 0.7317 ***

Epoch: 03/5
	Time: 0.14 minutes
	Train Loss: 0.4526 | Train Acc: 80.24% | Train F1: 0.1744
	Val. Loss:  0.4456 | Val. Acc:  80.37% | Val. F1:  0.1935
	*** Val. AUC: 0.7321 ***

Epoch: 04/5
	Time: 0.14 minutes
	Train Loss: 0.4520 | Train Acc: 80.23% | Train F1: 0.1776
	Val. Loss:  0.4456 | Val. Acc:  80.37% | Val. F1:  0.1825
	*** Val. AUC: 0.7327 ***

Epoch: 05/5
	Time: 0.13 minutes
	Train Loss: 0.4513 | Train Acc: 80.25% | Train F1: 0.1779
	Val. Loss:  0.4452 | Val. Acc:  80.33% | Val. F1:  0.2348
	*** Val. AUC: 0.7329 ***

--- Training Complete ---


In [8]:
# === Task 2, Cell 7: Final Evaluation & Report ===

print("--- Running Final Evaluation on Test Set ---")

# Score the trained model on the test loader one last time.
final_loss, final_acc, final_f1, final_auc = evaluate_epoch(
    model, test_loader, criterion
)

print("\n--- Deep Learning Model Metrics (Test Set) ---")
print(f"     Accuracy: {final_acc*100:.2f}%")
print(f"     F1-Score: {final_f1:.4f}")
print(f"ROC AUC Score: {final_auc:.4f}")

# --- Generate a detailed Classification Report ---
# evaluate_epoch only returns summary scores, so the per-sample predictions
# and targets are collected here with a second pass over the test loader.

model.eval()            # switch layers such as Dropout off
target_batches = []     # one 1-D numpy array per batch
prob_batches = []

with torch.no_grad():   # no gradients needed for inference
    for features, targets in test_loader:
        predictions = model(features)
        # cpu() makes the numpy conversion work on any device; ravel() gives
        # flat 1-D arrays, the label layout the report works with.
        prob_batches.append(predictions.cpu().numpy().ravel())
        target_batches.append(targets.cpu().numpy().ravel())

# Join the batches once instead of growing a list one sample at a time.
all_targets = np.concatenate(target_batches)
all_preds_probs = np.concatenate(prob_batches)

# Convert probabilities to binary 0/1 predictions
all_preds_binary = np.round(all_preds_probs)

print("\n--- Classification Report ---")
# This report shows precision, recall, and f1-score for both classes (0 and 1)
report = classification_report(all_targets, all_preds_binary, target_names=['Class 0 (Paid)', 'Class 1 (Default)'])
print(report)

--- Running Final Evaluation on Test Set ---

--- Deep Learning Model Metrics (Test Set) ---
     Accuracy: 80.33%
     F1-Score: 0.2348
ROC AUC Score: 0.7329

--- Classification Report ---
                   precision    recall  f1-score   support

   Class 0 (Paid)       0.82      0.97      0.89     62468
Class 1 (Default)       0.54      0.15      0.23     15766

         accuracy                           0.80     78234
        macro avg       0.68      0.56      0.56     78234
     weighted avg       0.76      0.80      0.76     78234

