In [17]:
!pip install torch tqdm



In [20]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, precision_recall_curve
from sklearn.model_selection import train_test_split
from google.colab import drive
import os
import warnings
from tqdm import tqdm
import copy # Needed to save the best model state

# --- 0. Setup ---
warnings.filterwarnings('ignore')
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {DEVICE}")

# --- 1a. Mount Google Drive ---
print("Mounting Google Drive...")
try:
    drive.mount('/content/drive', force_remount=True)
    print("Google Drive mounted successfully.")
except Exception as e:
    print(f"Error mounting Google Drive: {e}")
    exit()

# --- 1b. Constants & Paths ---
DRIVE_MOUNT_POINT = '/content/drive/MyDrive/'
PROJECT_DIR = os.path.join(DRIVE_MOUNT_POINT, 'shodhAI')
TRAIN_DATA_PATH = os.path.join(PROJECT_DIR, 'processed_data_train.npz')
TEST_DATA_PATH = os.path.join(PROJECT_DIR, 'processed_data_test.npz')
MODEL_SAVE_PATH = os.path.join(PROJECT_DIR, 'dl_model_weights.pth')
# Increase epochs to allow for better convergence
EPOCHS = 30
print(f"Training epochs set to: {EPOCHS}")

# --- 1c. Hyperparameter Tuning Grid ---
# Expand the grid to try deeper networks and slower learning rates
HYPERPARAM_GRID = [
    {
        'lr': 0.0005, 'batch_size': 2048,
        'hidden_1': 256, 'hidden_2': 128, 'hidden_3': 0, 'dropout': 0.4
    },
    {
        'lr': 0.001, 'batch_size': 1024,
        'hidden_1': 512, 'hidden_2': 256, 'hidden_3': 128, 'dropout': 0.5
    },
    {
        'lr': 0.0001, 'batch_size': 2048,
        'hidden_1': 256, 'hidden_2': 128, 'hidden_3': 0, 'dropout': 0.4
    },
]
print(f"Starting hyperparameter search over {len(HYPERPARAM_GRID)} combinations...")

# --- 2. Load Processed Data ---
print("Loading processed data...")
try:
    with np.load(TRAIN_DATA_PATH) as data:
        X_train_full = data['X']
        y_train_full = data['y']

    with np.load(TEST_DATA_PATH) as data:
        X_test = data['X']
        y_test = data['y']

    # Set the input dimension for the model
    INPUT_DIM = X_train_full.shape[1]
    print(f"Model Input Dimension set to: {INPUT_DIM}")

    # --- 2b. Create Train/Validation Split ---
    X_train_tune, X_val, y_train_tune, y_val = train_test_split(
        X_train_full, y_train_full,
        test_size=0.2, # 20% for validation
        random_state=42,
        stratify=y_train_full
    )

    print(f"Full training data shape: {X_train_full.shape}")
    print(f"  > New tuning-train set shape: {X_train_tune.shape}")
    print(f"  > New validation set shape: {X_val.shape}")
    print(f"Test data shape: {X_test.shape}")

    # --- 2c. Calculate Class Weights ---
    neg_count = np.sum(y_train_tune == 0)
    pos_count = np.sum(y_train_tune == 1)
    pos_weight = neg_count / pos_count
    pos_weight_tensor = torch.tensor([pos_weight], dtype=torch.float32).to(DEVICE)

    print(f"Calculated pos_weight for 'Default' class: {pos_weight:.2f}")

except FileNotFoundError:
    print(f"Error: Data files not found in {PROJECT_DIR}")
    exit()
except Exception as e:
    print(f"An error occurred loading data: {e}")
    exit()

# --- 3. Define the DL Model (MLP) ---
# Update model to dynamically create 2 or 3 layers
class LoanDefaultClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim_1, hidden_dim_2, hidden_dim_3, output_dim, dropout_rate):
        super(LoanDefaultClassifier, self).__init__()
        layers = []
        layers.append(nn.Linear(input_dim, hidden_dim_1))
        layers.append(nn.ReLU())
        layers.append(nn.Dropout(dropout_rate))

        layers.append(nn.Linear(hidden_dim_1, hidden_dim_2))
        layers.append(nn.ReLU())
        layers.append(nn.Dropout(dropout_rate))

        # Add a third hidden layer only if hidden_dim_3 is greater than 0
        if hidden_dim_3 > 0:
            layers.append(nn.Linear(hidden_dim_2, hidden_dim_3))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(dropout_rate))
            # The final layer's input dim is now hidden_dim_3
            layers.append(nn.Linear(hidden_dim_3, output_dim))
        else:
            # The final layer's input dim is hidden_dim_2
            layers.append(nn.Linear(hidden_dim_2, output_dim))

        self.network = nn.Sequential(*layers)

    def forward(self, x):
        return self.network(x)

# --- 4. Hyperparameter Tuning Loop ---
best_val_auc = -1
best_model_state = None
best_params = None

# Create test dataloader
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).view(-1, 1)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
test_loader = DataLoader(dataset=test_dataset, batch_size=4096, shuffle=False)

# Create validation dataloader
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val, dtype=torch.float32).view(-1, 1)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
val_loader = DataLoader(dataset=val_dataset, batch_size=4096, shuffle=False)

# Create training dataloader
X_train_tune_tensor = torch.tensor(X_train_tune, dtype=torch.float32)
y_train_tune_tensor = torch.tensor(y_train_tune, dtype=torch.float32).view(-1, 1)
train_tune_dataset = TensorDataset(X_train_tune_tensor, y_train_tune_tensor)

for i, params in enumerate(HYPERPARAM_GRID):
    print(f"\n--- Tuning Run {i+1}/{len(HYPERPARAM_GRID)} ---")
    print(f"Params: {params}")

    # --- 4a. Setup model for this run ---
    model = LoanDefaultClassifier(
        INPUT_DIM,
        params['hidden_1'],
        params['hidden_2'],
        params['hidden_3'],
        1,
        params['dropout']
    ).to(DEVICE)

    criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight_tensor)
    optimizer = optim.Adam(model.parameters(), lr=params['lr'])

    train_loader = DataLoader(
        dataset=train_tune_dataset,
        batch_size=params['batch_size'],
        shuffle=True
    )

    # --- 4b. Training Loop ---
    for epoch in range(EPOCHS):
        model.train()
        total_loss = 0

        # Use tqdm for a progress bar on the inner loop
        for inputs, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}", leave=False):
            inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        # Print loss just for the last epoch
        if epoch == EPOCHS - 1:
            avg_loss = total_loss / len(train_loader)
            print(f"  Epoch [{epoch+1}/{EPOCHS}], Avg Training Loss: {avg_loss:.4f}")

    # --- 4c. Validation Loop ---
    model.eval()
    val_preds_probs = []
    val_labels_list = []
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)
            outputs = model(inputs)
            probs = torch.sigmoid(outputs)
            val_preds_probs.extend(probs.cpu().numpy())
            val_labels_list.extend(labels.cpu().numpy())

    val_preds_probs = np.array(val_preds_probs).flatten()
    val_labels = np.array(val_labels_list).flatten()

    try:
        val_auc = roc_auc_score(val_labels, val_preds_probs)
        print(f"  Validation AUC: {val_auc:.4f}")

        if val_auc > best_val_auc:
            best_val_auc = val_auc
            best_params = params
            best_model_state = copy.deepcopy(model.state_dict())
            print(f"  *** New Best Model Found (AUC: {best_val_auc:.4f}) ***")

    except ValueError:
        print("  Error calculating validation AUC.")

print("\n--- Hyperparameter Tuning Complete ---")
print(f"Best Validation AUC: {best_val_auc:.4f}")
print(f"Best Parameters: {best_params}")

# --- 5. Final Model Setup & Evaluation ---
print("\nLoading best model for final test set evaluation...")
# Instantiate the best model
if best_params is None:
    print("Error: No best model was found from tuning. Exiting.")
    exit()

final_model = LoanDefaultClassifier(
    INPUT_DIM,
    best_params['hidden_1'],
    best_params['hidden_2'],
    best_params['hidden_3'],
    1,
    best_params['dropout']
).to(DEVICE)

# Load the saved best weights
if best_model_state is not None:
    final_model.load_state_dict(best_model_state)
else:
    print("Error: No best model was saved. Exiting.")
    exit()


# --- 6. Save the Final Model ---
print(f"Saving best model weights to {MODEL_SAVE_PATH}...")
torch.save(final_model.state_dict(), MODEL_SAVE_PATH)
print("Model saved successfully.")

# --- 7. Final Evaluation on Test Set ---
print("\n--- Starting Final Model Evaluation on TEST SET ---")
final_model.eval()
all_preds_probs = []
all_labels = []

with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)
        outputs = final_model(inputs)
        probs = torch.sigmoid(outputs)
        all_preds_probs.extend(probs.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

all_preds_probs = np.array(all_preds_probs).flatten()
all_labels = np.array(all_labels).flatten()

# --- 8. Final Metrics Calculation ---
try:
    auc = roc_auc_score(all_labels, all_preds_probs)

    # Find the optimal threshold
    precisions, recalls, thresholds = precision_recall_curve(all_labels, all_preds_probs)
    f1_scores = (2 * precisions * recalls) / (precisions + recalls + 1e-10)
    best_f1_idx = np.argmax(f1_scores)
    best_threshold = thresholds[best_f1_idx]
    best_f1 = f1_scores[best_f1_idx]

    all_preds_binary_opt = (all_preds_probs >= best_threshold).astype(int)
    precision_opt = precision_score(all_labels, all_preds_binary_opt)
    recall_opt = recall_score(all_labels, all_preds_binary_opt)

    # --- 9. Final Report ---
    print("\n--- Test Set Performance (Task 2) ---")
    print(f"AUC (Area Under the ROC Curve): {auc:.4f}")
    print("-----------------------------------------")
    print(f"Optimal Threshold (for max F1): {best_threshold:.4f}")
    print("Metrics at optimal threshold:")
    print(f"  Best F1-Score:                  {best_f1:.4f}")
    print(f"  Precision (at best F1):         {precision_opt:.4f}")
    print(f"  Recall (at best F1):            {recall_opt:.4f}")

except ValueError as e:
    print(f"Error calculating metrics: {e}")
except Exception as e:
    print(f"An error occurred during final evaluation: {e}")


print("\n--- Task 2 Complete ---")



Using device: cpu
Mounting Google Drive...
Mounted at /content/drive
Google Drive mounted successfully.
Training epochs set to: 30
Starting hyperparameter search over 3 combinations...
Loading processed data...
Model Input Dimension set to: 33
Full training data shape: (140866, 33)
  > New tuning-train set shape: (112692, 33)
  > New validation set shape: (28174, 33)
Test data shape: (35217, 33)
Calculated pos_weight for 'Default' class: 4.02

--- Tuning Run 1/3 ---
Params: {'lr': 0.0005, 'batch_size': 2048, 'hidden_1': 256, 'hidden_2': 128, 'hidden_3': 0, 'dropout': 0.4}




  Epoch [30/30], Avg Training Loss: 0.9704
  Validation AUC: 0.7334
  *** New Best Model Found (AUC: 0.7334) ***

--- Tuning Run 2/3 ---
Params: {'lr': 0.001, 'batch_size': 1024, 'hidden_1': 512, 'hidden_2': 256, 'hidden_3': 128, 'dropout': 0.5}




  Epoch [30/30], Avg Training Loss: 0.9631
  Validation AUC: 0.7315

--- Tuning Run 3/3 ---
Params: {'lr': 0.0001, 'batch_size': 2048, 'hidden_1': 256, 'hidden_2': 128, 'hidden_3': 0, 'dropout': 0.4}




  Epoch [30/30], Avg Training Loss: 0.9819
  Validation AUC: 0.7322

--- Hyperparameter Tuning Complete ---
Best Validation AUC: 0.7334
Best Parameters: {'lr': 0.0005, 'batch_size': 2048, 'hidden_1': 256, 'hidden_2': 128, 'hidden_3': 0, 'dropout': 0.4}

Loading best model for final test set evaluation...
Saving best model weights to /content/drive/MyDrive/shodhAI/dl_model_weights.pth...
Model saved successfully.

--- Starting Final Model Evaluation on TEST SET ---

--- Test Set Performance (Task 2) ---
AUC (Area Under the ROC Curve): 0.7311
-----------------------------------------
Optimal Threshold (for max F1): 0.5160
Metrics at optimal threshold:
  Best F1-Score:                  0.4470
  Precision (at best F1):         0.3410
  Recall (at best F1):            0.6485

--- Task 2 Complete ---
