In [2]:
!pip install d3rlpy

Collecting d3rlpy
  Downloading d3rlpy-2.8.1-py3-none-any.whl.metadata (11 kB)
Collecting gym>=0.26.0 (from d3rlpy)
  Downloading gym-0.26.2.tar.gz (721 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/721.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m721.7/721.7 kB[0m [31m44.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting structlog (from d3rlpy)
  Downloading structlog-25.4.0-py3-none-any.whl.metadata (7.6 kB)
Collecting colorama (from d3rlpy)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Collecting dataclasses-json (from d3rlpy)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting gymnasium==1.0.0 (from d3rlpy)
  Downloading gymnasium-1.0.0-py3-none-any.whl.metadata (9.5 kB)
Collecti

In [None]:
import numpy as np
# D3RLPY imports are no longer needed
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split # Correct split function
from sklearn.preprocessing import MinMaxScaler # <-- IMPORT THE SCALER
from google.colab import drive
import os
import warnings
from tqdm import tqdm

# --- 0. Setup ---
warnings.filterwarnings('ignore')
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
print(f"Using device: {DEVICE}")

# --- 1a. Mount Google Drive ---
print("Mounting Google Drive...")
try:
    drive.mount('/content/drive', force_remount=True)
    print("Google Drive mounted successfully.")
except Exception as e:
    print(f"Error mounting Google Drive: {e}")
    exit()

# --- 1b. Constants & Paths ---
DRIVE_MOUNT_POINT = '/content/drive/MyDrive/'
PROJECT_DIR = os.path.join(DRIVE_MOUNT_POINT, 'shodhAI')
TRAIN_DATA_PATH = os.path.join(PROJECT_DIR, 'processed_data_train.npz')
TEST_DATA_PATH = os.path.join(PROJECT_DIR, 'processed_data_test.npz')
# We'll save this as a PyTorch model now
RL_MODEL_SAVE_PATH = os.path.join(PROJECT_DIR, 'rl_q_model.pth')

# Model Hyperparameters
INPUT_DIM = -1  # Will be set after loading data
HIDDEN_DIM_1 = 256
HIDDEN_DIM_2 = 128
OUTPUT_DIM = 2  # Q-value for (Deny, Approve)
LEARNING_RATE = 0.0005
BATCH_SIZE = 2048
EPOCHS = 30

# --- 2. Load Processed Data ---
print("Loading processed data for RL...")
try:
    with np.load(TRAIN_DATA_PATH) as data:
        X_train_full = data['X'].astype(np.float32)
        r_train_full = data['r'].astype(np.float32)

    with np.load(TEST_DATA_PATH) as data:
        X_test = data['X'].astype(np.float32)
        r_test = data['r'].astype(np.float32)

    print(f"Full training observations shape: {X_train_full.shape}")
    print(f"Full training rewards shape: {r_train_full.shape}")
    print(f"Test observations shape: {X_test.shape}")
    print(f"Test rewards shape: {r_test.shape}")

    INPUT_DIM = X_train_full.shape[1]

except FileNotFoundError:
    print(f"Error: Data files not found in {PROJECT_DIR}")
    exit()
except Exception as e:
    print(f"An error occurred loading data: {e}")
    exit()

# --- 2b. Scale Rewards ---
print("Scaling rewards...")
r_train_full_reshaped = r_train_full.reshape(-1, 1)
r_test_reshaped = r_test.reshape(-1, 1)

reward_scaler = MinMaxScaler()
r_train_scaled = reward_scaler.fit_transform(r_train_full_reshaped)
r_test_scaled = reward_scaler.transform(r_test_reshaped)


scaled_zero_reward = reward_scaler.transform([[0.0]])[0, 0]

r_train_full = r_train_scaled.flatten()
r_test = r_test_scaled.flatten()

print("Rewards scaled to [0, 1] range.")
print(f"  Scaled value for 'Deny' (0 profit): {scaled_zero_reward:.4f}")

# --- 3. Create PyTorch DataLoaders ---
print("Creating PyTorch DataLoaders...")
# We split the data for training our Q-function
X_train, X_val, r_train, r_val = train_test_split(
    X_train_full, r_train_full, test_size=0.1, random_state=42
)

# Create Tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
r_train_tensor = torch.tensor(r_train, dtype=torch.float32).view(-1, 1)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
r_val_tensor = torch.tensor(r_val, dtype=torch.float32).view(-1, 1)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)


# Create Datasets and DataLoaders
train_dataset = TensorDataset(X_train_tensor, r_train_tensor)
val_dataset = TensorDataset(X_val_tensor, r_val_tensor)
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=BATCH_SIZE, shuffle=False)

print("DataLoaders created successfully.")


# --- 4. Define the Q-Policy Model ---
class QPolicyNet(nn.Module):
    def __init__(self, input_dim, hidden_dim_1, hidden_dim_2, output_dim):
        super(QPolicyNet, self).__init__()
        self.network = nn.Sequential(
            nn.Linear(input_dim, hidden_dim_1),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(hidden_dim_1, hidden_dim_2),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(hidden_dim_2, output_dim)
            
        )
    def forward(self, x):
        return self.network(x)

model = QPolicyNet(INPUT_DIM, HIDDEN_DIM_1, HIDDEN_DIM_2, OUTPUT_DIM).to(DEVICE)
print("Initialized Q-PolicyNet model.")

# --- 5. Define Loss Function and Optimizer ---

criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)


deny_target = torch.tensor([scaled_zero_reward], dtype=torch.float32).to(DEVICE)

# --- 6. Train the RL Agent (Q-Function) ---
print("\n--- Starting Q-Policy Model Training ---")
best_val_loss = float('inf')
best_model_state = None

for epoch in range(EPOCHS):
    model.train()
    total_train_loss = 0

    for inputs, r_approve_targets in tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}", leave=False):
        inputs = inputs.to(DEVICE)
        r_approve_targets = r_approve_targets.to(DEVICE)

        # Zero gradients
        optimizer.zero_grad()

        # Forward pass: get Q-values for both actions
        q_values = model(inputs)
        q_deny = q_values[:, 0]
        q_approve = q_values[:, 1]

        # Calculate loss
        
        loss_approve = criterion(q_approve, r_approve_targets.squeeze())
        loss_deny = criterion(q_deny, deny_target.expand_as(q_deny))

        # Total loss is the sum of both
        loss = loss_approve + loss_deny

        # Backward pass
        loss.backward()
        optimizer.step()
        total_train_loss += loss.item()

    avg_train_loss = total_train_loss / len(train_loader)

    # Validation loop
    model.eval()
    total_val_loss = 0
    with torch.no_grad():
        for inputs, r_approve_targets in val_loader:
            inputs = inputs.to(DEVICE)
            r_approve_targets = r_approve_targets.to(DEVICE)

            q_values = model(inputs)
            q_deny = q_values[:, 0]
            q_approve = q_values[:, 1]

            loss_approve = criterion(q_approve, r_approve_targets.squeeze())
            loss_deny = criterion(q_deny, deny_target.expand_as(q_deny))
            loss = loss_approve + loss_deny
            total_val_loss += loss.item()

    avg_val_loss = total_val_loss / len(val_loader)
    print(f"Epoch {epoch+1}/{EPOCHS}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

    # Save the best model
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        best_model_state = model.state_dict()
        print(f"  > New best model found!")

print("--- RL Training Complete ---")

# --- 7. Save the Trained Model ---
print(f"Saving trained RL model to {RL_MODEL_SAVE_PATH}...")
torch.save(best_model_state, RL_MODEL_SAVE_PATH)
print("RL Model saved successfully.")

# --- 8. Evaluate the Policy (Direct Simulation Method) ---
print("\n--- Starting Policy Evaluation (Direct Simulation) ---")

# Load the best model weights
model.load_state_dict(best_model_state)
model.eval()

with torch.no_grad():
    # Get Q-values for the test set
    q_values_tensor = model(X_test_tensor.to(DEVICE))
    q_values = q_values_tensor.cpu().numpy()

# q_values shape is (n_samples, 2). Get the chosen action.
policy_actions = np.argmax(q_values, axis=1) # 0 = Deny, 1 = Approve

# Analyze the policy's decisions
n_approve = np.sum(policy_actions == 1)
n_deny = np.sum(policy_actions == 0)
approve_pct = n_approve / len(policy_actions) * 100

print(f"Policy decisions on test set ({len(policy_actions)} loans):")
print(f"  Approve: {n_approve} loans ({approve_pct:.2f}%)")
print(f"  Deny:    {n_deny} loans ({(100-approve_pct):.2f}%)")

# Calculate the simulated profit

simulated_profits = r_test * policy_actions

# Calculate total and average profit
total_simulated_profit = np.sum(simulated_profits)
avg_simulated_profit = np.mean(simulated_profits)

# --- 9. Report the Results ---
print("\n--- Test Set Performance (Task 3) ---")
print("\n-----------------------------------------")
print(f"Total Simulated SCALED Profit:   {total_simulated_profit:,.2f}")
print(f"Average Simulated SCALED Profit: {avg_simulated_profit:.4f}")
print("-----------------------------------------")
print("This value represents the simulated average SCALED REWARD [0, 1] per loan")
print("if this RL policy were used to make decisions on the test set.")

print("\n--- Task 3 Complete ---")



Using device: cuda:0
Mounting Google Drive...
Mounted at /content/drive
Google Drive mounted successfully.
Loading processed data for RL...
Full training observations shape: (140866, 33)
Full training rewards shape: (140866,)
Test observations shape: (35217, 33)
Test rewards shape: (35217,)
Scaling rewards...
Rewards scaled to [0, 1] range.
  Scaled value for 'Deny' (0 profit): 0.7753
Creating PyTorch DataLoaders...
DataLoaders created successfully.
Initialized Q-PolicyNet model.

--- Starting Q-Policy Model Training ---




Epoch 1/30, Train Loss: 0.3227, Val Loss: 0.1339
  > New best model found!




Epoch 2/30, Train Loss: 0.1728, Val Loss: 0.1214
  > New best model found!




Epoch 3/30, Train Loss: 0.1545, Val Loss: 0.1168
  > New best model found!




Epoch 4/30, Train Loss: 0.1469, Val Loss: 0.1162
  > New best model found!




Epoch 5/30, Train Loss: 0.1426, Val Loss: 0.1148
  > New best model found!




Epoch 6/30, Train Loss: 0.1384, Val Loss: 0.1151




Epoch 7/30, Train Loss: 0.1365, Val Loss: 0.1149




Epoch 8/30, Train Loss: 0.1342, Val Loss: 0.1144
  > New best model found!




Epoch 9/30, Train Loss: 0.1322, Val Loss: 0.1146




Epoch 10/30, Train Loss: 0.1306, Val Loss: 0.1147




Epoch 11/30, Train Loss: 0.1301, Val Loss: 0.1144




Epoch 12/30, Train Loss: 0.1290, Val Loss: 0.1145




Epoch 13/30, Train Loss: 0.1280, Val Loss: 0.1149




Epoch 14/30, Train Loss: 0.1269, Val Loss: 0.1143
  > New best model found!




Epoch 15/30, Train Loss: 0.1261, Val Loss: 0.1139
  > New best model found!




Epoch 16/30, Train Loss: 0.1251, Val Loss: 0.1135
  > New best model found!




Epoch 17/30, Train Loss: 0.1251, Val Loss: 0.1135
  > New best model found!




Epoch 18/30, Train Loss: 0.1245, Val Loss: 0.1135




Epoch 19/30, Train Loss: 0.1239, Val Loss: 0.1131
  > New best model found!




Epoch 20/30, Train Loss: 0.1231, Val Loss: 0.1136




Epoch 21/30, Train Loss: 0.1224, Val Loss: 0.1131
  > New best model found!




Epoch 22/30, Train Loss: 0.1222, Val Loss: 0.1135




Epoch 23/30, Train Loss: 0.1216, Val Loss: 0.1133




Epoch 24/30, Train Loss: 0.1210, Val Loss: 0.1130
  > New best model found!




Epoch 25/30, Train Loss: 0.1203, Val Loss: 0.1134




Epoch 26/30, Train Loss: 0.1201, Val Loss: 0.1128
  > New best model found!




Epoch 27/30, Train Loss: 0.1196, Val Loss: 0.1127
  > New best model found!




Epoch 28/30, Train Loss: 0.1193, Val Loss: 0.1129




Epoch 29/30, Train Loss: 0.1188, Val Loss: 0.1129




Epoch 30/30, Train Loss: 0.1188, Val Loss: 0.1128
--- RL Training Complete ---
Saving trained RL model to /content/drive/MyDrive/shodhAI/rl_q_model.pth...
RL Model saved successfully.

--- Starting Policy Evaluation (Direct Simulation) ---
Policy decisions on test set (35217 loans):
  Approve: 2386 loans (6.78%)
  Deny:    32831 loans (93.22%)

--- Test Set Performance (Task 3) ---

-----------------------------------------
Total Simulated SCALED Profit:   1,913.67
Average Simulated SCALED Profit: 0.0543
-----------------------------------------
This value represents the simulated average SCALED REWARD [0, 1] per loan
if this RL policy were used to make decisions on the test set.

--- Task 3 Complete ---
