In [1]:
"""
Task 3: Model 2 - The Offline Reinforcement Learning Agent

Loads the preprocessed data from Task 1.
Frames the problem as a one-step MDP (Contextual Bandit).
Engineers rewards based on financial outcomes (P&L).
Trains an offline RL agent (Discrete CQL) using d3rlpy.

Saves the following files:
- models/cql_agent.d3
- models/reward_scaler.pkl
"""

'\nTask 3: Model 2 - The Offline Reinforcement Learning Agent\n\nLoads the preprocessed data from Task 1.\nFrames the problem as a one-step MDP (Contextual Bandit).\nEngineers rewards based on financial outcomes (P&L).\nTrains an offline RL agent (Discrete CQL) using d3rlpy.\n\nSaves the following files:\n- models/cql_agent.d3\n- models/reward_scaler.pkl\n'

In [3]:
!pip install d3rlpy

Collecting d3rlpy
  Downloading d3rlpy-2.8.1-py3-none-any.whl.metadata (11 kB)
Collecting gym>=0.26.0 (from d3rlpy)
  Downloading gym-0.26.2.tar.gz (721 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m721.7/721.7 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting structlog (from d3rlpy)
  Downloading structlog-25.5.0-py3-none-any.whl.metadata (9.5 kB)
Collecting colorama (from d3rlpy)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Collecting dataclasses-json (from d3rlpy)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting gymnasium==1.0.0 (from d3rlpy)
  Downloading gymnasium-1.0.0-py3-none-any.whl.metadata (9.5 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json->d3rlpy)
  Downloading marshmallow-3.26.1-py3-none-any.whl.met

In [5]:
import numpy as np
import pandas as pd
import d3rlpy
from d3rlpy.dataset import MDPDataset
from sklearn.preprocessing import StandardScaler
import joblib
import os

# --- Configuration ---
# Relative locations of the preprocessed inputs and of the saved artifacts.
DATA_DIR, MODEL_DIR = 'data', 'models'

# Seed NumPy's global random number generator.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

In [7]:
def main(n_steps: int = 50000, n_steps_per_epoch: int = 5000) -> None:
    """Build the offline RL dataset, train a Discrete CQL agent, and save it.

    Loads the preprocessed training features, the outcome frame and the
    training row labels from ``DATA_DIR``; derives one (state, action,
    reward, terminal) transition per training row; trains a d3rlpy Discrete
    CQL agent on them; and writes the agent parameters and the fitted reward
    scaler to ``MODEL_DIR``.

    If any of the input files is missing, a hint is printed and the function
    returns without training.

    Args:
        n_steps: Total number of gradient steps passed to ``fit``.
        n_steps_per_epoch: Number of gradient steps per logged epoch.

    Raises:
        ValueError: If the number of rewards differs from the number of
            observations, or if any computed reward is NaN/infinite.
    """
    print("--- Task 3: Building and Training Offline RL Agent (CQL) ---")

    # --- Load Preprocessed Data ---
    print(f"Loading data from '{DATA_DIR}'...")
    try:
        X_train_final = joblib.load(os.path.join(DATA_DIR, 'X_train_final.pkl'))
        df_model = joblib.load(os.path.join(DATA_DIR, 'df_model_for_rewards.pkl'))
        train_indices = joblib.load(os.path.join(DATA_DIR, 'train_indices.pkl'))
    except FileNotFoundError:
        print("Error: Processed data files not found.")
        print("Please run `task_1_preprocessing.py` first.")
        return

    # Create model directory if it doesn't exist. Doing this before training
    # means an unusable output path fails fast rather than after a long fit.
    model_dir_existed = os.path.isdir(MODEL_DIR)
    os.makedirs(MODEL_DIR, exist_ok=True)
    if not model_dir_existed:
        print(f"Created directory: {MODEL_DIR}")

    # Seed d3rlpy as well (per its docs this covers random, NumPy and PyTorch)
    # so that network initialisation and batch sampling are repeatable, even
    # when main() is re-run inside the same kernel.
    d3rlpy.seed(RANDOM_SEED)

    # ---  Engineer RL Dataset Components ---
    print("Step 1: Engineering RL dataset (States, Actions, Rewards, Terminals)...")

    # States (s): Preprocessed features from Task 1
    observations_train = X_train_final.to_numpy()

    # Actions (a): In this dataset of *accepted* loans, the observed action was always Approve (1).
    # NOTE(review): because no Deny (0) transition is present, the value of
    # denying is never fit to an observed reward. The near-zero conservative
    # loss in earlier runs is consistent with the agent simply preferring the
    # logged action everywhere. Consider adding counterfactual Deny
    # transitions (P&L of 0) if a non-trivial policy is expected.
    actions_train = np.ones(len(observations_train), dtype=np.int32)

    # Rewards (R): Calculate Profit/Loss based on actual outcomes.
    print("Calculating raw rewards...")
    # One label-based lookup for all outcome columns.
    # Assumes the row order of X_train_final matches train_indices -- TODO confirm
    # against the Task 1 preprocessing code.
    outcomes = df_model.loc[
        train_indices, ['is_default', 'total_rec_int', 'total_pymnt', 'loan_amnt']
    ]
    rewards_train_raw = np.asarray(
        np.where(
            outcomes['is_default'] == 0,                    # Fully Paid
            outcomes['total_rec_int'],                      # Profit = interest received
            outcomes['total_pymnt'] - outcomes['loan_amnt'] # Loss
        ),
        dtype=np.float64,
    )

    if len(rewards_train_raw) != len(observations_train):
        raise ValueError(
            f"Reward/observation length mismatch: {len(rewards_train_raw)} rewards "
            f"vs {len(observations_train)} observations."
        )
    # StandardScaler ignores NaNs when fitting and passes them through when
    # transforming, so missing payment fields would otherwise reach training.
    if not np.isfinite(rewards_train_raw).all():
        n_bad = int((~np.isfinite(rewards_train_raw)).sum())
        raise ValueError(f"{n_bad} computed rewards are NaN or infinite.")

    print(f"Sample raw rewards (Train): {rewards_train_raw[:5]}")

    # Reward Scaling (Important for Q-learning stability)
    # NOTE(review): standardisation also subtracts the mean, so a raw P&L of 0
    # does not map to a scaled reward of 0 -- keep this in mind when comparing
    # against a zero-reward Deny baseline.
    reward_scaler = StandardScaler()
    rewards_train_scaled = reward_scaler.fit_transform(rewards_train_raw.reshape(-1, 1)).flatten()
    print(f"Mean scaled reward (Train): {np.mean(rewards_train_scaled):.2f}")

    # Terminals (d): This is a one-step decision problem (contextual bandit),
    # so every transition ends its episode.
    terminals_train = np.ones(len(observations_train), dtype=np.float32)

    print("RL dataset components assembled.")

    # --- Create d3rlpy Dataset ---
    print("Step 2: Building d3rlpy MDPDataset...")
    # Note: This is a one-step MDP.
    dataset = MDPDataset(
        observations=observations_train,
        actions=actions_train,
        rewards=rewards_train_scaled,
        terminals=terminals_train
    )
    print("MDPDataset created.")

    # --- Configure and Train CQL Agent ---
    print("Step 3: Configuring and training the Discrete CQL agent...")

    # Configure CQL for a discrete action space {0, 1}
    cql_config = d3rlpy.algos.DiscreteCQLConfig(
        batch_size=128,
        learning_rate=6.25e-5,
        alpha=1.0,
    )

    cql_agent = cql_config.create(device=False) # Use CPU

    print("Training CQL agent...")
    cql_agent.fit(
        dataset,
        n_steps=n_steps, # Can increase for better convergence
        n_steps_per_epoch=n_steps_per_epoch
    )
    print("--- CQL Training Complete ---")

    # --- Save Agent and Reward Scaler ---
    agent_path = os.path.join(MODEL_DIR, 'cql_agent.d3')
    scaler_path = os.path.join(MODEL_DIR, 'reward_scaler.pkl')

    # NOTE(review): save_model() stores network parameters only. If the
    # consumer of this file expects a self-contained agent file, switch to
    # cql_agent.save(agent_path) -- confirm against the loading code first.
    cql_agent.save_model(agent_path)
    joblib.dump(reward_scaler, scaler_path)

    print(f"CQL agent saved to {agent_path}")
    print(f"Reward scaler saved to {scaler_path}")
    print("--- Task 3: CQL Training Complete ---")


if __name__ == "__main__":
    main()


--- Task 3: Building and Training Offline RL Agent (CQL) ---
Loading data from 'data'...
Step 1: Engineering RL dataset (States, Actions, Rewards, Terminals)...
Calculating raw rewards...
Sample raw rewards (Train): [-3788.31  2392.37   373.43 -5368.02  4087.2 ]
Mean scaled reward (Train): -0.00
RL dataset components assembled.
Step 2: Building d3rlpy MDPDataset...
[2m2025-10-30 07:51.59[0m [[32m[1minfo     [0m] [1mSignatures have been automatically determined.[0m [36maction_signature[0m=[35mSignature(dtype=[dtype('int32')], shape=[(1,)])[0m [36mobservation_signature[0m=[35mSignature(dtype=[dtype('float64')], shape=[(132,)])[0m [36mreward_signature[0m=[35mSignature(dtype=[dtype('float64')], shape=[(1,)])[0m
[2m2025-10-30 07:51.59[0m [[32m[1minfo     [0m] [1mAction-space has been automatically determined.[0m [36maction_space[0m=[35m<ActionSpace.DISCRETE: 2>[0m
[2m2025-10-30 07:52.01[0m [[32m[1minfo     [0m] [1mAction size has been automatically deter

Epoch 1/10:   0%|          | 0/5000 [00:00<?, ?it/s]

[2m2025-10-30 07:53.36[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20251030075210: epoch=1 step=5000[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.004741993665695191, 'time_algorithm_update': 0.012149934673309326, 'loss': 0.2892695835068822, 'td_loss': 0.2765618439823389, 'conservative_loss': 0.012707739434438554, 'time_step': 0.017119435310363768}[0m [36mstep[0m=[35m5000[0m
[2m2025-10-30 07:53.36[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20251030075210/model_5000.d3[0m


Epoch 2/10:   0%|          | 0/5000 [00:00<?, ?it/s]

[2m2025-10-30 07:55.03[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20251030075210: epoch=2 step=10000[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0048064826965332035, 'time_algorithm_update': 0.01205392894744873, 'loss': 0.2741426198735833, 'td_loss': 0.27412186073064804, 'conservative_loss': 2.0759493547029708e-05, 'time_step': 0.01708390989303589}[0m [36mstep[0m=[35m10000[0m
[2m2025-10-30 07:55.03[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20251030075210/model_10000.d3[0m


Epoch 3/10:   0%|          | 0/5000 [00:00<?, ?it/s]

[2m2025-10-30 07:56.31[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20251030075210: epoch=3 step=15000[0m [36mepoch[0m=[35m3[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0049158769130706785, 'time_algorithm_update': 0.01225208330154419, 'loss': 0.27354971437007186, 'td_loss': 0.27354708587527277, 'conservative_loss': 2.6286488332516456e-06, 'time_step': 0.01739455499649048}[0m [36mstep[0m=[35m15000[0m
[2m2025-10-30 07:56.31[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20251030075210/model_15000.d3[0m


Epoch 4/10:   0%|          | 0/5000 [00:00<?, ?it/s]

[2m2025-10-30 07:58.01[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20251030075210: epoch=4 step=20000[0m [36mepoch[0m=[35m4[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0051023021221160885, 'time_algorithm_update': 0.012558697700500488, 'loss': 0.2731385242253542, 'td_loss': 0.2731382327541709, 'conservative_loss': 2.914220065349582e-07, 'time_step': 0.017900532150268555}[0m [36mstep[0m=[35m20000[0m
[2m2025-10-30 07:58.01[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20251030075210/model_20000.d3[0m


Epoch 5/10:   0%|          | 0/5000 [00:00<?, ?it/s]

[2m2025-10-30 07:59.27[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20251030075210: epoch=5 step=25000[0m [36mepoch[0m=[35m5[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.004829755353927613, 'time_algorithm_update': 0.011901504421234131, 'loss': 0.27215460280925036, 'td_loss': 0.27215458275675775, 'conservative_loss': 2.1228939294815063e-08, 'time_step': 0.01695895276069641}[0m [36mstep[0m=[35m25000[0m
[2m2025-10-30 07:59.27[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20251030075210/model_25000.d3[0m


Epoch 6/10:   0%|          | 0/5000 [00:00<?, ?it/s]

[2m2025-10-30 08:00.55[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20251030075210: epoch=6 step=30000[0m [36mepoch[0m=[35m6[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00491461763381958, 'time_algorithm_update': 0.012299894523620606, 'loss': 0.2737641074821353, 'td_loss': 0.2737641074821353, 'conservative_loss': 3.073364496231079e-10, 'time_step': 0.01745253119468689}[0m [36mstep[0m=[35m30000[0m
[2m2025-10-30 08:00.55[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20251030075210/model_30000.d3[0m


Epoch 7/10:   0%|          | 0/5000 [00:00<?, ?it/s]

[2m2025-10-30 08:02.21[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20251030075210: epoch=7 step=35000[0m [36mepoch[0m=[35m7[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.004801615905761719, 'time_algorithm_update': 0.011966880130767823, 'loss': 0.27100280695855616, 'td_loss': 0.27100280695855616, 'conservative_loss': 8.009374141693115e-12, 'time_step': 0.01700148000717163}[0m [36mstep[0m=[35m35000[0m
[2m2025-10-30 08:02.21[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20251030075210/model_35000.d3[0m


Epoch 8/10:   0%|          | 0/5000 [00:00<?, ?it/s]

[2m2025-10-30 08:03.47[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20251030075210: epoch=8 step=40000[0m [36mepoch[0m=[35m8[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00482250714302063, 'time_algorithm_update': 0.012024013662338257, 'loss': 0.26945861524790526, 'td_loss': 0.26945861524790526, 'conservative_loss': 3.1664967536926268e-12, 'time_step': 0.01708188319206238}[0m [36mstep[0m=[35m40000[0m
[2m2025-10-30 08:03.47[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20251030075210/model_40000.d3[0m


Epoch 9/10:   0%|          | 0/5000 [00:00<?, ?it/s]

[2m2025-10-30 08:05.15[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20251030075210: epoch=9 step=45000[0m [36mepoch[0m=[35m9[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0048926291465759275, 'time_algorithm_update': 0.012241019344329834, 'loss': 0.2709443375915289, 'td_loss': 0.2709443375915289, 'conservative_loss': 2.9802322387695314e-12, 'time_step': 0.017358287143707275}[0m [36mstep[0m=[35m45000[0m
[2m2025-10-30 08:05.15[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20251030075210/model_45000.d3[0m


Epoch 10/10:   0%|          | 0/5000 [00:00<?, ?it/s]

[2m2025-10-30 08:06.41[0m [[32m[1minfo     [0m] [1mDiscreteCQL_20251030075210: epoch=10 step=50000[0m [36mepoch[0m=[35m10[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.004824909257888794, 'time_algorithm_update': 0.012061398315429687, 'loss': 0.26922877210974694, 'td_loss': 0.26922877210974694, 'conservative_loss': 2.7939677238464356e-12, 'time_step': 0.017117854404449462}[0m [36mstep[0m=[35m50000[0m
[2m2025-10-30 08:06.41[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DiscreteCQL_20251030075210/model_50000.d3[0m
--- CQL Training Complete ---
CQL agent saved to models/cql_agent.d3
Reward scaler saved to models/reward_scaler.pkl
--- Task 3: CQL Training Complete ---
