In [1]:
# === Task 3, Cell 1: Imports (Final - Using FQEConfig) ===
import pandas as pd
import numpy as np
import joblib
import torch

from sklearn.model_selection import train_test_split

# The Offline Reinforcement Learning library
import d3rlpy

# d3rlpy components for the algorithm
from d3rlpy.dataset import MDPDataset
from d3rlpy.algos import DiscreteCQLConfig

# --- CORRECTED ---
# Components for DISCRETE Off-Policy Evaluation (OPE)
# We use the Discrete evaluator but the generic FQEConfig
from d3rlpy.ope import FQEConfig # The generic config class
from d3rlpy.ope import DiscreteFQE
from d3rlpy.metrics import InitialStateValueEstimationEvaluator
from d3rlpy.preprocessing import MinMaxRewardScaler

use_gpu = torch.cuda.is_available()
print(f"Using GPU: {use_gpu}  (expected: False for Intel Iris)")
print(f"d3rlpy version: {d3rlpy.__version__}")
print("All libraries imported successfully.")

Gym has been unmaintained since 2022 and does not support NumPy 2.0 amongst other critical functionality.
Please upgrade to Gymnasium, the maintained drop-in replacement of Gym, or contact the authors of your software and request that they upgrade.
Users of this version of Gym should be able to simply replace 'import gym' with 'import gymnasium as gym' in the vast majority of cases.
See the migration guide at https://gymnasium.farama.org/introduction/migration_guide/ for additional information.
  from .autonotebook import tqdm as notebook_tqdm


Using GPU: False  (expected: False for Intel Iris)
d3rlpy version: 2.8.1
All libraries imported successfully.


In [2]:
# === Task 3, Cell 2: Load Data (Corrected Filename & Rewards) ===

# 1. Load the *original* (unscaled) data again
data_path = '../data/raw/accepted_2007_to_2018Q4.csv' # Corrected filename

try:
    # Add low_memory=False to suppress the DtypeWarning
    df = pd.read_csv(data_path, nrows=500000, low_memory=False)
except FileNotFoundError:
    print(f"ERROR: File not found at {data_path}")
    print("Please make sure your file is named correctly and in the 'data/raw' folder.")
    raise

print(f"Original raw data loaded: {df.shape}")

# 2. Re-create the binary 'target' variable (same as Task 1)
bad_statuses = [
    'Charged Off', 'Default', 'Does not meet the credit policy. Status:Charged Off'
]
good_statuses = [
    'Fully Paid', 'Does not meet the credit policy. Status:Fully Paid'
]

df['target'] = np.nan
df.loc[df['loan_status'].isin(good_statuses), 'target'] = 0
df.loc[df['loan_status'].isin(bad_statuses), 'target'] = 1
df_clean = df.dropna(subset=['target'])
df_clean = df_clean.astype({'target': int})

print(f"Data filtered to good/bad loans: {df_clean.shape}")

# 3. Define the RL components
FEATURES_TO_KEEP = [
    'loan_amnt', 'term', 'int_rate', 'annual_inc', 'dti', 'emp_length',
    'pub_rec', 'revol_util', 'total_acc', 'open_acc', 'mort_acc', 'fico_range_low'
]
TARGET_COL = 'target'

df_rl = df_clean[FEATURES_TO_KEEP + [TARGET_COL]].copy()


# --- 4. Engineer State, Action, Reward ---
# Clean categorical and fill missing
df_rl['term'] = df_rl['term'].str.replace(' months', '', regex=False).astype(int)
df_rl['emp_length'] = df_rl['emp_length'].str.replace('< 1 year', '0 years', regex=False)
df_rl['emp_length'] = df_rl['emp_length'].str.replace('10+ years', '10 years', regex=False)
df_rl['emp_length'] = df_rl['emp_length'].str.extract(r'(\d+)').astype(float)
df_rl = df_rl.fillna(df_rl.median())

# Load the scaler
scaler_path = '../models/scaler.joblib'
scaler = joblib.load(scaler_path)

# Create scaled observations (state)
observations = scaler.transform(df_rl[FEATURES_TO_KEEP])
print(f"States (observations) created and scaled: {observations.shape}")

# Create actions (always 1 for historical data)
actions = np.ones(len(df_rl), dtype=int)
print(f"Actions created (all 1s): {actions.shape}")

# Create rewards (raw dollar value)
profit = df_rl['loan_amnt'] * (df_rl['int_rate'] / 100.0)
loss = -df_rl['loan_amnt']
rewards = np.where(
    df_rl['target'] == 0, # Condition: Was the loan 'Fully Paid'?
    profit,               # If true: reward = profit
    loss                  # If false: reward = loss
)
print(f"Rewards engineered (raw $ value). Average reward: ${rewards.mean():.2f}")

# Create terminals (always 1 for this single-step problem)
terminals = np.ones(len(df_rl), dtype=int)
print("All RL arrays created.")

Original raw data loaded: (500000, 151)
Data filtered to good/bad loans: (391168, 152)


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


States (observations) created and scaled: (391168, 12)
Actions created (all 1s): (391168,)
Rewards engineered (raw $ value). Average reward: $-1806.30
All RL arrays created.


In [3]:
# === Task 3, Cell 3: Split Arrays and Create Datasets ===

# 1. Split all four arrays at the same time
#    We use the scikit-learn train_test_split we imported in Cell 1
obs_train, obs_test, act_train, act_test, rew_train, rew_test, term_train, term_test = train_test_split(
    observations, 
    actions, 
    rewards,  # Use the raw (unscaled) rewards
    terminals, 
    test_size=0.2, 
    random_state=42
)

print("--- RL Arrays Split ---")

# 2. Now, create two separate MDPDataset objects
train_dataset = MDPDataset(
    observations=obs_train,
    actions=act_train,
    rewards=rew_train,
    terminals=term_train
)

test_dataset = MDPDataset(
    observations=obs_test,
    actions=act_test,
    rewards=rew_test,
    terminals=term_test
)

print("--- MDPDataset Objects Created ---")
# --- FIXED ---
# We must use the .size() method, not len()
print(f"Training dataset size: {train_dataset.size()}")
print(f"Test dataset size: {test_dataset.size()}")

--- RL Arrays Split ---
2025-10-30 14:37.42 [info     ] Signatures have been automatically determined. action_signature=Signature(dtype=[dtype('int32')], shape=[(1,)]) observation_signature=Signature(dtype=[dtype('float64')], shape=[(12,)]) reward_signature=Signature(dtype=[dtype('float64')], shape=[(1,)])
2025-10-30 14:37.42 [info     ] Action-space has been automatically determined. action_space=<ActionSpace.DISCRETE: 2>
2025-10-30 14:37.45 [info     ] Action size has been automatically determined. action_size=2
2025-10-30 14:37.46 [info     ] Signatures have been automatically determined. action_signature=Signature(dtype=[dtype('int32')], shape=[(1,)]) observation_signature=Signature(dtype=[dtype('float64')], shape=[(12,)]) reward_signature=Signature(dtype=[dtype('float64')], shape=[(1,)])
2025-10-30 14:37.46 [info     ] Action-space has been automatically determined. action_space=<ActionSpace.DISCRETE: 2>
2025-10-30 14:37.47 [info     ] Action size has been automatically determined

In [4]:
# === Task 3, Cell 4: Define the RL Algorithm (Discrete CQL) ===
from d3rlpy.preprocessing import MinMaxRewardScaler # Make sure scaler is imported

# 1. Set up the configuration for the Discrete CQL algorithm.
#    Hyperparameters like learning rate and reward scaling are set here.
cql_config = DiscreteCQLConfig(
    reward_scaler=MinMaxRewardScaler(), # Use the imported class instance
    learning_rate=3e-4                  # Optimizer learning rate
)

# 2. Initialize the Discrete CQL algorithm object using the configuration.
#    Pass use_gpu to determine if it should run on CPU or GPU.
cql = cql_config.create(device=use_gpu)

print("--- Discrete CQL Algorithm Initialized ---")
print(f"Algorithm: {cql.__class__.__name__}")
print(f"Using GPU: {use_gpu}")

--- Discrete CQL Algorithm Initialized ---
Algorithm: DiscreteCQL
Using GPU: False


In [5]:
# === Task 3, Cell 5: Train the RL Agent ===

print("--- Starting RL Model Training ---")
print("This will take several minutes. A progress bar will appear.")

# Determine the number of training steps (equivalent to 1 epoch).
n_steps_for_one_epoch = train_dataset.size()

print(f"Training for {n_steps_for_one_epoch} steps (equivalent to 1 epoch)...")

# Train the CQL model using the 'n_steps' argument.
cql.fit(
    train_dataset,
    n_steps=n_steps_for_one_epoch
)

print("\n--- RL Model Training Complete ---")

--- Starting RL Model Training ---
This will take several minutes. A progress bar will appear.
Training for 312934 steps (equivalent to 1 epoch)...
2025-10-30 14:38.11 [info     ] dataset info                   dataset_info=DatasetInfo(observation_signature=Signature(dtype=[dtype('float64')], shape=[(12,)]), action_signature=Signature(dtype=[dtype('int32')], shape=[(1,)]), reward_signature=Signature(dtype=[dtype('float64')], shape=[(1,)]), action_space=<ActionSpace.DISCRETE: 2>, action_size=2)
2025-10-30 14:38.11 [debug    ] Fitting reward scaler...       reward_scaler=min_max
2025-10-30 14:38.17 [debug    ] Building models...            
2025-10-30 14:38.21 [debug    ] Models have been built.       
2025-10-30 14:38.21 [info     ] Directory is created at d3rlpy_logs\DiscreteCQL_20251030143821
2025-10-30 14:38.21 [info     ] Parameters                     params={'observation_shape': [12], 'action_size': 2, 'config': {'type': 'discrete_cql', 'params': {'batch_size': 32, 'gamma': 0.99, 

Epoch 1/31: 100%|█| 10000/10000 [01:52<00:00, 89.28it/s, loss=0.0142, td_loss=0.0126, conse


2025-10-30 14:40.13 [info     ] DiscreteCQL_20251030143821: epoch=1 step=10000 epoch=1 metrics={'time_sample_batch': 0.0019885479211807252, 'time_algorithm_update': 0.008737970757484436, 'loss': 0.014151464512199164, 'td_loss': 0.01256833084761165, 'conservative_loss': 0.0015831336646922865, 'time_step': 0.011059957265853881} step=10000
2025-10-30 14:40.13 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251030143821\model_10000.d3


Epoch 2/31: 100%|█| 10000/10000 [02:05<00:00, 79.68it/s, loss=0.0113, td_loss=0.0113, conse


2025-10-30 14:42.19 [info     ] DiscreteCQL_20251030143821: epoch=2 step=20000 epoch=2 metrics={'time_sample_batch': 0.002066092586517334, 'time_algorithm_update': 0.00999345896244049, 'loss': 0.01126854825859191, 'td_loss': 0.011267116616491693, 'conservative_loss': 1.4316409826278687e-06, 'time_step': 0.012415913915634156} step=20000
2025-10-30 14:42.19 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251030143821\model_20000.d3


Epoch 3/31: 100%|█| 10000/10000 [02:14<00:00, 74.54it/s, loss=0.0111, td_loss=0.0111, conse


2025-10-30 14:44.33 [info     ] DiscreteCQL_20251030143821: epoch=3 step=30000 epoch=3 metrics={'time_sample_batch': 0.002253847599029541, 'time_algorithm_update': 0.010597265148162841, 'loss': 0.011108649484999478, 'td_loss': 0.011108607046864926, 'conservative_loss': 4.243813455104828e-08, 'time_step': 0.01323072190284729} step=30000
2025-10-30 14:44.33 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251030143821\model_30000.d3


Epoch 4/31: 100%|█| 10000/10000 [02:17<00:00, 72.58it/s, loss=0.011, td_loss=0.011, conserv


2025-10-30 14:46.51 [info     ] DiscreteCQL_20251030143821: epoch=4 step=40000 epoch=4 metrics={'time_sample_batch': 0.0021469547986984255, 'time_algorithm_update': 0.011096542024612428, 'loss': 0.010987036012427415, 'td_loss': 0.010987024319393095, 'conservative_loss': 1.1693034321069718e-08, 'time_step': 0.01359628303050995} step=40000
2025-10-30 14:46.51 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251030143821\model_40000.d3


Epoch 5/31: 100%|█| 10000/10000 [02:19<00:00, 71.58it/s, loss=0.011, td_loss=0.011, conserv


2025-10-30 14:49.11 [info     ] DiscreteCQL_20251030143821: epoch=5 step=50000 epoch=5 metrics={'time_sample_batch': 0.0019466262340545654, 'time_algorithm_update': 0.011480566072463989, 'loss': 0.01102556249165209, 'td_loss': 0.011024031544488389, 'conservative_loss': 1.5309471637010575e-06, 'time_step': 0.013775156474113464} step=50000
2025-10-30 14:49.11 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251030143821\model_50000.d3


Epoch 6/31: 100%|█| 10000/10000 [02:23<00:00, 69.89it/s, loss=0.0109, td_loss=0.0109, conse


2025-10-30 14:51.34 [info     ] DiscreteCQL_20251030143821: epoch=6 step=60000 epoch=6 metrics={'time_sample_batch': 0.0019865514278411866, 'time_algorithm_update': 0.011778147673606873, 'loss': 0.010884934910514857, 'td_loss': 0.010884094984491822, 'conservative_loss': 8.399260230362415e-07, 'time_step': 0.014125293946266174} step=60000
2025-10-30 14:51.34 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251030143821\model_60000.d3


Epoch 7/31: 100%|█| 10000/10000 [02:28<00:00, 67.53it/s, loss=0.011, td_loss=0.011, conserv


2025-10-30 14:54.02 [info     ] DiscreteCQL_20251030143821: epoch=7 step=70000 epoch=7 metrics={'time_sample_batch': 0.002113432240486145, 'time_algorithm_update': 0.012049941992759704, 'loss': 0.011025890055147465, 'td_loss': 0.011025865630002227, 'conservative_loss': 2.4425145238637923e-08, 'time_step': 0.014562491917610169} step=70000
2025-10-30 14:54.02 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251030143821\model_70000.d3


Epoch 8/31: 100%|█| 10000/10000 [02:39<00:00, 62.54it/s, loss=0.011, td_loss=0.011, conserv


2025-10-30 14:56.42 [info     ] DiscreteCQL_20251030143821: epoch=8 step=80000 epoch=8 metrics={'time_sample_batch': 0.002140361022949219, 'time_algorithm_update': 0.013236055302619934, 'loss': 0.011016748227120843, 'td_loss': 0.011016739375737961, 'conservative_loss': 8.851289749145507e-09, 'time_step': 0.01574917254447937} step=80000
2025-10-30 14:56.42 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251030143821\model_80000.d3


Epoch 9/31: 100%|█| 10000/10000 [02:49<00:00, 59.09it/s, loss=0.011, td_loss=0.011, conserv


2025-10-30 14:59.31 [info     ] DiscreteCQL_20251030143821: epoch=9 step=90000 epoch=9 metrics={'time_sample_batch': 0.0022080488204956053, 'time_algorithm_update': 0.014048666834831238, 'loss': 0.010970864509243984, 'td_loss': 0.010970068306301255, 'conservative_loss': 7.962029427289962e-07, 'time_step': 0.016640340971946718} step=90000
2025-10-30 14:59.31 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251030143821\model_90000.d3


Epoch 10/31: 100%|█| 10000/10000 [02:52<00:00, 57.95it/s, loss=0.011, td_loss=0.011, conser


2025-10-30 15:02.24 [info     ] DiscreteCQL_20251030143821: epoch=10 step=100000 epoch=10 metrics={'time_sample_batch': 0.0021918574333190917, 'time_algorithm_update': 0.014387935471534729, 'loss': 0.010965255857154261, 'td_loss': 0.010964680650259833, 'conservative_loss': 5.752068944275379e-07, 'time_step': 0.01696174168586731} step=100000
2025-10-30 15:02.24 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251030143821\model_100000.d3


Epoch 11/31: 100%|█| 10000/10000 [02:53<00:00, 57.76it/s, loss=0.011, td_loss=0.011, conser


2025-10-30 15:05.17 [info     ] DiscreteCQL_20251030143821: epoch=11 step=110000 epoch=11 metrics={'time_sample_batch': 0.0021838227272033693, 'time_algorithm_update': 0.014445441722869873, 'loss': 0.011010273692617192, 'td_loss': 0.011009726495901123, 'conservative_loss': 5.471967160701752e-07, 'time_step': 0.017004305386543274} step=110000
2025-10-30 15:05.17 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251030143821\model_110000.d3


Epoch 12/31: 100%|█| 10000/10000 [02:54<00:00, 57.41it/s, loss=0.011, td_loss=0.011, conser


2025-10-30 15:08.11 [info     ] DiscreteCQL_20251030143821: epoch=12 step=120000 epoch=12 metrics={'time_sample_batch': 0.0021758777856826783, 'time_algorithm_update': 0.014557080054283142, 'loss': 0.010989830236556008, 'td_loss': 0.010989671454532071, 'conservative_loss': 1.587820239365101e-07, 'time_step': 0.01710931227207184} step=120000
2025-10-30 15:08.11 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251030143821\model_120000.d3


Epoch 13/31: 100%|█| 10000/10000 [02:56<00:00, 56.76it/s, loss=0.0109, td_loss=0.0109, cons


2025-10-30 15:11.07 [info     ] DiscreteCQL_20251030143821: epoch=13 step=130000 epoch=13 metrics={'time_sample_batch': 0.0019523488759994507, 'time_algorithm_update': 0.015020812940597534, 'loss': 0.010870886588143185, 'td_loss': 0.010870453623915092, 'conservative_loss': 4.3296413496136667e-07, 'time_step': 0.017334042501449584} step=130000
2025-10-30 15:11.08 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251030143821\model_130000.d3


Epoch 14/31: 100%|█| 10000/10000 [02:57<00:00, 56.32it/s, loss=0.0109, td_loss=0.0109, cons


2025-10-30 15:14.05 [info     ] DiscreteCQL_20251030143821: epoch=14 step=140000 epoch=14 metrics={'time_sample_batch': 0.001889369225502014, 'time_algorithm_update': 0.015205856370925903, 'loss': 0.010883845539100002, 'td_loss': 0.010883799071318936, 'conservative_loss': 4.64677345007658e-08, 'time_step': 0.017459387016296388} step=140000
2025-10-30 15:14.05 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251030143821\model_140000.d3


Epoch 15/31: 100%|█| 10000/10000 [02:57<00:00, 56.29it/s, loss=0.0109, td_loss=0.0109, cons


2025-10-30 15:17.03 [info     ] DiscreteCQL_20251030143821: epoch=15 step=150000 epoch=15 metrics={'time_sample_batch': 0.001900377345085144, 'time_algorithm_update': 0.015172403478622436, 'loss': 0.010871854192949832, 'td_loss': 0.010871690327394753, 'conservative_loss': 1.6386564821004867e-07, 'time_step': 0.017466281509399415} step=150000
2025-10-30 15:17.03 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251030143821\model_150000.d3


Epoch 16/31: 100%|█| 10000/10000 [02:56<00:00, 56.62it/s, loss=0.0109, td_loss=0.0109, cons


2025-10-30 15:19.59 [info     ] DiscreteCQL_20251030143821: epoch=16 step=160000 epoch=16 metrics={'time_sample_batch': 0.0018739722490310668, 'time_algorithm_update': 0.015090173435211181, 'loss': 0.01089986771212425, 'td_loss': 0.010899692837498151, 'conservative_loss': 1.7487471923232078e-07, 'time_step': 0.017334944438934328} step=160000
2025-10-30 15:20.00 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251030143821\model_160000.d3


Epoch 17/31: 100%|█| 10000/10000 [02:56<00:00, 56.63it/s, loss=0.0109, td_loss=0.0109, cons


2025-10-30 15:22.56 [info     ] DiscreteCQL_20251030143821: epoch=17 step=170000 epoch=17 metrics={'time_sample_batch': 0.0018377918481826782, 'time_algorithm_update': 0.015152265691757202, 'loss': 0.010911483610386494, 'td_loss': 0.010910822573548648, 'conservative_loss': 6.610367447137832e-07, 'time_step': 0.01733392403125763} step=170000
2025-10-30 15:22.56 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251030143821\model_170000.d3


Epoch 18/31: 100%|█| 10000/10000 [02:56<00:00, 56.65it/s, loss=0.0108, td_loss=0.0108, cons


2025-10-30 15:25.53 [info     ] DiscreteCQL_20251030143821: epoch=18 step=180000 epoch=18 metrics={'time_sample_batch': 0.0018468263149261474, 'time_algorithm_update': 0.015165658521652222, 'loss': 0.010785902227903717, 'td_loss': 0.010784899641829543, 'conservative_loss': 1.0025860741734506e-06, 'time_step': 0.017377760195732117} step=180000
2025-10-30 15:25.53 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251030143821\model_180000.d3


Epoch 19/31: 100%|█| 10000/10000 [02:56<00:00, 56.57it/s, loss=0.0109, td_loss=0.0109, cons


2025-10-30 15:28.50 [info     ] DiscreteCQL_20251030143821: epoch=19 step=190000 epoch=19 metrics={'time_sample_batch': 0.0019036011695861817, 'time_algorithm_update': 0.015075558304786681, 'loss': 0.010871003318333532, 'td_loss': 0.010870270001736935, 'conservative_loss': 7.333167828619481e-07, 'time_step': 0.0173563472032547} step=190000
2025-10-30 15:28.50 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251030143821\model_190000.d3


Epoch 20/31: 100%|█| 10000/10000 [02:57<00:00, 56.41it/s, loss=0.0108, td_loss=0.0108, cons


2025-10-30 15:31.47 [info     ] DiscreteCQL_20251030143821: epoch=20 step=200000 epoch=20 metrics={'time_sample_batch': 0.0018900550603866578, 'time_algorithm_update': 0.0152000812292099, 'loss': 0.010847490007826126, 'td_loss': 0.010847461417433805, 'conservative_loss': 2.8590299189090728e-08, 'time_step': 0.01745112612247467} step=200000
2025-10-30 15:31.47 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251030143821\model_200000.d3


Epoch 21/31: 100%|█| 10000/10000 [02:56<00:00, 56.53it/s, loss=0.0108, td_loss=0.0108, cons


2025-10-30 15:34.44 [info     ] DiscreteCQL_20251030143821: epoch=21 step=210000 epoch=21 metrics={'time_sample_batch': 0.0019229011297225952, 'time_algorithm_update': 0.015083593320846557, 'loss': 0.01080768155505648, 'td_loss': 0.010807630998489913, 'conservative_loss': 5.0556729547679424e-08, 'time_step': 0.01737988932132721} step=210000
2025-10-30 15:34.44 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251030143821\model_210000.d3


Epoch 22/31: 100%|█| 10000/10000 [02:56<00:00, 56.53it/s, loss=0.0108, td_loss=0.0108, cons


2025-10-30 15:37.41 [info     ] DiscreteCQL_20251030143821: epoch=22 step=220000 epoch=22 metrics={'time_sample_batch': 0.0019036140441894532, 'time_algorithm_update': 0.015111893486976623, 'loss': 0.01080801080762758, 'td_loss': 0.01080779718956328, 'conservative_loss': 2.1361836697906255e-07, 'time_step': 0.017404121565818786} step=220000
2025-10-30 15:37.41 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251030143821\model_220000.d3


Epoch 23/31: 100%|█| 10000/10000 [02:56<00:00, 56.55it/s, loss=0.0108, td_loss=0.0108, cons


2025-10-30 15:40.38 [info     ] DiscreteCQL_20251030143821: epoch=23 step=230000 epoch=23 metrics={'time_sample_batch': 0.0018525955200195312, 'time_algorithm_update': 0.015170840096473693, 'loss': 0.01075941303629661, 'td_loss': 0.010759310315561016, 'conservative_loss': 1.027208287268877e-07, 'time_step': 0.01740434703826904} step=230000
2025-10-30 15:40.38 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251030143821\model_230000.d3


Epoch 24/31: 100%|█| 10000/10000 [02:56<00:00, 56.75it/s, loss=0.0108, td_loss=0.0108, cons


2025-10-30 15:43.34 [info     ] DiscreteCQL_20251030143821: epoch=24 step=240000 epoch=24 metrics={'time_sample_batch': 0.0019217227935791016, 'time_algorithm_update': 0.015039593982696534, 'loss': 0.010757949696388096, 'td_loss': 0.010757600856153295, 'conservative_loss': 3.488402348011732e-07, 'time_step': 0.01733558979034424} step=240000
2025-10-30 15:43.34 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251030143821\model_240000.d3


Epoch 25/31: 100%|█| 10000/10000 [02:56<00:00, 56.74it/s, loss=0.0107, td_loss=0.0107, cons


2025-10-30 15:46.30 [info     ] DiscreteCQL_20251030143821: epoch=25 step=250000 epoch=25 metrics={'time_sample_batch': 0.0019383145093917846, 'time_algorithm_update': 0.015068482208251953, 'loss': 0.010718259445170407, 'td_loss': 0.010717820094816852, 'conservative_loss': 4.3935051653534176e-07, 'time_step': 0.017364423871040344} step=250000
2025-10-30 15:46.30 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251030143821\model_250000.d3


Epoch 26/31: 100%|█| 10000/10000 [02:56<00:00, 56.82it/s, loss=0.0108, td_loss=0.0108, cons


2025-10-30 15:49.26 [info     ] DiscreteCQL_20251030143821: epoch=26 step=260000 epoch=26 metrics={'time_sample_batch': 0.0018841362476348877, 'time_algorithm_update': 0.01504609522819519, 'loss': 0.010787571688403842, 'td_loss': 0.01078700248192763, 'conservative_loss': 5.692065693438053e-07, 'time_step': 0.017313775253295898} step=260000
2025-10-30 15:49.26 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251030143821\model_260000.d3


Epoch 27/31: 100%|█| 10000/10000 [02:55<00:00, 57.00it/s, loss=0.0108, td_loss=0.0108, cons


2025-10-30 15:52.22 [info     ] DiscreteCQL_20251030143821: epoch=27 step=270000 epoch=27 metrics={'time_sample_batch': 0.0018383010864257813, 'time_algorithm_update': 0.015023188829421998, 'loss': 0.010769774765952025, 'td_loss': 0.010768420125369448, 'conservative_loss': 1.35464096092619e-06, 'time_step': 0.0172542275428772} step=270000
2025-10-30 15:52.22 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251030143821\model_270000.d3


Epoch 28/31: 100%|█| 10000/10000 [02:54<00:00, 57.43it/s, loss=0.0107, td_loss=0.0107, cons


2025-10-30 15:55.16 [info     ] DiscreteCQL_20251030143821: epoch=28 step=280000 epoch=28 metrics={'time_sample_batch': 0.0019049689531326294, 'time_algorithm_update': 0.01484040117263794, 'loss': 0.01072764468161622, 'td_loss': 0.010726463999936823, 'conservative_loss': 1.1806814465671777e-06, 'time_step': 0.0171163724899292} step=280000
2025-10-30 15:55.16 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251030143821\model_280000.d3


Epoch 29/31: 100%|█| 10000/10000 [02:54<00:00, 57.47it/s, loss=0.0107, td_loss=0.0107, cons


2025-10-30 15:58.10 [info     ] DiscreteCQL_20251030143821: epoch=29 step=290000 epoch=29 metrics={'time_sample_batch': 0.0018761574983596802, 'time_algorithm_update': 0.014860567116737366, 'loss': 0.010711822089995257, 'td_loss': 0.010711314086266794, 'conservative_loss': 5.080035247374326e-07, 'time_step': 0.017095691204071044} step=290000
2025-10-30 15:58.10 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251030143821\model_290000.d3


Epoch 30/31: 100%|█| 10000/10000 [02:53<00:00, 57.66it/s, loss=0.0107, td_loss=0.0107, cons


2025-10-30 16:01.03 [info     ] DiscreteCQL_20251030143821: epoch=30 step=300000 epoch=30 metrics={'time_sample_batch': 0.0019646296977996825, 'time_algorithm_update': 0.01470176112651825, 'loss': 0.010723373251478189, 'td_loss': 0.010722285219444893, 'conservative_loss': 1.0880318703129888e-06, 'time_step': 0.017041891741752625} step=300000
2025-10-30 16:01.04 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251030143821\model_300000.d3


Epoch 31/31: 100%|█| 10000/10000 [02:53<00:00, 57.76it/s, loss=0.0107, td_loss=0.0107, cons


2025-10-30 16:03.57 [info     ] DiscreteCQL_20251030143821: epoch=31 step=310000 epoch=31 metrics={'time_sample_batch': 0.0019162012100219727, 'time_algorithm_update': 0.014773324632644653, 'loss': 0.010719549461535643, 'td_loss': 0.010718521325092297, 'conservative_loss': 1.028136210516095e-06, 'time_step': 0.017038668990135192} step=310000
2025-10-30 16:03.57 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251030143821\model_310000.d3

--- RL Model Training Complete ---


In [None]:
# === Task 3, Cell 6: Evaluate (This is the correct, final version) ===

print("--- Starting Off-Policy Evaluation (DiscreteFQE) ---")

# 1. Create the generic FQEConfig object.
#    This assumes Cell 1 has imported FQEConfig.
fqe_config = FQEConfig(
    learning_rate=3e-4,
    reward_scaler=MinMaxRewardScaler()
)

# 2. Create DiscreteFQE, passing the trained algo and the config object.
fqe = DiscreteFQE(
    algo=cql,                  # Your trained CQL agent
    config=fqe_config,         # The config object
    device=use_gpu
)

# 3. Create the evaluator object to be passed in.
initial_state_evaluator = InitialStateValueEstimationEvaluator()

print("DiscreteFQE algorithm initialized. Training FQE model...")

# 4. Train the FQE model AND evaluate it at the same time.
#    The evaluator is passed in the 'evaluators' dictionary.
n_test_steps = test_dataset.size()
fqe.fit(
    test_dataset,
    n_steps=n_test_steps,
    evaluators={
        'policy_value': initial_state_evaluator # Pass the scorer HERE
    }
)

print("DiscreteFQE training complete.")

# 5. Get the final "Estimated Policy Value" from the log history.
#    The fit process saves the result in fqe.log_history
policy_value = fqe.log_history[-1]['policy_value']

print("\n--- RL Model Evaluation Complete ---")
# Get the raw historical average reward
avg_historical_reward_raw = test_dataset.rewards.mean()

# Unscale the policy value
min_reward = rewards.min() # 'rewards' is the unscaled array from Cell 2
max_reward = rewards.max()
policy_value_unscaled = policy_value * (max_reward - min_reward) + min_reward

print(f"   Estimated Policy Value (Unscaled $): ${policy_value_unscaled:.2f}")
print(f"Average Historical Reward (Raw $):    ${avg_historical_reward_raw:.2f}")

--- Starting Off-Policy Evaluation (DiscreteFQE) ---
DiscreteFQE algorithm initialized. Training FQE model...
2025-10-30 16:34.21 [info     ] dataset info                   dataset_info=DatasetInfo(observation_signature=Signature(dtype=[dtype('float64')], shape=[(12,)]), action_signature=Signature(dtype=[dtype('int32')], shape=[(1,)]), reward_signature=Signature(dtype=[dtype('float64')], shape=[(1,)]), action_space=<ActionSpace.DISCRETE: 2>, action_size=2)
2025-10-30 16:34.21 [debug    ] Fitting reward scaler...       reward_scaler=min_max
2025-10-30 16:34.22 [debug    ] Building models...            
2025-10-30 16:34.22 [debug    ] Models have been built.       
2025-10-30 16:34.22 [info     ] Directory is created at d3rlpy_logs\DiscreteFQE_20251030163422
2025-10-30 16:34.22 [info     ] Parameters                     params={'observation_shape': [12], 'action_size': 2, 'config': {'type': 'fqe', 'params': {'batch_size': 100, 'gamma': 0.99, 'observation_scaler': {'type': 'none', 'params

Epoch 1/7: 100%|████████████████████████| 10000/10000 [03:27<00:00, 48.11it/s, loss=0.0111]


2025-10-30 16:40.27 [info     ] DiscreteFQE_20251030163422: epoch=1 step=10000 epoch=1 metrics={'time_sample_batch': 0.005744779515266418, 'time_algorithm_update': 0.013789256811141968, 'loss': 0.011077759804530069, 'time_step': 0.020053118991851807, 'policy_value': 0.7486783862113953} step=10000
2025-10-30 16:40.27 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteFQE_20251030163422\model_10000.d3


Epoch 2/7: 100%|███████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [01:04<00:00, 155.64it/s, loss=0.0102]


2025-10-30 16:42.23 [info     ] DiscreteFQE_20251030163422: epoch=2 step=20000 epoch=2 metrics={'time_sample_batch': 0.0026896746635437013, 'time_algorithm_update': 0.003498509955406189, 'loss': 0.010233220964111387, 'time_step': 0.006355924654006958, 'policy_value': 0.7469315528869629} step=20000
2025-10-30 16:42.23 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteFQE_20251030163422\model_20000.d3


Epoch 3/7: 100%|██████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [01:05<00:00, 153.21it/s, loss=0.00937]


2025-10-30 16:44.20 [info     ] DiscreteFQE_20251030163422: epoch=3 step=30000 epoch=3 metrics={'time_sample_batch': 0.002661634063720703, 'time_algorithm_update': 0.0036179882526397706, 'loss': 0.00937036258103326, 'time_step': 0.006448856854438782, 'policy_value': 0.7373618483543396} step=30000
2025-10-30 16:44.20 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteFQE_20251030163422\model_30000.d3


Epoch 4/7:   9%|████████▍                                                                                | 944/10000 [00:24<03:46, 39.99it/s, loss=0.00891]