In [1]:
import warnings
warnings.filterwarnings("ignore")
from copy import deepcopy
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
import sys

sys.path.append("/code")

from tqdm import tqdm
import torch
# device = torch.device('cpu')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
# import gym
# import recogym

import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim

torch.backends.cudnn.benchmark = torch.cuda.is_available()
if torch.cuda.is_available():
    torch.set_float32_matmul_precision("high")  # TF32 = big speedup on Ada


from sklearn.utils import check_random_state

# implementing OPE of the IPWLearner using synthetic bandit data
from sklearn.linear_model import LogisticRegression

import matplotlib.pyplot as plt

from scipy.special import softmax
import optuna
# from memory_profiler import profile


from estimators import (
    DirectMethod as DM
)

from simulation_utils import (
    eval_policy,
    generate_dataset,
    create_simulation_data_from_pi,
    get_train_data,
    get_opl_results_dict,
    CustomCFDataset,
    calc_reward
)

from models import (    
    CFModel,
    NeighborhoodModel,
    BPRModel, 
    RegressionModel
)

from training_utils import (
    fit_bpr,
    train,
    validation_loop
 )

from custom_losses import (
    SNDRPolicyLoss,
    BPRLoss
    )

random_state=12345
random_ = check_random_state(random_state)

pd.options.display.float_format = '{:,.8f}'.format

Using device: cuda
Using device: cuda
Using device: cuda


## `trainer_trial` Function

This function runs policy learning experiments using offline bandit data and evaluates various estimators.

### Parameters
- **num_runs** (int): Number of experimental runs per training size
- **num_neighbors** (int): Number of neighbors to consider in the neighborhood model
- **num_rounds_list** (list): List of training set sizes to evaluate
- **dataset** (dict): Contains dataset information including embeddings, action probabilities, and reward probabilities
- **batch_size** (int): Batch size for training the policy model
- **num_epochs** (int): Number of training epochs for each experiment
- **lr** (float, default=0.001): Learning rate for the optimizer

### Process Flow
1. Initializes result structures and retrieval models
2. For each training size in `num_rounds_list`:
   - Creates a uniform logging policy and simulates data
   - Generates training data for offline learning
   - Fits regression and neighborhood models for reward estimation
   - Initializes and trains a counterfactual policy model
   - Evaluates policy performance using various estimators
   - Collects metrics on policy reward and embedding quality

### Returns
- **DataFrame**: Results table with rows indexed by training size and columns for various metrics:
  - `policy_rewards`: True expected reward of the learned policy
  - Various estimator errors (`ipw`, `reg_dm`, `conv_dm`, `conv_dr`, `conv_sndr`)
  - Variance metrics for each estimator
  - Embedding quality metrics comparing learned representations to ground truth

### Implementation Notes
- Uses uniform random logging policy for collecting offline data
- Employs Self-Normalized Doubly Robust (SNDR) policy learning
- Measures embedding quality via RMSE to original/ground truth embeddings

In [2]:
def trainer_trial(
    num_runs,
    num_neighbors,
    num_rounds_list,
    dataset,
    batch_size,
    val_size=2000,
    n_trials=10,    
    prev_best_params=None
):
    import torch
    from torch.utils.data import DataLoader

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    torch.backends.cudnn.benchmark = torch.cuda.is_available()
    if torch.cuda.is_available():
        torch.set_float32_matmul_precision("high")

    dm = DM()
    results = {}

    our_x, our_a = dataset["our_x"], dataset["our_a"]
    emb_x, emb_a = dataset["emb_x"], dataset["emb_a"]
    original_x, original_a = dataset["original_x"], dataset["original_a"]
    n_users, n_actions, emb_dim = dataset["n_users"], dataset["n_actions"], dataset["emb_dim"]

    all_user_indices = np.arange(n_users, dtype=np.int64)

    def T(x):
        return torch.as_tensor(x, device=device, dtype=torch.float32)

    best_hyperparams_by_size = {}
    best_reward = -float('inf')
    overall_best_params = {}

    last_best_params = prev_best_params if prev_best_params is not None else None

     # ---- Add baseline row for sample size = 0 ----
    pi_0 = softmax(our_x @ our_a.T, axis=1)
    original_policy_prob = np.expand_dims(pi_0, -1)
    # Use a dummy simulation for baseline
    simulation_data = create_simulation_data_from_pi(
        dataset, pi_0, val_size, random_state=0
    )

    train_data = get_train_data(n_actions, val_size, simulation_data, np.arange(val_size), our_x)
    val_data = get_train_data(n_actions, val_size, simulation_data, np.arange(val_size), our_x)

    regression_model = RegressionModel(
        n_actions=n_actions, action_context=our_x,
        base_model=LogisticRegression(random_state=12345)
    )
    regression_model.fit(train_data['x'], train_data['a'], train_data['r'])

    neighberhoodmodel = NeighborhoodModel(
        train_data['x_idx'], train_data['a'],
        our_a, our_x, train_data['r'],
        num_neighbors=num_neighbors
    )
    scores_all = torch.as_tensor(
        neighberhoodmodel.predict(all_user_indices),
        device=device, dtype=torch.float32
    )
    model = CFModel(
        n_users, n_actions, emb_dim,
        initial_user_embeddings=T(our_x),
        initial_actions_embeddings=T(our_a)
    ).to(device)

    policy = np.expand_dims(softmax(our_x @ our_a.T, axis=1), -1)
    policy_reward = calc_reward(dataset, policy)
    eval_metrics = eval_policy(neighberhoodmodel, val_data, original_policy_prob, policy)
    action_diff_to_real = np.sqrt(np.mean((emb_a - our_a) ** 2))
    action_delta = np.sqrt(np.mean((original_a - our_a) ** 2))
    context_diff_to_real = np.sqrt(np.mean((emb_x - our_x) ** 2))
    context_delta = np.sqrt(np.mean((original_x - our_x) ** 2))

    row = np.concatenate([
        np.atleast_1d(policy_reward),
        np.atleast_1d(eval_metrics),
        np.atleast_1d(action_diff_to_real),
        np.atleast_1d(action_delta),
        np.atleast_1d(context_diff_to_real),
        np.atleast_1d(context_delta)
    ])
    reg_dm = dm.estimate_policy_value(policy[val_data['x_idx']], regression_model.predict(val_data['x']))
    reg_results = np.array([reg_dm])
    conv_results = np.array([row])
    results[0] = get_opl_results_dict(reg_results, conv_results)

    # ---- Main training size loop ----
    for train_size in num_rounds_list:
        # Generate initial data for Optuna search
        pi_0 = softmax(our_x @ our_a.T, axis=1)
        original_policy_prob = np.expand_dims(pi_0, -1)
        simulation_data = create_simulation_data_from_pi(
            dataset, pi_0, train_size + val_size,
            random_state=train_size
        )
        idx = np.arange(train_size)
        train_data = get_train_data(n_actions, train_size, simulation_data, idx, our_x)
        num_workers = 4 if torch.cuda.is_available() else 0
        cf_dataset = CustomCFDataset(
            train_data['x_idx'], train_data['a'], train_data['r'], original_policy_prob
        )

        # Define Optuna objective inside the loop so it can access train_data and cf_dataset
        def objective(trial):
            lr = trial.suggest_float("lr", 1e-4, 1e-2, log=True)
            epochs = trial.suggest_int("num_epochs", 1, 10)
            trial_batch_size = trial.suggest_categorical("batch_size", [64, 128, 256, 512])
            trial_num_neighbors = trial.suggest_int("num_neighbors", 3, 15)
            lr_decay = trial.suggest_float("lr_decay", 0.8, 1.0)

            trial_neigh_model = NeighborhoodModel(
                train_data['x_idx'], train_data['a'],
                our_a, our_x, train_data['r'],
                num_neighbors=trial_num_neighbors
            )
            trial_scores_all = torch.as_tensor(
                trial_neigh_model.predict(all_user_indices),
                device=device, dtype=torch.float32
            )
            trial_model = CFModel(
                n_users, n_actions, emb_dim,
                initial_user_embeddings=T(our_x),
                initial_actions_embeddings=T(our_a)
            ).to(device)
            assert (not torch.cuda.is_available()) or next(trial_model.parameters()).is_cuda

            final_train_loader = DataLoader(
                cf_dataset, batch_size=trial_batch_size, shuffle=True,
                pin_memory=torch.cuda.is_available(),
                num_workers=num_workers, persistent_workers=bool(num_workers)
            )

            current_lr = lr
            for epoch in range(epochs):
                if epoch > 0:
                    current_lr *= lr_decay
                train(
                    trial_model, final_train_loader, trial_neigh_model, trial_scores_all,
                    criterion=SNDRPolicyLoss(), num_epochs=1, lr=current_lr, device=str(device)
                )

            trial_x_t, trial_a_t = trial_model.get_params()
            trial_x = trial_x_t.detach().cpu().numpy()
            trial_a = trial_a_t.detach().cpu().numpy()
            trial_policy = np.expand_dims(softmax(trial_x @ trial_a.T, axis=1), -1)
            trial_policy_reward = calc_reward(dataset, trial_policy)
            return trial_policy_reward

        # ---- Hyperparam search ----
        study = optuna.create_study(direction="maximize")
        if last_best_params is not None:
            study.enqueue_trial(last_best_params)
        study.optimize(objective, n_trials=n_trials, show_progress_bar=True)

        best_params = study.best_params
        best_reward_for_size = study.best_value
        best_hyperparams_by_size[train_size] = {
            "params": best_params,
            "reward": best_reward_for_size
        }
        last_best_params = best_params

        # ---- Final evaluation loop ----
        reg_results, conv_results = [], []
        for run in range(num_runs):
            pi_0 = softmax(our_x @ our_a.T, axis=1)
            original_policy_prob = np.expand_dims(pi_0, -1)
            simulation_data = create_simulation_data_from_pi(
                dataset, pi_0, train_size + val_size,
                random_state=(run + 1) * train_size
            )
            idx = np.arange(train_size)
            train_data = get_train_data(n_actions, train_size, simulation_data, idx, our_x)
            val_data = get_train_data(n_actions, val_size, simulation_data, np.arange(val_size) + train_size, our_x)

            regression_model = RegressionModel(
                n_actions=n_actions, action_context=our_x,
                base_model=LogisticRegression(random_state=12345)
            )
            regression_model.fit(
                train_data['x'], train_data['a'], train_data['r'],
                original_policy_prob[train_data['x_idx'], train_data['a']].squeeze()
            )

            neighberhoodmodel = NeighborhoodModel(
                train_data['x_idx'], train_data['a'],
                our_a, our_x, train_data['r'],
                num_neighbors=best_params['num_neighbors']
            )
            scores_all = torch.as_tensor(
                neighberhoodmodel.predict(all_user_indices),
                device=device, dtype=torch.float32
            )
            model = CFModel(
                n_users, n_actions, emb_dim,
                initial_user_embeddings=T(our_x),
                initial_actions_embeddings=T(our_a)
            ).to(device)
            assert (not torch.cuda.is_available()) or next(model.parameters()).is_cuda

            cf_dataset = CustomCFDataset(
                train_data['x_idx'], train_data['a'], train_data['r'], original_policy_prob
            )
            train_loader = DataLoader(
                cf_dataset, batch_size=batch_size, shuffle=True,
                pin_memory=torch.cuda.is_available(),
                num_workers=num_workers, persistent_workers=bool(num_workers)
            )

            val_dataset = CustomCFDataset(
                val_data['x_idx'], val_data['a'], val_data['r'], original_policy_prob
            )
            val_loader = DataLoader(
                val_dataset, batch_size=val_size, shuffle=False,
                pin_memory=torch.cuda.is_available(),
                num_workers=num_workers, persistent_workers=bool(num_workers)
            )

            current_lr = best_params['lr']
            for epoch in range(best_params['num_epochs']):
                if epoch > 0:
                    current_lr *= best_params['lr_decay']
                train(
                    model, train_loader, neighberhoodmodel, scores_all,
                    criterion=SNDRPolicyLoss(),
                    num_epochs=1, lr=current_lr,
                    device=str(device)
                )

            our_x_t, our_a_t = model.get_params()
            our_a, our_x = our_a_t.detach().cpu().numpy(), our_x_t.detach().cpu().numpy()
            policy = np.expand_dims(softmax(our_x @ our_a.T, axis=1), -1)
            policy_reward = calc_reward(dataset, policy)
            eval_metrics = eval_policy(neighberhoodmodel, train_data, original_policy_prob, policy)
            action_diff_to_real = np.sqrt(np.mean((emb_a - our_a) ** 2))
            action_delta = np.sqrt(np.mean((original_a - our_a) ** 2))
            context_diff_to_real = np.sqrt(np.mean((emb_x - our_x) ** 2))
            context_delta = np.sqrt(np.mean((original_x - our_x) ** 2))

            row = np.concatenate([
                np.atleast_1d(policy_reward),
                np.atleast_1d(eval_metrics),
                np.atleast_1d(action_diff_to_real),
                np.atleast_1d(action_delta),
                np.atleast_1d(context_diff_to_real),
                np.atleast_1d(context_delta)
            ])
            conv_results.append(row)

        torch.cuda.empty_cache()
        reg_results = np.array(reg_results)
        conv_results = np.array(conv_results)
        results[train_size] = get_opl_results_dict(reg_results, conv_results)

    return pd.DataFrame.from_dict(results, orient='index'), best_hyperparams_by_size

## Learning

We will run several simulations on a generated dataset, the dataset is generated like this:
$$ \text{We have users U and actions A } u_i \sim N(0, I_{emb_dim}) \ a_i \sim N(0, I_{emb_dim})$$
$$ p_{ij} = 1 / (5 + e^{-(u_i.T a_j)}) $$
$$r_{ij} \sim Bin(p_{ij})$$

We have a policy $\pi$
and it's ground truth reward is calculated by
$$R_{gt} = \sum_{i}{\sum_{j}{\pi_{ij} * p_{ij}}} $$

Our parameters for the dataset will be
$$EmbDim = 5$$
$$NumActions= 150$$
$$NumUsers = 150$$
$$NeighborhoodSize = 6$$

to learn a new policy from $\pi$ we will sample from:
$$\pi_{start} = (1-\epsilon)*\pi + \epsilon * \pi_{random}$$

In [3]:
dataset_params = dict(
                    n_actions= 500,
                    n_users = 500,
                    emb_dim = 16,
                    # sigma = 0.1,
                    eps = 0.6, # this is the epsilon for the noise in the ground truth policy representation
                    ctr = 0.2
                    )

train_dataset = generate_dataset(dataset_params)

Random Item CTR: 0.12972795060603162
Optimal greedy CTR: 0.19999707792821972
Optimal Stochastic CTR: 0.19982996880994605
Our Initial CTR: 0.1646085673501415


In [4]:
train_dataset.keys()

dict_keys(['emb_a', 'our_a', 'original_a', 'emb_x', 'our_x', 'original_x', 'q_x_a', 'n_actions', 'n_users', 'emb_dim', 'user_prior'])

In [5]:
num_runs = 2
batch_size = 200
num_neighbors = 6
n_trials_for_optuna = 20
num_rounds_list = [500, 1000] #, 5000, 10000]

# Manually define your best parameters
best_params_to_use = {
    "lr": 0.002,  # Learning rate
    "num_epochs": 5,  # Number of training epochs
    "batch_size": 256,  # Batch size for training
    "num_neighbors": 8,  # Number of neighbors for neighborhood model
    "lr_decay": 0.9  # Learning rate decay factor
}

### 1

$$emb = 0.7 * gt + 0.3 * noise$$
$$lr = 0.005$$
$$n_{epochs} = 1$$
$$BatchSize=50$$

In [6]:
print("Value of num_rounds_list:", num_rounds_list)

# Run the optimization
df4, best_hyperparams_by_size = trainer_trial(num_runs, num_neighbors, num_rounds_list, train_dataset, batch_size, val_size=2000, n_trials=n_trials_for_optuna,prev_best_params=best_params_to_use)

# Print best hyperparameters for each training size
print("\n=== BEST HYPERPARAMETERS BY TRAINING SIZE ===")
for train_size, params in best_hyperparams_by_size.items():
    print(f"\nTraining Size: {train_size}")
    print(f"Best Reward: {params['reward']:.6f}")
    print("Parameters:")
    for param_name, value in params['params'].items():
        print(f"  {param_name}: {value}")
print("===========================\n")

# Show the performance metrics
df4[['policy_rewards', 'ipw', 'reg_dm', 'conv_dm', 'conv_dr', 'conv_sndr', 'action_diff_to_real', 'action_delta', 'context_diff_to_real', 'context_delta']]



Value of num_rounds_list: [500, 1000]


[I 2025-08-21 00:02:56,760] A new study created in memory with name: no-name-1da3e89c-e2c1-46d2-bd0f-c0bb83fb27ee
Best trial: 0. Best value: 0.164715:   5%|▌         | 1/20 [00:03<00:58,  3.07s/it]

[I 2025-08-21 00:02:59,829] Trial 0 finished with value: 0.1647152159909 and parameters: {'lr': 0.002, 'num_epochs': 5, 'batch_size': 256, 'num_neighbors': 8, 'lr_decay': 0.9}. Best is trial 0 with value: 0.1647152159909.


Best trial: 0. Best value: 0.164715:  10%|█         | 2/20 [00:05<00:49,  2.75s/it]

[I 2025-08-21 00:03:02,362] Trial 1 finished with value: 0.1646274224154382 and parameters: {'lr': 0.00020811128190312416, 'num_epochs': 8, 'batch_size': 256, 'num_neighbors': 13, 'lr_decay': 0.9627563310517935}. Best is trial 0 with value: 0.1647152159909.


Best trial: 0. Best value: 0.164715:  15%|█▌        | 3/20 [00:08<00:45,  2.65s/it]

[I 2025-08-21 00:03:04,882] Trial 2 finished with value: 0.1646327034931041 and parameters: {'lr': 0.00029610228969303504, 'num_epochs': 4, 'batch_size': 128, 'num_neighbors': 9, 'lr_decay': 0.8831152056434768}. Best is trial 0 with value: 0.1647152159909.


Best trial: 0. Best value: 0.164715:  20%|██        | 4/20 [00:11<00:49,  3.10s/it]

[I 2025-08-21 00:03:08,675] Trial 3 finished with value: 0.16464814359028 and parameters: {'lr': 0.0007364484774462089, 'num_epochs': 7, 'batch_size': 256, 'num_neighbors': 7, 'lr_decay': 0.8339882051168946}. Best is trial 0 with value: 0.1647152159909.


Best trial: 4. Best value: 0.165308:  25%|██▌       | 5/20 [00:14<00:44,  2.94s/it]

[I 2025-08-21 00:03:11,323] Trial 4 finished with value: 0.165308163919195 and parameters: {'lr': 0.0031687774077433523, 'num_epochs': 10, 'batch_size': 128, 'num_neighbors': 9, 'lr_decay': 0.9117642707313017}. Best is trial 4 with value: 0.165308163919195.


Best trial: 4. Best value: 0.165308:  30%|███       | 6/20 [00:17<00:38,  2.77s/it]

[I 2025-08-21 00:03:13,762] Trial 5 finished with value: 0.16461001229095254 and parameters: {'lr': 0.00012743651129106193, 'num_epochs': 1, 'batch_size': 256, 'num_neighbors': 4, 'lr_decay': 0.9725071263587736}. Best is trial 4 with value: 0.165308163919195.


Best trial: 4. Best value: 0.165308:  35%|███▌      | 7/20 [00:19<00:35,  2.71s/it]

[I 2025-08-21 00:03:16,364] Trial 6 finished with value: 0.16471065737444696 and parameters: {'lr': 0.0004096385404112414, 'num_epochs': 5, 'batch_size': 64, 'num_neighbors': 14, 'lr_decay': 0.9836749695792514}. Best is trial 4 with value: 0.165308163919195.


Best trial: 4. Best value: 0.165308:  40%|████      | 8/20 [00:22<00:31,  2.63s/it]

[I 2025-08-21 00:03:18,821] Trial 7 finished with value: 0.16461108213084957 and parameters: {'lr': 0.000250538546829754, 'num_epochs': 1, 'batch_size': 256, 'num_neighbors': 6, 'lr_decay': 0.8500475108143213}. Best is trial 4 with value: 0.165308163919195.


Best trial: 4. Best value: 0.165308:  45%|████▌     | 9/20 [00:24<00:28,  2.61s/it]

[I 2025-08-21 00:03:21,368] Trial 8 finished with value: 0.16495179063307494 and parameters: {'lr': 0.005088326254713456, 'num_epochs': 5, 'batch_size': 256, 'num_neighbors': 13, 'lr_decay': 0.9284784722078744}. Best is trial 4 with value: 0.165308163919195.


Best trial: 4. Best value: 0.165308:  50%|█████     | 10/20 [00:27<00:26,  2.60s/it]

[I 2025-08-21 00:03:23,961] Trial 9 finished with value: 0.1646373785783598 and parameters: {'lr': 0.00024026327992702996, 'num_epochs': 6, 'batch_size': 128, 'num_neighbors': 15, 'lr_decay': 0.9470860644865813}. Best is trial 4 with value: 0.165308163919195.


Best trial: 4. Best value: 0.165308:  55%|█████▌    | 11/20 [00:29<00:23,  2.61s/it]

[I 2025-08-21 00:03:26,573] Trial 10 finished with value: 0.1651151790586645 and parameters: {'lr': 0.00865131694973534, 'num_epochs': 10, 'batch_size': 512, 'num_neighbors': 11, 'lr_decay': 0.8658024592703468}. Best is trial 4 with value: 0.165308163919195.


Best trial: 4. Best value: 0.165308:  60%|██████    | 12/20 [00:32<00:20,  2.59s/it]

[I 2025-08-21 00:03:29,144] Trial 11 finished with value: 0.1652399428074737 and parameters: {'lr': 0.009851900546672098, 'num_epochs': 10, 'batch_size': 512, 'num_neighbors': 10, 'lr_decay': 0.8754700661724174}. Best is trial 4 with value: 0.165308163919195.


Best trial: 4. Best value: 0.165308:  65%|██████▌   | 13/20 [00:35<00:18,  2.60s/it]

[I 2025-08-21 00:03:31,760] Trial 12 finished with value: 0.1647663458001018 and parameters: {'lr': 0.002617456646444264, 'num_epochs': 10, 'batch_size': 512, 'num_neighbors': 11, 'lr_decay': 0.9266703022961029}. Best is trial 4 with value: 0.165308163919195.


Best trial: 13. Best value: 0.166556:  70%|███████   | 14/20 [00:37<00:15,  2.61s/it]

[I 2025-08-21 00:03:34,380] Trial 13 finished with value: 0.1665564714492538 and parameters: {'lr': 0.009681918372888688, 'num_epochs': 9, 'batch_size': 128, 'num_neighbors': 11, 'lr_decay': 0.8124880219948764}. Best is trial 13 with value: 0.1665564714492538.


Best trial: 13. Best value: 0.166556:  75%|███████▌  | 15/20 [00:40<00:12,  2.59s/it]

[I 2025-08-21 00:03:36,946] Trial 14 finished with value: 0.1649904197483568 and parameters: {'lr': 0.0031026426662942685, 'num_epochs': 8, 'batch_size': 128, 'num_neighbors': 5, 'lr_decay': 0.8067089282094644}. Best is trial 13 with value: 0.1665564714492538.


Best trial: 13. Best value: 0.166556:  80%|████████  | 16/20 [00:43<00:11,  2.96s/it]

[I 2025-08-21 00:03:40,741] Trial 15 finished with value: 0.16473918306170815 and parameters: {'lr': 0.0011880159870345914, 'num_epochs': 8, 'batch_size': 128, 'num_neighbors': 11, 'lr_decay': 0.8102490115202625}. Best is trial 13 with value: 0.1665564714492538.


Best trial: 13. Best value: 0.166556:  85%|████████▌ | 17/20 [00:46<00:08,  2.85s/it]

[I 2025-08-21 00:03:43,348] Trial 16 finished with value: 0.1657259297041133 and parameters: {'lr': 0.004884096835657089, 'num_epochs': 9, 'batch_size': 128, 'num_neighbors': 3, 'lr_decay': 0.9098780431710485}. Best is trial 13 with value: 0.1665564714492538.


Best trial: 13. Best value: 0.166556:  90%|█████████ | 18/20 [00:49<00:05,  2.80s/it]

[I 2025-08-21 00:03:46,024] Trial 17 finished with value: 0.16635804513835278 and parameters: {'lr': 0.005259336695703555, 'num_epochs': 9, 'batch_size': 64, 'num_neighbors': 3, 'lr_decay': 0.8347363521961699}. Best is trial 13 with value: 0.1665564714492538.


Best trial: 13. Best value: 0.166556:  95%|█████████▌| 19/20 [00:51<00:02,  2.74s/it]

[I 2025-08-21 00:03:48,644] Trial 18 finished with value: 0.16623014300943686 and parameters: {'lr': 0.005485770914951094, 'num_epochs': 7, 'batch_size': 64, 'num_neighbors': 3, 'lr_decay': 0.8315209582655346}. Best is trial 13 with value: 0.1665564714492538.


Best trial: 13. Best value: 0.166556: 100%|██████████| 20/20 [00:54<00:00,  2.72s/it]


[I 2025-08-21 00:03:51,222] Trial 19 finished with value: 0.16476246437476835 and parameters: {'lr': 0.001303473762292282, 'num_epochs': 3, 'batch_size': 64, 'num_neighbors': 7, 'lr_decay': 0.8284944671309039}. Best is trial 13 with value: 0.1665564714492538.


[I 2025-08-21 00:03:55,335] A new study created in memory with name: no-name-136fcfba-f7b6-490e-a34b-0f8e7904b33d
Best trial: 0. Best value: 0.169047:   5%|▌         | 1/20 [00:03<00:57,  3.01s/it]

[I 2025-08-21 00:03:58,347] Trial 0 finished with value: 0.16904677940041388 and parameters: {'lr': 0.009681918372888688, 'num_epochs': 9, 'batch_size': 128, 'num_neighbors': 11, 'lr_decay': 0.8124880219948764}. Best is trial 0 with value: 0.16904677940041388.


Best trial: 0. Best value: 0.169047:  10%|█         | 2/20 [00:05<00:53,  2.95s/it]

[I 2025-08-21 00:04:01,260] Trial 1 finished with value: 0.1676534878443323 and parameters: {'lr': 0.009773055353996451, 'num_epochs': 10, 'batch_size': 512, 'num_neighbors': 7, 'lr_decay': 0.9353732431489278}. Best is trial 0 with value: 0.16904677940041388.


Best trial: 0. Best value: 0.169047:  15%|█▌        | 3/20 [00:08<00:49,  2.91s/it]

[I 2025-08-21 00:04:04,113] Trial 2 finished with value: 0.16717954127352563 and parameters: {'lr': 0.004351365553044151, 'num_epochs': 4, 'batch_size': 128, 'num_neighbors': 8, 'lr_decay': 0.9630207755040227}. Best is trial 0 with value: 0.16904677940041388.


Best trial: 0. Best value: 0.169047:  20%|██        | 4/20 [00:11<00:46,  2.91s/it]

[I 2025-08-21 00:04:07,034] Trial 3 finished with value: 0.16690412333676105 and parameters: {'lr': 0.0013106535343548199, 'num_epochs': 8, 'batch_size': 256, 'num_neighbors': 10, 'lr_decay': 0.8322305737508512}. Best is trial 0 with value: 0.16904677940041388.


Best trial: 0. Best value: 0.169047:  25%|██▌       | 5/20 [00:15<00:50,  3.35s/it]

[I 2025-08-21 00:04:11,152] Trial 4 finished with value: 0.16687322905444915 and parameters: {'lr': 0.0009669480244530776, 'num_epochs': 9, 'batch_size': 512, 'num_neighbors': 15, 'lr_decay': 0.8433822327164343}. Best is trial 0 with value: 0.16904677940041388.


Best trial: 5. Best value: 0.17054:  30%|███       | 6/20 [00:18<00:45,  3.26s/it] 

[I 2025-08-21 00:04:14,228] Trial 5 finished with value: 0.17054006579236203 and parameters: {'lr': 0.0072870095422992685, 'num_epochs': 9, 'batch_size': 128, 'num_neighbors': 9, 'lr_decay': 0.9650591972062041}. Best is trial 5 with value: 0.17054006579236203.


Best trial: 6. Best value: 0.171447:  35%|███▌      | 7/20 [00:22<00:41,  3.22s/it]

[I 2025-08-21 00:04:17,361] Trial 6 finished with value: 0.17144690158546425 and parameters: {'lr': 0.007213385884565424, 'num_epochs': 9, 'batch_size': 64, 'num_neighbors': 5, 'lr_decay': 0.8757335518577117}. Best is trial 6 with value: 0.17144690158546425.


Best trial: 6. Best value: 0.171447:  40%|████      | 8/20 [00:25<00:37,  3.15s/it]

[I 2025-08-21 00:04:20,379] Trial 7 finished with value: 0.16688617940939598 and parameters: {'lr': 0.00013438881151394257, 'num_epochs': 9, 'batch_size': 128, 'num_neighbors': 15, 'lr_decay': 0.9698792007067164}. Best is trial 6 with value: 0.17144690158546425.


Best trial: 6. Best value: 0.171447:  45%|████▌     | 9/20 [00:27<00:33,  3.05s/it]

[I 2025-08-21 00:04:23,207] Trial 8 finished with value: 0.16687713466009757 and parameters: {'lr': 0.0013827241709540767, 'num_epochs': 7, 'batch_size': 512, 'num_neighbors': 7, 'lr_decay': 0.8222591203643104}. Best is trial 6 with value: 0.17144690158546425.


Best trial: 6. Best value: 0.171447:  50%|█████     | 10/20 [00:30<00:29,  2.98s/it]

[I 2025-08-21 00:04:26,013] Trial 9 finished with value: 0.16689054543713155 and parameters: {'lr': 0.00255056035290449, 'num_epochs': 6, 'batch_size': 512, 'num_neighbors': 8, 'lr_decay': 0.8435150940324251}. Best is trial 6 with value: 0.17144690158546425.


Best trial: 6. Best value: 0.171447:  55%|█████▌    | 11/20 [00:33<00:26,  2.90s/it]

[I 2025-08-21 00:04:28,753] Trial 10 finished with value: 0.16688398815341146 and parameters: {'lr': 0.00033684774860401665, 'num_epochs': 1, 'batch_size': 64, 'num_neighbors': 3, 'lr_decay': 0.8881159495586852}. Best is trial 6 with value: 0.17144690158546425.


Best trial: 6. Best value: 0.171447:  60%|██████    | 12/20 [00:36<00:23,  2.89s/it]

[I 2025-08-21 00:04:31,597] Trial 11 finished with value: 0.16747801062221057 and parameters: {'lr': 0.004357321801325476, 'num_epochs': 3, 'batch_size': 64, 'num_neighbors': 4, 'lr_decay': 0.8983465908981283}. Best is trial 6 with value: 0.17144690158546425.


Best trial: 6. Best value: 0.171447:  65%|██████▌   | 13/20 [00:39<00:20,  2.94s/it]

[I 2025-08-21 00:04:34,651] Trial 12 finished with value: 0.16892621406211028 and parameters: {'lr': 0.004318475711603302, 'num_epochs': 7, 'batch_size': 64, 'num_neighbors': 5, 'lr_decay': 0.9322575580774413}. Best is trial 6 with value: 0.17144690158546425.


Best trial: 6. Best value: 0.171447:  70%|███████   | 14/20 [00:42<00:17,  2.94s/it]

[I 2025-08-21 00:04:37,608] Trial 13 finished with value: 0.16829960348887513 and parameters: {'lr': 0.009998338171459962, 'num_epochs': 10, 'batch_size': 256, 'num_neighbors': 12, 'lr_decay': 0.8724327398080602}. Best is trial 6 with value: 0.17144690158546425.


Best trial: 6. Best value: 0.171447:  75%|███████▌  | 15/20 [00:45<00:14,  2.93s/it]

[I 2025-08-21 00:04:40,494] Trial 14 finished with value: 0.16691242693163796 and parameters: {'lr': 0.00046924508013633314, 'num_epochs': 5, 'batch_size': 128, 'num_neighbors': 5, 'lr_decay': 0.987133488227082}. Best is trial 6 with value: 0.17144690158546425.


Best trial: 6. Best value: 0.171447:  80%|████████  | 16/20 [00:49<00:13,  3.34s/it]

[I 2025-08-21 00:04:44,810] Trial 15 finished with value: 0.16754333069779678 and parameters: {'lr': 0.002493489030982789, 'num_epochs': 7, 'batch_size': 64, 'num_neighbors': 13, 'lr_decay': 0.9291465988503248}. Best is trial 6 with value: 0.17144690158546425.


Best trial: 6. Best value: 0.171447:  85%|████████▌ | 17/20 [00:52<00:09,  3.25s/it]

[I 2025-08-21 00:04:47,829] Trial 16 finished with value: 0.16797458719844147 and parameters: {'lr': 0.0058813229741217395, 'num_epochs': 8, 'batch_size': 128, 'num_neighbors': 6, 'lr_decay': 0.8688608044705155}. Best is trial 6 with value: 0.17144690158546425.


Best trial: 6. Best value: 0.171447:  90%|█████████ | 18/20 [00:55<00:06,  3.23s/it]

[I 2025-08-21 00:04:51,029] Trial 17 finished with value: 0.16788977993947404 and parameters: {'lr': 0.0022847416882829956, 'num_epochs': 10, 'batch_size': 64, 'num_neighbors': 9, 'lr_decay': 0.9206237369199889}. Best is trial 6 with value: 0.17144690158546425.


Best trial: 6. Best value: 0.171447:  95%|█████████▌| 19/20 [00:58<00:03,  3.13s/it]

[I 2025-08-21 00:04:53,909] Trial 18 finished with value: 0.1668890853127018 and parameters: {'lr': 0.0006270253671676676, 'num_epochs': 5, 'batch_size': 256, 'num_neighbors': 10, 'lr_decay': 0.9569099411109969}. Best is trial 6 with value: 0.17144690158546425.


Best trial: 6. Best value: 0.171447: 100%|██████████| 20/20 [01:01<00:00,  3.08s/it]


[I 2025-08-21 00:04:56,874] Trial 19 finished with value: 0.16688721176002486 and parameters: {'lr': 0.00016823385111369444, 'num_epochs': 8, 'batch_size': 128, 'num_neighbors': 4, 'lr_decay': 0.9948149158035802}. Best is trial 6 with value: 0.17144690158546425.

=== BEST HYPERPARAMETERS BY TRAINING SIZE ===

Training Size: 500
Best Reward: 0.166556
Parameters:
  lr: 0.009681918372888688
  num_epochs: 9
  batch_size: 128
  num_neighbors: 11
  lr_decay: 0.8124880219948764

Training Size: 1000
Best Reward: 0.171447
Parameters:
  lr: 0.007213385884565424
  num_epochs: 9
  batch_size: 64
  num_neighbors: 5
  lr_decay: 0.8757335518577117



Unnamed: 0,policy_rewards,ipw,reg_dm,conv_dm,conv_dr,conv_sndr,action_diff_to_real,action_delta,context_diff_to_real,context_delta
0,0.16460857,0.18648377,0.17616646,0.18441898,0.18191963,0.17284784,0.7569287,0.0,0.87627132,0.0
500,0.16634903,0.20744115,,0.17381334,0.17509877,0.18550551,0.76442025,0.13951846,0.87814654,0.05867832
1000,0.16852269,0.20308214,,0.16935521,0.17195164,0.18671154,0.79439417,0.30102437,0.88687834,0.10292054
