In [1]:
import warnings
warnings.filterwarnings("ignore")
from copy import deepcopy
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
import sys

sys.path.append("/code")

from tqdm import tqdm
import torch
# device = torch.device('cpu')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
# import gym
# import recogym

import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim

torch.backends.cudnn.benchmark = torch.cuda.is_available()
if torch.cuda.is_available():
    torch.set_float32_matmul_precision("high")  # TF32 = big speedup on Ada


from sklearn.utils import check_random_state

# implementing OPE of the IPWLearner using synthetic bandit data
from sklearn.linear_model import LogisticRegression

import matplotlib.pyplot as plt

from scipy.special import softmax
import optuna
# from memory_profiler import profile


from estimators import (
    DirectMethod as DM
)

from simulation_utils import (
    eval_policy,
    generate_dataset,
    create_simulation_data_from_pi,
    get_train_data,
    get_opl_results_dict,
    CustomCFDataset,
    calc_reward,
    get_weights_info
)

from models import (    
    LinearCFModel,
    NeighborhoodModel,
    BPRModel, 
    RegressionModel
)

from training_utils import (
    train,
    validation_loop, 
    cv_score_model
 )

from custom_losses import (
    SNDRPolicyLoss,
    IPWPolicyLoss, 
    KLPolicyLoss
    )

random_state=12345
random_ = check_random_state(random_state)

pd.options.display.float_format = '{:,.8f}'.format

Using device: cuda
Using device: cuda
Using device: cuda


In [2]:
def get_trial_results(
    our_x, 
    our_a, 
    emb_x, 
    emb_a, 
    original_x, 
    original_a, 
    dataset, 
    val_data, 
    original_policy_prob, 
    neighberhoodmodel, 
    regression_model, 
    dm
):
    policy = np.expand_dims(softmax(our_x @ our_a.T, axis=1), -1)
    policy_reward = calc_reward(dataset, policy)
    eval_metrics = eval_policy(neighberhoodmodel, val_data, original_policy_prob, policy)
    action_diff_to_real = np.sqrt(np.mean((emb_a - our_a) ** 2))
    action_delta = np.sqrt(np.mean((original_a - our_a) ** 2))
    context_diff_to_real = np.sqrt(np.mean((emb_x - our_x) ** 2))
    context_delta = np.sqrt(np.mean((original_x - our_x) ** 2))

    row = np.concatenate([
        np.atleast_1d(policy_reward),
        np.atleast_1d(eval_metrics),
        np.atleast_1d(action_diff_to_real),
        np.atleast_1d(action_delta),
        np.atleast_1d(context_diff_to_real),
        np.atleast_1d(context_delta)
    ])
    reg_dm = dm.estimate_policy_value(policy[val_data['x_idx']], regression_model.predict(val_data['x']))
    reg_results = np.array([reg_dm])
    conv_results = np.array([row])
    return get_opl_results_dict(reg_results, conv_results)

## `trainer_trial` Function

This function runs policy learning experiments using offline bandit data and evaluates various estimators.

### Parameters
- **num_runs** (int): Number of experimental runs per training size
- **num_neighbors** (int): Number of neighbors to consider in the neighborhood model
- **num_rounds_list** (list): List of training set sizes to evaluate
- **dataset** (dict): Contains dataset information including embeddings, action probabilities, and reward probabilities
- **batch_size** (int): Batch size for training the policy model
- **num_epochs** (int): Number of training epochs for each experiment
- **lr** (float, default=0.001): Learning rate for the optimizer

### Process Flow
1. Initializes result structures and retrieval models
2. For each training size in `num_rounds_list`:
   - Creates a uniform logging policy and simulates data
   - Generates training data for offline learning
   - Fits regression and neighborhood models for reward estimation
   - Initializes and trains a counterfactual policy model
   - Evaluates policy performance using various estimators
   - Collects metrics on policy reward and embedding quality

### Returns
- **DataFrame**: Results table with rows indexed by training size and columns for various metrics:
  - `policy_rewards`: True expected reward of the learned policy
  - Various estimator errors (`ipw`, `reg_dm`, `conv_dm`, `conv_dr`, `conv_sndr`)
  - Variance metrics for each estimator
  - Embedding quality metrics comparing learned representations to ground truth

### Implementation Notes
- Uses uniform random logging policy for collecting offline data
- Employs Self-Normalized Doubly Robust (SNDR) policy learning
- Measures embedding quality via RMSE to original/ground truth embeddings

In [3]:
def trainer_trial(
    num_runs,
    num_neighbors,
    train_sizes,
    dataset,
    batch_size,
    val_size=2000,
    n_trials=10,    
    prev_best_params=None
):

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    torch.backends.cudnn.benchmark = torch.cuda.is_available()
    if torch.cuda.is_available():
        torch.set_float32_matmul_precision("high")

    dm = DM()
    results = {}

    our_x, our_a = dataset["our_x"], dataset["our_a"]
    emb_x, emb_a = dataset["emb_x"], dataset["emb_a"]

    original_x, original_a = dataset["original_x"], dataset["original_a"]
    n_users, n_actions, emb_dim = dataset["n_users"], dataset["n_actions"], dataset["emb_dim"]

    all_user_indices = np.arange(n_users, dtype=np.int64)

    def T(x):
        return torch.as_tensor(x, device=device, dtype=torch.float32)

    def _mean_dict(dicts):
        """
        Robust mean over a list of dicts with numeric/scalar/1D-array values.
        Returns a single dict with elementwise means.
        """
        if not dicts:
            return {}
        keys = dicts[0].keys()
        out = {}
        for k in keys:
            vals = [d[k] for d in dicts if k in d]
            # try to convert each to np.array and average
            arrs = [np.asarray(v) for v in vals]
            # broadcast to same shape if scalars/1D
            stacked = np.stack(arrs, axis=0)
            out[k] = np.mean(stacked, axis=0)
        return out

    # ===== unpack dataset (keep originals safe) =====
    our_x_orig, our_a_orig = our_x, our_a
    emb_x, emb_a = emb_x, emb_a
    original_x, original_a = original_x, original_a
    n_users, n_actions, emb_dim = n_users, n_actions, emb_dim
    all_user_indices = np.arange(n_users, dtype=np.int64)

    dm = DM()
    results = {}
    best_hyperparams_by_size = {}
    last_best_params = prev_best_params if prev_best_params is not None else None

    # ===== baseline (sample size = 0) using get_trial_results =====
    pi_0 = softmax(our_x_orig @ our_a_orig.T, axis=1)
    original_policy_prob = np.expand_dims(pi_0, -1)

    simulation_data = create_simulation_data_from_pi(
        dataset, pi_0, val_size, random_state=0
    )

    # use same data for train/val just to generate the baseline row
    train_data = get_train_data(n_actions, val_size, simulation_data, np.arange(val_size), our_x_orig)
    val_data   = get_train_data(n_actions, val_size, simulation_data, np.arange(val_size), our_x_orig)

    regression_model = RegressionModel(
        n_actions=n_actions, action_context=our_x_orig,
        base_model=LogisticRegression(random_state=12345)
    )

    regression_model.fit(train_data['x'], train_data['a'], train_data['r'])

    neighberhoodmodel = NeighborhoodModel(
        train_data['x_idx'], train_data['a'],
        our_a_orig, our_x_orig, train_data['r'],
        num_neighbors=num_neighbors
    )

    # baseline row produced via get_trial_results
    results[0] = get_trial_results(
        our_x_orig, our_a_orig, emb_x, emb_a, original_x, original_a,
        dataset, val_data, original_policy_prob,
        neighberhoodmodel, regression_model, dm
    )

    # ===== main loop over training sizes =====
    for train_size in train_sizes:

        # we’ll collect per-run trial dicts generated by get_trial_results
        trial_dicts_this_size = []
        best_hyperparams_by_size[train_size] = {}

        # --- prepare a resampling for Optuna’s objective (shared loaders built per-run inside objective) ---
        # We’ll do Optuna per-run (fresh resample + search), then final fit with best params, then get_trial_results.

        for run in range(num_runs):

            # --- resample for this run ---
            pi_0 = softmax(our_x_orig @ our_a_orig.T, axis=1)
            original_policy_prob = np.expand_dims(pi_0, -1)

            simulation_data = create_simulation_data_from_pi(
                dataset, pi_0, train_size + val_size,
                random_state=(run + 1) * (train_size + 17)
            )

            idx_train = np.arange(train_size)
            train_data = get_train_data(n_actions, train_size, simulation_data, idx_train, our_x_orig)
            val_idx   = np.arange(val_size) + train_size
            val_data  = get_train_data(n_actions, val_size, simulation_data, val_idx, our_x_orig)

            num_workers = 4 if torch.cuda.is_available() else 0

            cf_dataset = CustomCFDataset(
                train_data['x_idx'], train_data['a'], train_data['r'], original_policy_prob
            )

            # val_loader = DataLoader(
            #     val_dataset, batch_size=val_size, shuffle=False,
            #     pin_memory=torch.cuda.is_available(),
            #     num_workers=num_workers, persistent_workers=bool(num_workers)
            # )


            # --- Optuna objective bound to this run's data ---
            def objective(trial):
                print()
                print(f"Trial {trial.number} started")
                lr = trial.suggest_float("lr", 1e-4, 1e-1, log=True)
                epochs = trial.suggest_int("num_epochs", 1, 10)
                trial_batch_size = trial.suggest_categorical("batch_size", [64, 128, 256, 512])
                trial_num_neighbors = trial.suggest_int("num_neighbors", 3, 15)
                lr_decay = trial.suggest_float("lr_decay", 0.8, 1.0)

                trial_neigh_model = NeighborhoodModel(
                    train_data['x_idx'], train_data['a'],
                    our_a_orig, our_x_orig, train_data['r'],
                    num_neighbors=trial_num_neighbors
                )

                trial_scores_all = torch.as_tensor(
                    trial_neigh_model.predict(all_user_indices),
                    device=device, dtype=torch.float32
                )

                trial_model = LinearCFModel(
                    n_users, n_actions, emb_dim,
                    initial_user_embeddings=T(our_x_orig),
                    initial_actions_embeddings=T(our_a_orig)
                ).to(device)

                assert (not torch.cuda.is_available()) or next(trial_model.parameters()).is_cuda

                final_train_loader = DataLoader(
                    cf_dataset, batch_size=trial_batch_size, shuffle=True,
                    pin_memory=torch.cuda.is_available(),
                    num_workers=num_workers, persistent_workers=bool(num_workers)
                )

                current_lr = lr
                for epoch in range(epochs):
                    if epoch > 0:
                        current_lr *= lr_decay
                        
                    train(
                        trial_model, final_train_loader, trial_scores_all,
                        criterion=KLPolicyLoss(), num_epochs=1, lr=current_lr, device=str(device)
                    )

                trial_x, trial_a = trial_model.get_params()
                trial_x = trial_x.detach().cpu().numpy()
                trial_a = trial_a.detach().cpu().numpy()

                pi_i = softmax(trial_x @ trial_a.T, axis=1)
                train_actions = train_data['a']
                train_users = train_data['x_idx']

                print("Train wi info: {}".format(get_weights_info(pi_i[train_users, train_actions], original_policy_prob[train_users, train_actions])))
                print(f"actual reward: {calc_reward(dataset, np.expand_dims(pi_i, -1))}")

                # print(get_weights_info(pi_i, original_policy_prob))
                # validation reward for selection
                return cv_score_model(val_data, trial_scores_all, pi_i)


            # --- run Optuna for this run ---
            study = optuna.create_study(direction="maximize")
            
            if last_best_params is not None:
                study.enqueue_trial(last_best_params)

            study.optimize(objective, n_trials=n_trials, show_progress_bar=True)

            best_params = study.best_params
            last_best_params = best_params  # optional warm-start to next run
            best_hyperparams_by_size[train_size][run] = {
                "params": best_params,
                "reward": study.best_value
            }


            # --- final training with best params on this run’s data ---
            regression_model = RegressionModel(
                n_actions=n_actions, action_context=our_x_orig,
                base_model=LogisticRegression(random_state=12345)
            )
            regression_model.fit(
                train_data['x'], train_data['a'], train_data['r'],
                original_policy_prob[train_data['x_idx'], train_data['a']].squeeze()
            )

            neighberhoodmodel = NeighborhoodModel(
                train_data['x_idx'], train_data['a'],
                our_a_orig, our_x_orig, train_data['r'],
                num_neighbors=best_params['num_neighbors']
            )
            scores_all = torch.as_tensor(
                neighberhoodmodel.predict(all_user_indices),
                device=device, dtype=torch.float32
            )

            model = LinearCFModel(
                n_users, n_actions, emb_dim,
                initial_user_embeddings=T(our_x_orig),
                initial_actions_embeddings=T(our_a_orig)
            ).to(device)
            assert (not torch.cuda.is_available()) or next(model.parameters()).is_cuda

            train_loader = DataLoader(
                cf_dataset, batch_size=batch_size, shuffle=True,
                pin_memory=torch.cuda.is_available(),
                num_workers=num_workers, persistent_workers=bool(num_workers)
            )

            current_lr = best_params['lr']
            for epoch in range(best_params['num_epochs']):
                if epoch > 0:
                    current_lr *= best_params['lr_decay']
                train(
                    model, train_loader, scores_all,
                    criterion=KLPolicyLoss(), num_epochs=1, lr=current_lr, device=str(device)
                )

            # learned embeddings (do NOT overwrite originals)
            learned_x_t, learned_a_t = model.get_params()
            learned_x = learned_x_t.detach().cpu().numpy()
            learned_a = learned_a_t.detach().cpu().numpy()

            # --- produce the per-run result via get_trial_results ---
            trial_res = get_trial_results(
                learned_x, learned_a,          # learned (policy) embeddings
                emb_x, emb_a,                  # ground-truth embedding refs
                original_x, original_a,        # original clean refs
                dataset,
                val_data,                      # use this run's val split
                original_policy_prob,
                neighberhoodmodel,
                regression_model,
                dm
            )

            trial_dicts_this_size.append(trial_res)

            # memory hygiene
            torch.cuda.empty_cache()

        # === aggregate per-run results (mean) and store under this train_size ===
        results[train_size] = _mean_dict(trial_dicts_this_size)

    return pd.DataFrame.from_dict(results, orient='index'), best_hyperparams_by_size

## Learning

We will run several simulations on a generated dataset, the dataset is generated like this:
$$ \text{We have users U and actions A } u_i \sim N(0, I_{emb_dim}) \ a_i \sim N(0, I_{emb_dim})$$
$$ p_{ij} = 1 / (5 + e^{-(u_i.T a_j)}) $$
$$r_{ij} \sim Bin(p_{ij})$$

We have a policy $\pi$
and it's ground truth reward is calculated by
$$R_{gt} = \sum_{i}{\sum_{j}{\pi_{ij} * p_{ij}}} $$

Our parameters for the dataset will be
$$EmbDim = 5$$
$$NumActions= 150$$
$$NumUsers = 150$$
$$NeighborhoodSize = 6$$

to learn a new policy from $\pi$ we will sample from:
$$\pi_{start} = (1-\epsilon)*\pi + \epsilon * \pi_{random}$$

In [4]:
dataset_params = dict(
                    n_actions= 500,
                    n_users = 500,
                    emb_dim = 16,
                    # sigma = 0.1,
                    eps = 0.6, # this is the epsilon for the noise in the ground truth policy representation
                    ctr = 0.1
                    )

train_dataset = generate_dataset(dataset_params)

Random Item CTR: 0.07066414727263938
Optimal greedy CTR: 0.09999926940951757
Optimal Stochastic CTR: 0.09995326955796031
Our Initial CTR: 0.08610747363354625


In [5]:
num_runs = 1
batch_size = 200
num_neighbors = 6
n_trials_for_optuna = 20
# num_rounds_list = [500, 1000, 2000, 10000, 20000]
# num_rounds_list = [500, 1000, 2000]
num_rounds_list = [15000]


# Manually define your best parameters
best_params_to_use = {
    "lr": 0.096,  # Learning rate
    "num_epochs": 5,  # Number of training epochs
    "batch_size": 64,  # Batch size for training
    "num_neighbors": 8,  # Number of neighbors for neighborhood model
    "lr_decay": 0.85  # Learning rate decay factor
}

### 1

$$emb = 0.7 * gt + 0.3 * noise$$
$$lr = 0.005$$
$$n_{epochs} = 1$$
$$BatchSize=50$$

In [6]:
print("Value of num_rounds_list:", num_rounds_list)

# Run the optimization
df4, best_hyperparams_by_size = trainer_trial(num_runs, num_neighbors, num_rounds_list, train_dataset, batch_size, val_size=10000, n_trials=n_trials_for_optuna, prev_best_params=best_params_to_use)

# # Print best hyperparameters for each training size
# print("\n=== BEST HYPERPARAMETERS BY TRAINING SIZE ===")
# for train_size, params in best_hyperparams_by_size.items():
#     print(f"\nTraining Size: {train_size}")
#     # print(f"Best Reward: {params['reward']:.6f}")
#     print("Parameters:")
#     for param_name, value in params['params'].items():
#         print(f"  {param_name}: {value}")
# print("===========================\n")

# Show the performance metrics
df4[['policy_rewards', 'ipw', 'reg_dm', 'conv_dm', 'conv_dr', 'conv_sndr', 'action_diff_to_real', 'action_delta', 'context_diff_to_real', 'context_delta']]

Value of num_rounds_list: [15000]
Num samples is 10000
{'gini': np.float64(0.46142294550000995), 'ess': np.float64(4503.153508554867), 'max_wi': np.float64(28.177135418152538), 'min_wi': np.float64(0.016210388551656126)}


[I 2025-10-30 08:01:27,160] A new study created in memory with name: no-name-02fa59cb-a71b-43cc-bad2-9faebea491ec
  0%|          | 0/20 [00:00<?, ?it/s]


Trial 0 started


Best trial: 0. Best value: 0.0754704:   5%|▌         | 1/20 [00:47<14:54, 47.07s/it]

Train wi info: {'gini': np.float64(0.6341531904649482), 'ess': np.float64(410.6799530834273), 'max_wi': np.float64(340.03196344205617), 'min_wi': np.float64(6.750574145678055e-07)}
actual reward: [0.07073206]
{'gini': np.float64(0.6515145781425156), 'ess': np.float64(302.832980423148), 'max_wi': np.float64(320.9408392026547), 'min_wi': np.float64(9.542057321612939e-05)}
Estimated reward: 0.084256
Cross-validated error: 0.004393
Final score CI (reward +- 2*error): [0.075470, 0.093041]
Standard error: 0.000201
Final t_dist CI (reward +- t_0.975*se_hat): [0.083862, 0.084650]
[I 2025-10-30 08:02:14,232] Trial 0 finished with value: 0.07547039259738207 and parameters: {'lr': 0.096, 'num_epochs': 5, 'batch_size': 64, 'num_neighbors': 8, 'lr_decay': 0.85}. Best is trial 0 with value: 0.07547039259738207.

Trial 1 started


Best trial: 1. Best value: 0.0862457:  10%|█         | 2/20 [01:29<13:21, 44.53s/it]

Train wi info: {'gini': np.float64(0.002059364710955504), 'ess': np.float64(14999.791138953113), 'max_wi': np.float64(1.0206606245737464), 'min_wi': np.float64(0.9782399626297148)}
actual reward: [0.08610614]
{'gini': np.float64(0.0024117653036770884), 'ess': np.float64(9999.810481580604), 'max_wi': np.float64(1.0199487329099943), 'min_wi': np.float64(0.9805369251294258)}
Estimated reward: 0.088389
Cross-validated error: 0.001071
Final score CI (reward +- 2*error): [0.086246, 0.090532]
Standard error: 0.000199
Final t_dist CI (reward +- t_0.975*se_hat): [0.087998, 0.088779]
[I 2025-10-30 08:02:56,982] Trial 1 finished with value: 0.08624573885119707 and parameters: {'lr': 0.0011480265790031463, 'num_epochs': 8, 'batch_size': 512, 'num_neighbors': 3, 'lr_decay': 0.8593591602054103}. Best is trial 1 with value: 0.08624573885119707.

Trial 2 started


Best trial: 1. Best value: 0.0862457:  15%|█▌        | 3/20 [02:25<14:00, 49.46s/it]

Train wi info: {'gini': np.float64(0.031774263718026566), 'ess': np.float64(14927.726845855414), 'max_wi': np.float64(1.3417331145105278), 'min_wi': np.float64(0.1829804434582384)}
actual reward: [0.08592787]
{'gini': np.float64(0.03838869695427543), 'ess': np.float64(9932.999526804517), 'max_wi': np.float64(1.4952763187275355), 'min_wi': np.float64(0.22714417646664253)}
Estimated reward: 0.087953
Cross-validated error: 0.001064
Final score CI (reward +- 2*error): [0.085825, 0.090082]
Standard error: 0.000146
Final t_dist CI (reward +- t_0.975*se_hat): [0.087668, 0.088239]
[I 2025-10-30 08:03:52,318] Trial 2 finished with value: 0.08582465081186413 and parameters: {'lr': 0.005340916515672121, 'num_epochs': 9, 'batch_size': 128, 'num_neighbors': 9, 'lr_decay': 0.9352302028672477}. Best is trial 1 with value: 0.08624573885119707.

Trial 3 started


Best trial: 1. Best value: 0.0862457:  20%|██        | 4/20 [03:13<13:03, 49.00s/it]

Train wi info: {'gini': np.float64(0.2556544730002135), 'ess': np.float64(11932.396138935228), 'max_wi': np.float64(11.893847780090093), 'min_wi': np.float64(4.7526719599980845e-05)}
actual reward: [0.08312464]
{'gini': np.float64(0.2770819917779952), 'ess': np.float64(7765.795351669799), 'max_wi': np.float64(7.524935388611695), 'min_wi': np.float64(9.31176974881089e-05)}
Estimated reward: 0.086777
Cross-validated error: 0.001299
Final score CI (reward +- 2*error): [0.084179, 0.089375]
Standard error: 0.000142
Final t_dist CI (reward +- t_0.975*se_hat): [0.086498, 0.087055]
[I 2025-10-30 08:04:40,605] Trial 3 finished with value: 0.08417865361211646 and parameters: {'lr': 0.03125838216298702, 'num_epochs': 2, 'batch_size': 128, 'num_neighbors': 10, 'lr_decay': 0.9611791100563979}. Best is trial 1 with value: 0.08624573885119707.

Trial 4 started


Best trial: 1. Best value: 0.0862457:  25%|██▌       | 5/20 [04:07<12:40, 50.68s/it]

Train wi info: {'gini': np.float64(0.21850913301862143), 'ess': np.float64(12731.97836206695), 'max_wi': np.float64(7.5776009291244195), 'min_wi': np.float64(0.00011609454656004443)}
actual reward: [0.08341922]
{'gini': np.float64(0.2375704827848291), 'ess': np.float64(8308.071331704488), 'max_wi': np.float64(5.310623439434535), 'min_wi': np.float64(0.00013665770860776412)}
Estimated reward: 0.087543
Cross-validated error: 0.001436
Final score CI (reward +- 2*error): [0.084670, 0.090415]
Standard error: 0.000136
Final t_dist CI (reward +- t_0.975*se_hat): [0.087275, 0.087810]
[I 2025-10-30 08:05:34,273] Trial 4 finished with value: 0.0846700011968581 and parameters: {'lr': 0.04069679598953643, 'num_epochs': 2, 'batch_size': 256, 'num_neighbors': 14, 'lr_decay': 0.8066165584599351}. Best is trial 1 with value: 0.08624573885119707.

Trial 5 started


Best trial: 1. Best value: 0.0862457:  30%|███       | 6/20 [05:01<12:09, 52.09s/it]

Train wi info: {'gini': np.float64(0.014987276504018373), 'ess': np.float64(14988.64392425303), 'max_wi': np.float64(1.1801222575443846), 'min_wi': np.float64(0.8704000790541851)}
actual reward: [0.08609827]
{'gini': np.float64(0.015460604905247607), 'ess': np.float64(9992.071124973945), 'max_wi': np.float64(1.1453563269214109), 'min_wi': np.float64(0.8531011720576362)}
Estimated reward: 0.088230
Cross-validated error: 0.001001
Final score CI (reward +- 2*error): [0.086229, 0.090231]
Standard error: 0.000188
Final t_dist CI (reward +- t_0.975*se_hat): [0.087861, 0.088599]
[I 2025-10-30 08:06:29,079] Trial 5 finished with value: 0.08622888253986195 and parameters: {'lr': 0.002812470044318945, 'num_epochs': 3, 'batch_size': 256, 'num_neighbors': 4, 'lr_decay': 0.9104414216846394}. Best is trial 1 with value: 0.08624573885119707.

Trial 6 started


Best trial: 1. Best value: 0.0862457:  35%|███▌      | 7/20 [05:56<11:28, 52.98s/it]

Train wi info: {'gini': np.float64(0.0032316194334876865), 'ess': np.float64(14999.486393304738), 'max_wi': np.float64(1.0267466599566295), 'min_wi': np.float64(0.9717345182136424)}
actual reward: [0.08610558]
{'gini': np.float64(0.003334281876479319), 'ess': np.float64(9999.639462043935), 'max_wi': np.float64(1.0306210286959823), 'min_wi': np.float64(0.9735838323580764)}
Estimated reward: 0.087853
Cross-validated error: 0.001020
Final score CI (reward +- 2*error): [0.085813, 0.089894]
Standard error: 0.000147
Final t_dist CI (reward +- t_0.975*se_hat): [0.087566, 0.088141]
[I 2025-10-30 08:07:23,905] Trial 6 finished with value: 0.08581265787616799 and parameters: {'lr': 0.0010337757584776175, 'num_epochs': 4, 'batch_size': 512, 'num_neighbors': 9, 'lr_decay': 0.8595147635102907}. Best is trial 1 with value: 0.08624573885119707.

Trial 7 started


Best trial: 1. Best value: 0.0862457:  40%|████      | 8/20 [06:52<10:47, 53.96s/it]

Train wi info: {'gini': np.float64(0.22533186355441867), 'ess': np.float64(12164.81458263543), 'max_wi': np.float64(12.990606197278128), 'min_wi': np.float64(4.5210332194907353e-07)}
actual reward: [0.07922065]
{'gini': np.float64(0.3076533644908954), 'ess': np.float64(7002.968425382424), 'max_wi': np.float64(16.605624862621248), 'min_wi': np.float64(7.147370711294927e-08)}
Estimated reward: 0.086741
Cross-validated error: 0.001743
Final score CI (reward +- 2*error): [0.083256, 0.090226]
Standard error: 0.000149
Final t_dist CI (reward +- t_0.975*se_hat): [0.086449, 0.087033]
[I 2025-10-30 08:08:19,972] Trial 7 finished with value: 0.08325551053742195 and parameters: {'lr': 0.06151439043575402, 'num_epochs': 9, 'batch_size': 256, 'num_neighbors': 7, 'lr_decay': 0.928477730006921}. Best is trial 1 with value: 0.08624573885119707.

Trial 8 started


Best trial: 8. Best value: 0.0863376:  45%|████▌     | 9/20 [07:46<09:53, 53.99s/it]

Train wi info: {'gini': np.float64(0.10775883864600658), 'ess': np.float64(14407.319100599361), 'max_wi': np.float64(3.5561040607806023), 'min_wi': np.float64(0.044289785494173985)}
actual reward: [0.08549329]
{'gini': np.float64(0.1125661323431576), 'ess': np.float64(9575.00329508383), 'max_wi': np.float64(2.7520972394591796), 'min_wi': np.float64(0.05267716745495645)}
Estimated reward: 0.088513
Cross-validated error: 0.001088
Final score CI (reward +- 2*error): [0.086338, 0.090688]
Standard error: 0.000150
Final t_dist CI (reward +- t_0.975*se_hat): [0.088219, 0.088807]
[I 2025-10-30 08:09:14,021] Trial 8 finished with value: 0.0863376159125565 and parameters: {'lr': 0.02256352748719486, 'num_epochs': 3, 'batch_size': 512, 'num_neighbors': 8, 'lr_decay': 0.9341403484566515}. Best is trial 8 with value: 0.0863376159125565.

Trial 9 started


Best trial: 8. Best value: 0.0863376:  50%|█████     | 10/20 [08:42<09:04, 54.43s/it]

Train wi info: {'gini': np.float64(0.0012992296477699987), 'ess': np.float64(14999.914787102978), 'max_wi': np.float64(1.0126009507722866), 'min_wi': np.float64(0.9826911799631395)}
actual reward: [0.08610724]
{'gini': np.float64(0.0015119731451772966), 'ess': np.float64(9999.92375972775), 'max_wi': np.float64(1.012996539703997), 'min_wi': np.float64(0.9806336945274147)}
Estimated reward: 0.087979
Cross-validated error: 0.001068
Final score CI (reward +- 2*error): [0.085843, 0.090116]
Standard error: 0.000141
Final t_dist CI (reward +- t_0.975*se_hat): [0.087704, 0.088255]
[I 2025-10-30 08:10:09,435] Trial 9 finished with value: 0.08584312120324863 and parameters: {'lr': 0.0003532197489584862, 'num_epochs': 8, 'batch_size': 128, 'num_neighbors': 12, 'lr_decay': 0.8845063774829096}. Best is trial 8 with value: 0.0863376159125565.

Trial 10 started


Best trial: 8. Best value: 0.0863376:  55%|█████▌    | 11/20 [09:36<08:09, 54.37s/it]

Train wi info: {'gini': np.float64(0.06465984339173243), 'ess': np.float64(14769.665344446497), 'max_wi': np.float64(1.8100460532174059), 'min_wi': np.float64(0.13779713369876134)}
actual reward: [0.08582512]
{'gini': np.float64(0.06922683909969043), 'ess': np.float64(9824.088268382417), 'max_wi': np.float64(1.822470245960079), 'min_wi': np.float64(0.15131545250898948)}
Estimated reward: 0.088056
Cross-validated error: 0.000971
Final score CI (reward +- 2*error): [0.086114, 0.089998]
Standard error: 0.000168
Final t_dist CI (reward +- t_0.975*se_hat): [0.087726, 0.088385]
[I 2025-10-30 08:11:03,664] Trial 10 finished with value: 0.08611402671727582 and parameters: {'lr': 0.011710805024554153, 'num_epochs': 6, 'batch_size': 512, 'num_neighbors': 6, 'lr_decay': 0.995255263450908}. Best is trial 8 with value: 0.0863376159125565.

Trial 11 started


Best trial: 11. Best value: 0.0864492:  60%|██████    | 12/20 [10:30<07:14, 54.30s/it]

Train wi info: {'gini': np.float64(0.000160810419929669), 'ess': np.float64(14999.998724228715), 'max_wi': np.float64(1.0014374389040086), 'min_wi': np.float64(0.9985117175465806)}
actual reward: [0.08610791]
{'gini': np.float64(0.0001817157174063983), 'ess': np.float64(9999.998923982184), 'max_wi': np.float64(1.0019389050103877), 'min_wi': np.float64(0.9986370996994036)}
Estimated reward: 0.088407
Cross-validated error: 0.000979
Final score CI (reward +- 2*error): [0.086449, 0.090365]
Standard error: 0.000200
Final t_dist CI (reward +- t_0.975*se_hat): [0.088016, 0.088799]
[I 2025-10-30 08:11:57,801] Trial 11 finished with value: 0.08644917027680868 and parameters: {'lr': 0.00015250461638780186, 'num_epochs': 7, 'batch_size': 512, 'num_neighbors': 3, 'lr_decay': 0.8179944636513288}. Best is trial 11 with value: 0.08644917027680868.

Trial 12 started


Best trial: 11. Best value: 0.0864492:  65%|██████▌   | 13/20 [11:24<06:19, 54.24s/it]

Train wi info: {'gini': np.float64(0.00021621737724720265), 'ess': np.float64(14999.997678413029), 'max_wi': np.float64(1.002643257638776), 'min_wi': np.float64(0.9976318488533853)}
actual reward: [0.08610788]
{'gini': np.float64(0.0002356057757912139), 'ess': np.float64(9999.998184673119), 'max_wi': np.float64(1.0022114749888034), 'min_wi': np.float64(0.9973773373045663)}
Estimated reward: 0.088372
Cross-validated error: 0.001141
Final score CI (reward +- 2*error): [0.086089, 0.090654]
Standard error: 0.000178
Final t_dist CI (reward +- t_0.975*se_hat): [0.088022, 0.088721]
[I 2025-10-30 08:12:51,909] Trial 12 finished with value: 0.08608928872854822 and parameters: {'lr': 0.00016393763838988836, 'num_epochs': 6, 'batch_size': 512, 'num_neighbors': 5, 'lr_decay': 0.807985156204073}. Best is trial 11 with value: 0.08644917027680868.

Trial 13 started


Best trial: 11. Best value: 0.0864492:  70%|███████   | 14/20 [12:15<05:19, 53.33s/it]

Train wi info: {'gini': np.float64(0.07926424206732917), 'ess': np.float64(14684.28716374684), 'max_wi': np.float64(2.3735896596261354), 'min_wi': np.float64(0.4719479798946445)}
actual reward: [0.08600696]
{'gini': np.float64(0.07615461921454594), 'ess': np.float64(9811.948063850785), 'max_wi': np.float64(1.9852421330421985), 'min_wi': np.float64(0.5713664467081361)}
Estimated reward: 0.088153
Cross-validated error: 0.001106
Final score CI (reward +- 2*error): [0.085942, 0.090365]
Standard error: 0.000143
Final t_dist CI (reward +- t_0.975*se_hat): [0.087873, 0.088434]
[I 2025-10-30 08:13:43,143] Trial 13 finished with value: 0.08594196391283965 and parameters: {'lr': 0.016497430476856468, 'num_epochs': 1, 'batch_size': 512, 'num_neighbors': 11, 'lr_decay': 0.9682443609863682}. Best is trial 11 with value: 0.08644917027680868.

Trial 14 started


Best trial: 11. Best value: 0.0864492:  75%|███████▌  | 15/20 [13:13<04:33, 54.68s/it]

Train wi info: {'gini': np.float64(0.0007631021778114533), 'ess': np.float64(14999.971299256438), 'max_wi': np.float64(1.0075472393271923), 'min_wi': np.float64(0.9923623683245487)}
actual reward: [0.08610739]
{'gini': np.float64(0.0007917922984902819), 'ess': np.float64(9999.979514393797), 'max_wi': np.float64(1.0078747448077225), 'min_wi': np.float64(0.9930732451398221)}
Estimated reward: 0.088221
Cross-validated error: 0.001128
Final score CI (reward +- 2*error): [0.085965, 0.090477]
Standard error: 0.000138
Final t_dist CI (reward +- t_0.975*se_hat): [0.087951, 0.088491]
[I 2025-10-30 08:14:40,932] Trial 14 finished with value: 0.08596511531126168 and parameters: {'lr': 0.0001032035882671387, 'num_epochs': 5, 'batch_size': 64, 'num_neighbors': 15, 'lr_decay': 0.8950016472229572}. Best is trial 11 with value: 0.08644917027680868.

Trial 15 started


Best trial: 11. Best value: 0.0864492:  80%|████████  | 16/20 [14:09<03:39, 54.86s/it]

Train wi info: {'gini': np.float64(0.0007901238391627527), 'ess': np.float64(14999.969230219765), 'max_wi': np.float64(1.0075153689917231), 'min_wi': np.float64(0.9920640015099667)}
actual reward: [0.08610779]
{'gini': np.float64(0.0009165194574688972), 'ess': np.float64(9999.97260954496), 'max_wi': np.float64(1.0091858826398223), 'min_wi': np.float64(0.9929825469775695)}
Estimated reward: 0.088303
Cross-validated error: 0.001118
Final score CI (reward +- 2*error): [0.086066, 0.090540]
Standard error: 0.000198
Final t_dist CI (reward +- t_0.975*se_hat): [0.087915, 0.088692]
[I 2025-10-30 08:15:36,224] Trial 15 finished with value: 0.08606641531248126 and parameters: {'lr': 0.0005048486357088367, 'num_epochs': 7, 'batch_size': 512, 'num_neighbors': 3, 'lr_decay': 0.8348005407533583}. Best is trial 11 with value: 0.08644917027680868.

Trial 16 started


Best trial: 11. Best value: 0.0864492:  85%|████████▌ | 17/20 [15:02<02:43, 54.40s/it]

Train wi info: {'gini': np.float64(0.022619550703169063), 'ess': np.float64(14970.311624765964), 'max_wi': np.float64(1.2415623683595842), 'min_wi': np.float64(0.5501573862298663)}
actual reward: [0.08604149]
{'gini': np.float64(0.026146333491285183), 'ess': np.float64(9973.829186658366), 'max_wi': np.float64(1.2415623683595842), 'min_wi': np.float64(0.4976824868315797)}
Estimated reward: 0.088057
Cross-validated error: 0.001082
Final score CI (reward +- 2*error): [0.085893, 0.090222]
Standard error: 0.000169
Final t_dist CI (reward +- t_0.975*se_hat): [0.087726, 0.088389]
[I 2025-10-30 08:16:29,537] Trial 16 finished with value: 0.08589266566803679 and parameters: {'lr': 0.0065421295248369446, 'num_epochs': 10, 'batch_size': 512, 'num_neighbors': 6, 'lr_decay': 0.9420827313912915}. Best is trial 11 with value: 0.08644917027680868.

Trial 17 started


Best trial: 11. Best value: 0.0864492:  90%|█████████ | 18/20 [15:45<01:41, 50.97s/it]

Train wi info: {'gini': np.float64(0.007431371283161628), 'ess': np.float64(14997.25111188042), 'max_wi': np.float64(1.0624958714129977), 'min_wi': np.float64(0.918187924469037)}
actual reward: [0.08610529]
{'gini': np.float64(0.007844438002645419), 'ess': np.float64(9997.989589382205), 'max_wi': np.float64(1.0568617153299935), 'min_wi': np.float64(0.8995443054170909)}
Estimated reward: 0.088136
Cross-validated error: 0.001039
Final score CI (reward +- 2*error): [0.086057, 0.090214]
Standard error: 0.000140
Final t_dist CI (reward +- t_0.975*se_hat): [0.087861, 0.088411]
[I 2025-10-30 08:17:12,523] Trial 17 finished with value: 0.08605749521413754 and parameters: {'lr': 0.0021367322229241836, 'num_epochs': 4, 'batch_size': 512, 'num_neighbors': 13, 'lr_decay': 0.8829759507134307}. Best is trial 11 with value: 0.08644917027680868.

Trial 18 started


Best trial: 11. Best value: 0.0864492:  95%|█████████▌| 19/20 [16:36<00:51, 51.07s/it]

Train wi info: {'gini': np.float64(0.15061922810595654), 'ess': np.float64(13795.065217939986), 'max_wi': np.float64(5.4136054805876945), 'min_wi': np.float64(0.00026304226079662215)}
actual reward: [0.08439124]
{'gini': np.float64(0.16954061078466437), 'ess': np.float64(9056.326186115817), 'max_wi': np.float64(4.261430472950683), 'min_wi': np.float64(0.00015663834949572545)}
Estimated reward: 0.087675
Cross-validated error: 0.001102
Final score CI (reward +- 2*error): [0.085471, 0.089880]
Standard error: 0.000175
Final t_dist CI (reward +- t_0.975*se_hat): [0.087333, 0.088018]
[I 2025-10-30 08:18:03,842] Trial 18 finished with value: 0.08547107348985081 and parameters: {'lr': 0.015215239083721398, 'num_epochs': 4, 'batch_size': 64, 'num_neighbors': 5, 'lr_decay': 0.9076484633916917}. Best is trial 11 with value: 0.08644917027680868.

Trial 19 started


Best trial: 11. Best value: 0.0864492: 100%|██████████| 20/20 [17:26<00:00, 52.32s/it]

Train wi info: {'gini': np.float64(0.0004764007305271415), 'ess': np.float64(14999.988804834464), 'max_wi': np.float64(1.004518773859392), 'min_wi': np.float64(0.9947357519976215)}
actual reward: [0.08610759]
{'gini': np.float64(0.0005610134689238554), 'ess': np.float64(9999.98978424999), 'max_wi': np.float64(1.0049159502487324), 'min_wi': np.float64(0.9950561950481522)}
Estimated reward: 0.088199
Cross-validated error: 0.001055
Final score CI (reward +- 2*error): [0.086088, 0.090309]
Standard error: 0.000160
Final t_dist CI (reward +- t_0.975*se_hat): [0.087884, 0.088513]
[I 2025-10-30 08:18:53,591] Trial 19 finished with value: 0.08608816316581142 and parameters: {'lr': 0.00033735036344289805, 'num_epochs': 7, 'batch_size': 512, 'num_neighbors': 7, 'lr_decay': 0.8347291188329855}. Best is trial 11 with value: 0.08644917027680868.





Num samples is 10000
{'gini': np.float64(0.4765721781663048), 'ess': np.float64(3904.0745315173544), 'max_wi': np.float64(39.45218623820159), 'min_wi': np.float64(0.011492300651965544)}


Unnamed: 0,policy_rewards,ipw,reg_dm,conv_dm,conv_dr,conv_sndr,action_diff_to_real,action_delta,context_diff_to_real,context_delta
0,0.08610747,0.085,0.08517303,0.08716017,0.08558992,0.08558992,0.7569287,0.0,0.87627132,0.0
15000,0.08610794,0.07890156,0.08612908,0.08852331,0.0805383,0.0805383,0.75692604,0.00022642,0.87627071,0.00028259


### Policy with delta function

In [7]:
dataset_params = dict(
                    n_actions= 500,
                    n_users = 500,
                    emb_dim = 16,
                    # sigma = 0.1,
                    eps = 0.6, # this is the epsilon for the noise in the ground truth policy representation
                    ctr = 0.1
                    )

train_dataset = generate_dataset(dataset_params, seed=10000)

Random Item CTR: 0.07083863592474163
Optimal greedy CTR: 0.09999916436977967
Optimal Stochastic CTR: 0.0999493542444427
Our Initial CTR: 0.08557719469284641


In [None]:
# Run the optimization
df5, best_hyperparams_by_size = trainer_trial(num_runs, num_neighbors, num_rounds_list, train_dataset, batch_size, val_size=10000, n_trials=n_trials_for_optuna, prev_best_params=best_params_to_use)

# Show the performance metrics
df5[['policy_rewards', 'ipw', 'reg_dm', 'conv_dm', 'conv_dr', 'conv_sndr', 'action_diff_to_real', 'action_delta', 'context_diff_to_real', 'context_delta']]

Num samples is 10000
{'gini': np.float64(0.4576418729487834), 'ess': np.float64(4642.846415450562), 'max_wi': np.float64(16.91568930542827), 'min_wi': np.float64(0.018040019867868514)}


[I 2025-10-30 08:20:11,638] A new study created in memory with name: no-name-ac14b6f1-7d26-47d5-88ef-58b2d747ab4d
  0%|          | 0/20 [00:00<?, ?it/s]


Trial 0 started


Best trial: 0. Best value: 0.081161:   5%|▌         | 1/20 [00:40<12:44, 40.22s/it]

Train wi info: {'gini': np.float64(0.5014083231851084), 'ess': np.float64(439.2106793968737), 'max_wi': np.float64(562.5335914014425), 'min_wi': np.float64(6.01128538720611e-07)}
actual reward: [0.07255944]
{'gini': np.float64(0.496548445906111), 'ess': np.float64(601.483607384798), 'max_wi': np.float64(361.28112852388745), 'min_wi': np.float64(5.658846244252586e-07)}
Estimated reward: 0.087559
Cross-validated error: 0.003199
Final score CI (reward +- 2*error): [0.081161, 0.093957]
Standard error: 0.000161
Final t_dist CI (reward +- t_0.975*se_hat): [0.087244, 0.087874]
[I 2025-10-30 08:20:51,850] Trial 0 finished with value: 0.0811610318997957 and parameters: {'lr': 0.096, 'num_epochs': 5, 'batch_size': 64, 'num_neighbors': 8, 'lr_decay': 0.85}. Best is trial 0 with value: 0.0811610318997957.

Trial 1 started


Best trial: 1. Best value: 0.0894114:  10%|█         | 2/20 [01:32<14:09, 47.21s/it]

Train wi info: {'gini': np.float64(0.16026689708174674), 'ess': np.float64(13619.435095852583), 'max_wi': np.float64(4.896752438668365), 'min_wi': np.float64(9.931637701438738e-07)}
actual reward: [0.08212599]
{'gini': np.float64(0.22103464443425766), 'ess': np.float64(8437.569465483413), 'max_wi': np.float64(6.09851864301846), 'min_wi': np.float64(1.856183390146365e-06)}
Estimated reward: 0.093256
Cross-validated error: 0.001922
Final score CI (reward +- 2*error): [0.089411, 0.097100]
Standard error: 0.000205
Final t_dist CI (reward +- t_0.975*se_hat): [0.092853, 0.093658]
[I 2025-10-30 08:21:43,964] Trial 1 finished with value: 0.08941141565547737 and parameters: {'lr': 0.03921990567359331, 'num_epochs': 7, 'batch_size': 128, 'num_neighbors': 3, 'lr_decay': 0.8583249515879748}. Best is trial 1 with value: 0.08941141565547737.

Trial 2 started


Best trial: 1. Best value: 0.0894114:  15%|█▌        | 3/20 [02:22<13:46, 48.63s/it]

Train wi info: {'gini': np.float64(0.011149502066286149), 'ess': np.float64(14992.90174562839), 'max_wi': np.float64(1.1069652496417077), 'min_wi': np.float64(0.784700820993859)}
actual reward: [0.08556486]
{'gini': np.float64(0.012366532379965678), 'ess': np.float64(9994.24263410054), 'max_wi': np.float64(1.1073487015665826), 'min_wi': np.float64(0.7433005176598324)}
Estimated reward: 0.092337
Cross-validated error: 0.001633
Final score CI (reward +- 2*error): [0.089071, 0.095604]
Standard error: 0.000159
Final t_dist CI (reward +- t_0.975*se_hat): [0.092027, 0.092648]
[I 2025-10-30 08:22:34,287] Trial 2 finished with value: 0.0890709019376981 and parameters: {'lr': 0.003504969689400147, 'num_epochs': 10, 'batch_size': 512, 'num_neighbors': 7, 'lr_decay': 0.9405553497916506}. Best is trial 1 with value: 0.08941141565547737.

Trial 3 started


Best trial: 1. Best value: 0.0894114:  20%|██        | 4/20 [03:11<12:57, 48.56s/it]

Train wi info: {'gini': np.float64(0.1565124592342146), 'ess': np.float64(13735.479766314947), 'max_wi': np.float64(4.332858546609094), 'min_wi': np.float64(0.004981422847132749)}
actual reward: [0.08468815]
{'gini': np.float64(0.15786637170316775), 'ess': np.float64(9160.894335762023), 'max_wi': np.float64(3.498926235901595), 'min_wi': np.float64(0.0064747580108126986)}
Estimated reward: 0.093113
Cross-validated error: 0.001885
Final score CI (reward +- 2*error): [0.089343, 0.096883]
Standard error: 0.000168
Final t_dist CI (reward +- t_0.975*se_hat): [0.092784, 0.093442]
[I 2025-10-30 08:23:22,745] Trial 3 finished with value: 0.08934297721608715 and parameters: {'lr': 0.031780991569711946, 'num_epochs': 2, 'batch_size': 512, 'num_neighbors': 6, 'lr_decay': 0.853721452037973}. Best is trial 1 with value: 0.08941141565547737.

Trial 4 started


Best trial: 1. Best value: 0.0894114:  25%|██▌       | 5/20 [04:01<12:16, 49.07s/it]

Train wi info: {'gini': np.float64(0.0026625339824805213), 'ess': np.float64(14999.631150010842), 'max_wi': np.float64(1.0269596123570137), 'min_wi': np.float64(0.9654121820549882)}
actual reward: [0.08557508]
{'gini': np.float64(0.002933028822405877), 'ess': np.float64(9999.706556960424), 'max_wi': np.float64(1.0269596123570137), 'min_wi': np.float64(0.9664409854373489)}
Estimated reward: 0.092381
Cross-validated error: 0.001693
Final score CI (reward +- 2*error): [0.088994, 0.095767]
Standard error: 0.000128
Final t_dist CI (reward +- t_0.975*se_hat): [0.092129, 0.092633]
[I 2025-10-30 08:24:12,716] Trial 4 finished with value: 0.0889942977419071 and parameters: {'lr': 0.0010745804934764944, 'num_epochs': 9, 'batch_size': 512, 'num_neighbors': 14, 'lr_decay': 0.9152696742615991}. Best is trial 1 with value: 0.08941141565547737.

Trial 5 started


Best trial: 1. Best value: 0.0894114:  30%|███       | 6/20 [04:50<11:30, 49.31s/it]

Train wi info: {'gini': np.float64(0.010984902300831162), 'ess': np.float64(14993.435511163427), 'max_wi': np.float64(1.1411889311506576), 'min_wi': np.float64(0.7932138069607239)}
actual reward: [0.08556537]
{'gini': np.float64(0.011849019090177628), 'ess': np.float64(9995.047212422956), 'max_wi': np.float64(1.138048499663799), 'min_wi': np.float64(0.8248134912097844)}
Estimated reward: 0.092741
Cross-validated error: 0.001713
Final score CI (reward +- 2*error): [0.089315, 0.096167]
Standard error: 0.000168
Final t_dist CI (reward +- t_0.975*se_hat): [0.092411, 0.093071]
[I 2025-10-30 08:25:02,489] Trial 5 finished with value: 0.08931484371646099 and parameters: {'lr': 0.002983699074003499, 'num_epochs': 4, 'batch_size': 256, 'num_neighbors': 6, 'lr_decay': 0.8207128393704347}. Best is trial 1 with value: 0.08941141565547737.

Trial 6 started


Best trial: 1. Best value: 0.0894114:  35%|███▌      | 7/20 [05:41<10:44, 49.59s/it]

Train wi info: {'gini': np.float64(0.0017914912487923685), 'ess': np.float64(14999.8376550117), 'max_wi': np.float64(1.0193494995137893), 'min_wi': np.float64(0.9795933382955556)}
actual reward: [0.08557715]
{'gini': np.float64(0.001802633514586749), 'ess': np.float64(9999.890285480184), 'max_wi': np.float64(1.0175310653005545), 'min_wi': np.float64(0.9809921668838868)}
Estimated reward: 0.092849
Cross-validated error: 0.001784
Final score CI (reward +- 2*error): [0.089280, 0.096418]
Standard error: 0.000169
Final t_dist CI (reward +- t_0.975*se_hat): [0.092517, 0.093180]
[I 2025-10-30 08:25:52,656] Trial 6 finished with value: 0.08927973505893629 and parameters: {'lr': 0.0003922944362127035, 'num_epochs': 3, 'batch_size': 256, 'num_neighbors': 6, 'lr_decay': 0.9107905111999993}. Best is trial 1 with value: 0.08941141565547737.

Trial 7 started


Best trial: 7. Best value: 0.0896881:  40%|████      | 8/20 [06:26<09:38, 48.23s/it]

Train wi info: {'gini': np.float64(0.0020804928874030536), 'ess': np.float64(14999.777203809335), 'max_wi': np.float64(1.0297396354282518), 'min_wi': np.float64(0.9690552407039313)}
actual reward: [0.08557728]
{'gini': np.float64(0.002215400658893667), 'ess': np.float64(9999.831668575172), 'max_wi': np.float64(1.0239749450713747), 'min_wi': np.float64(0.9708976289941735)}
Estimated reward: 0.093011
Cross-validated error: 0.001661
Final score CI (reward +- 2*error): [0.089688, 0.096333]
Standard error: 0.000190
Final t_dist CI (reward +- t_0.975*se_hat): [0.092638, 0.093383]
[I 2025-10-30 08:26:37,967] Trial 7 finished with value: 0.0896880814311813 and parameters: {'lr': 0.00027849365387383127, 'num_epochs': 7, 'batch_size': 64, 'num_neighbors': 4, 'lr_decay': 0.9203832870721949}. Best is trial 7 with value: 0.0896880814311813.

Trial 8 started


Best trial: 8. Best value: 0.0896885:  45%|████▌     | 9/20 [07:20<09:10, 50.07s/it]

Train wi info: {'gini': np.float64(0.07509991703550527), 'ess': np.float64(14588.957001240828), 'max_wi': np.float64(2.256431839183115), 'min_wi': np.float64(0.00048574441709895)}
actual reward: [0.0844664]
{'gini': np.float64(0.10858577234821656), 'ess': np.float64(9530.636483633873), 'max_wi': np.float64(2.5331128661752826), 'min_wi': np.float64(0.0008472676349851707)}
Estimated reward: 0.092867
Cross-validated error: 0.001589
Final score CI (reward +- 2*error): [0.089689, 0.096046]
Standard error: 0.000137
Final t_dist CI (reward +- t_0.975*se_hat): [0.092599, 0.093135]
[I 2025-10-30 08:27:32,090] Trial 8 finished with value: 0.08968850303451999 and parameters: {'lr': 0.03745244158451974, 'num_epochs': 10, 'batch_size': 512, 'num_neighbors': 9, 'lr_decay': 0.84954110153006}. Best is trial 8 with value: 0.08968850303451999.

Trial 9 started


Best trial: 8. Best value: 0.0896885:  50%|█████     | 10/20 [08:16<08:38, 51.90s/it]

Train wi info: {'gini': np.float64(0.01627383027870906), 'ess': np.float64(14979.638725018347), 'max_wi': np.float64(1.2572220249785926), 'min_wi': np.float64(0.48815611030258627)}
actual reward: [0.08551692]
{'gini': np.float64(0.019882662840971198), 'ess': np.float64(9980.922731676816), 'max_wi': np.float64(1.2417294652571016), 'min_wi': np.float64(0.47264790738641066)}
Estimated reward: 0.092750
Cross-validated error: 0.001784
Final score CI (reward +- 2*error): [0.089182, 0.096318]
Standard error: 0.000130
Final t_dist CI (reward +- t_0.975*se_hat): [0.092495, 0.093004]
[I 2025-10-30 08:28:28,082] Trial 9 finished with value: 0.08918157166370705 and parameters: {'lr': 0.003928896443636316, 'num_epochs': 8, 'batch_size': 128, 'num_neighbors': 13, 'lr_decay': 0.8705285509220615}. Best is trial 8 with value: 0.08968850303451999.

Trial 10 started


Best trial: 10. Best value: 0.0897731:  55%|█████▌    | 11/20 [09:00<07:24, 49.44s/it]

Train wi info: {'gini': np.float64(0.06567335666917454), 'ess': np.float64(14778.648765847965), 'max_wi': np.float64(1.9913680373320253), 'min_wi': np.float64(0.4857306510556267)}
actual reward: [0.08547899]
{'gini': np.float64(0.06330120296100072), 'ess': np.float64(9864.557671456147), 'max_wi': np.float64(1.8256392038202134), 'min_wi': np.float64(0.41750347873482113)}
Estimated reward: 0.093134
Cross-validated error: 0.001681
Final score CI (reward +- 2*error): [0.089773, 0.096495]
Standard error: 0.000142
Final t_dist CI (reward +- t_0.975*se_hat): [0.092856, 0.093412]
[I 2025-10-30 08:29:11,954] Trial 10 finished with value: 0.0897731173349653 and parameters: {'lr': 0.013698428436639026, 'num_epochs': 1, 'batch_size': 512, 'num_neighbors': 11, 'lr_decay': 0.9922992872134071}. Best is trial 10 with value: 0.0897731173349653.

Trial 11 started


Best trial: 10. Best value: 0.0897731:  60%|██████    | 12/20 [09:50<06:36, 49.61s/it]

Train wi info: {'gini': np.float64(0.06920052981062275), 'ess': np.float64(14752.942104945732), 'max_wi': np.float64(1.9754780971041197), 'min_wi': np.float64(0.24400514464028789)}
actual reward: [0.08539237]
{'gini': np.float64(0.06888301536393962), 'ess': np.float64(9838.970648528919), 'max_wi': np.float64(1.7803708747845193), 'min_wi': np.float64(0.32516525260814394)}
Estimated reward: 0.092955
Cross-validated error: 0.001762
Final score CI (reward +- 2*error): [0.089430, 0.096479]
Standard error: 0.000139
Final t_dist CI (reward +- t_0.975*se_hat): [0.092682, 0.093227]
[I 2025-10-30 08:30:01,955] Trial 11 finished with value: 0.08943015725688841 and parameters: {'lr': 0.013371697458452786, 'num_epochs': 2, 'batch_size': 512, 'num_neighbors': 11, 'lr_decay': 0.9898806081109537}. Best is trial 10 with value: 0.0897731173349653.

Trial 12 started


Best trial: 10. Best value: 0.0897731:  65%|██████▌   | 13/20 [10:40<05:49, 49.89s/it]

Train wi info: {'gini': np.float64(0.0551934686533476), 'ess': np.float64(14842.985121448144), 'max_wi': np.float64(1.7027764211983578), 'min_wi': np.float64(0.3163940537848398)}
actual reward: [0.08552387]
{'gini': np.float64(0.05318580687657017), 'ess': np.float64(9903.30473617357), 'max_wi': np.float64(1.586581889446515), 'min_wi': np.float64(0.3428279057901277)}
Estimated reward: 0.093350
Cross-validated error: 0.001838
Final score CI (reward +- 2*error): [0.089673, 0.097027]
Standard error: 0.000143
Final t_dist CI (reward +- t_0.975*se_hat): [0.093071, 0.093630]
[I 2025-10-30 08:30:52,482] Trial 12 finished with value: 0.08967346974590484 and parameters: {'lr': 0.011383909951347916, 'num_epochs': 1, 'batch_size': 512, 'num_neighbors': 10, 'lr_decay': 0.997639357868493}. Best is trial 10 with value: 0.0897731173349653.

Trial 13 started


Best trial: 10. Best value: 0.0897731:  70%|███████   | 14/20 [11:30<04:59, 49.87s/it]

Train wi info: {'gini': np.float64(0.06886321750341755), 'ess': np.float64(14718.219847781942), 'max_wi': np.float64(1.8331836756468007), 'min_wi': np.float64(0.09236503310655385)}
actual reward: [0.08518728]
{'gini': np.float64(0.07366917324545702), 'ess': np.float64(9785.32411847701), 'max_wi': np.float64(1.7844568138385748), 'min_wi': np.float64(0.07674054841452102)}
Estimated reward: 0.092720
Cross-validated error: 0.001728
Final score CI (reward +- 2*error): [0.089265, 0.096175]
Standard error: 0.000134
Final t_dist CI (reward +- t_0.975*se_hat): [0.092459, 0.092982]
[I 2025-10-30 08:31:42,295] Trial 13 finished with value: 0.08926512157554124 and parameters: {'lr': 0.01385946345030374, 'num_epochs': 6, 'batch_size': 512, 'num_neighbors': 12, 'lr_decay': 0.9629408227963585}. Best is trial 10 with value: 0.0897731173349653.

Trial 14 started


In [None]:
dataset_params = dict(
                    n_actions= 500,
                    n_users = 500,
                    emb_dim = 16,
                    # sigma = 0.1,
                    eps = 0.6, # this is the epsilon for the noise in the ground truth policy representation
                    ctr = 0.1
                    )

train_dataset = generate_dataset(dataset_params, seed=20000)

Random Item CTR: 0.07042251854546815
Optimal greedy CTR: 0.09999934264692525
Optimal Stochastic CTR: 0.09996075464321043
Our Initial CTR: 0.08647580588501355


In [None]:
# Run the optimization
df6, best_hyperparams_by_size = trainer_trial(num_runs, num_neighbors, num_rounds_list, train_dataset, batch_size, val_size=10000, n_trials=n_trials_for_optuna, prev_best_params=best_params_to_use)

# Show the performance metrics
df6[['policy_rewards', 'ipw', 'reg_dm', 'conv_dm', 'conv_dr', 'conv_sndr', 'action_diff_to_real', 'action_delta', 'context_diff_to_real', 'context_delta']]

Num samples is 10000
{'gini': np.float64(0.48022945838555753), 'ess': np.float64(3954.734881758623), 'max_wi': np.float64(38.92603878947978), 'min_wi': np.float64(0.006564213856433456)}


[I 2025-10-29 22:56:19,596] A new study created in memory with name: no-name-8ec402b9-876c-4bb8-a66d-4bd6797153dc
  0%|          | 0/20 [00:00<?, ?it/s]


Trial 0 started


Best trial: 0. Best value: 0.0789441:   5%|▌         | 1/20 [00:44<14:13, 44.93s/it]

Train wi info: {'gini': np.float64(0.5647763877991416), 'ess': np.float64(519.1888176636096), 'max_wi': np.float64(457.9536865355126), 'min_wi': np.float64(2.0446657031422895e-07)}
actual reward: [0.07126326]
{'gini': np.float64(0.5394520397752793), 'ess': np.float64(2120.742059473937), 'max_wi': np.float64(65.98318679436115), 'min_wi': np.float64(1.1044849972608172e-05)}
Estimated reward: 0.085064
Cross-validated error: 0.003060
Final score CI (reward +- 2*error): [0.078944, 0.091184]
Standard error: 0.000440
Final t_dist CI (reward +- t_0.975*se_hat): [0.084201, 0.085927]
[I 2025-10-29 22:57:04,519] Trial 0 finished with value: 0.07894412984147801 and parameters: {'lr': 0.096, 'num_epochs': 5, 'batch_size': 64, 'num_neighbors': 8, 'lr_decay': 0.85}. Best is trial 0 with value: 0.07894412984147801.

Trial 1 started


Best trial: 1. Best value: 0.0865205:  10%|█         | 2/20 [01:40<15:19, 51.06s/it]

Train wi info: {'gini': np.float64(0.0007863779554349045), 'ess': np.float64(14999.969084822691), 'max_wi': np.float64(1.0076030872381512), 'min_wi': np.float64(0.9916543966422595)}
actual reward: [0.08647611]
{'gini': np.float64(0.0008179224317415367), 'ess': np.float64(9999.978348793049), 'max_wi': np.float64(1.0069929486184783), 'min_wi': np.float64(0.9932349711827443)}
Estimated reward: 0.088704
Cross-validated error: 0.001092
Final score CI (reward +- 2*error): [0.086521, 0.090888]
Standard error: 0.000401
Final t_dist CI (reward +- t_0.975*se_hat): [0.087919, 0.089489]
[I 2025-10-29 22:57:59,870] Trial 1 finished with value: 0.08652051319743387 and parameters: {'lr': 0.00036401160176591685, 'num_epochs': 7, 'batch_size': 256, 'num_neighbors': 10, 'lr_decay': 0.9022739448906153}. Best is trial 1 with value: 0.08652051319743387.

Trial 2 started


Best trial: 1. Best value: 0.0865205:  15%|█▌        | 3/20 [02:35<14:56, 52.74s/it]

Train wi info: {'gini': np.float64(0.007769607761714315), 'ess': np.float64(14997.01950828138), 'max_wi': np.float64(1.078126364446322), 'min_wi': np.float64(0.9232227286571972)}
actual reward: [0.08646768]
{'gini': np.float64(0.007908709747124385), 'ess': np.float64(9997.968277417025), 'max_wi': np.float64(1.084424326674272), 'min_wi': np.float64(0.9399592965000942)}
Estimated reward: 0.088651
Cross-validated error: 0.001068
Final score CI (reward +- 2*error): [0.086516, 0.090787]
Standard error: 0.000430
Final t_dist CI (reward +- t_0.975*se_hat): [0.087809, 0.089494]
[I 2025-10-29 22:58:54,601] Trial 2 finished with value: 0.0865155254758192 and parameters: {'lr': 0.0016839190367793331, 'num_epochs': 5, 'batch_size': 256, 'num_neighbors': 7, 'lr_decay': 0.9515104073050064}. Best is trial 1 with value: 0.08652051319743387.

Trial 3 started


Best trial: 3. Best value: 0.0872831:  20%|██        | 4/20 [03:25<13:48, 51.78s/it]

Train wi info: {'gini': np.float64(0.0004540259219268768), 'ess': np.float64(14999.989642209843), 'max_wi': np.float64(1.004616340893149), 'min_wi': np.float64(0.9931766514218998)}
actual reward: [0.08647631]
{'gini': np.float64(0.0004543061642774479), 'ess': np.float64(9999.99321265268), 'max_wi': np.float64(1.003843002348909), 'min_wi': np.float64(0.9931766514218998)}
Estimated reward: 0.089660
Cross-validated error: 0.001189
Final score CI (reward +- 2*error): [0.087283, 0.092038]
Standard error: 0.000489
Final t_dist CI (reward +- t_0.975*se_hat): [0.088702, 0.090619]
[I 2025-10-29 22:59:44,904] Trial 3 finished with value: 0.08728306779563991 and parameters: {'lr': 0.00011436107447714788, 'num_epochs': 10, 'batch_size': 128, 'num_neighbors': 3, 'lr_decay': 0.9733366677471595}. Best is trial 3 with value: 0.08728306779563991.

Trial 4 started


Best trial: 3. Best value: 0.0872831:  25%|██▌       | 5/20 [04:20<13:15, 53.02s/it]

Train wi info: {'gini': np.float64(0.0018016781732204006), 'ess': np.float64(14999.827462140498), 'max_wi': np.float64(1.0257119591331458), 'min_wi': np.float64(0.9750659121645034)}
actual reward: [0.08647317]
{'gini': np.float64(0.002941994410129095), 'ess': np.float64(9999.710030988213), 'max_wi': np.float64(1.0257436698604996), 'min_wi': np.float64(0.9686631735572174)}
Estimated reward: 0.089712
Cross-validated error: 0.001226
Final score CI (reward +- 2*error): [0.087259, 0.092165]
Standard error: 0.000481
Final t_dist CI (reward +- t_0.975*se_hat): [0.088770, 0.090654]
[I 2025-10-29 23:00:40,131] Trial 4 finished with value: 0.08725909849772104 and parameters: {'lr': 0.002612253956968504, 'num_epochs': 10, 'batch_size': 512, 'num_neighbors': 3, 'lr_decay': 0.8050295094878982}. Best is trial 3 with value: 0.08728306779563991.

Trial 5 started


Best trial: 3. Best value: 0.0872831:  30%|███       | 6/20 [05:15<12:31, 53.68s/it]

Train wi info: {'gini': np.float64(0.0062918865878293035), 'ess': np.float64(14998.043573002882), 'max_wi': np.float64(1.0727582482340903), 'min_wi': np.float64(0.94038015663639)}
actual reward: [0.08647155]
{'gini': np.float64(0.007482589046686397), 'ess': np.float64(9998.175265117448), 'max_wi': np.float64(1.061214159303979), 'min_wi': np.float64(0.9301650281123572)}
Estimated reward: 0.088324
Cross-validated error: 0.000998
Final score CI (reward +- 2*error): [0.086329, 0.090319]
Standard error: 0.000374
Final t_dist CI (reward +- t_0.975*se_hat): [0.087592, 0.089056]
[I 2025-10-29 23:01:35,081] Trial 5 finished with value: 0.08632879874611067 and parameters: {'lr': 0.002500500781748898, 'num_epochs': 7, 'batch_size': 256, 'num_neighbors': 13, 'lr_decay': 0.873772907550635}. Best is trial 3 with value: 0.08728306779563991.

Trial 6 started


Best trial: 3. Best value: 0.0872831:  35%|███▌      | 7/20 [06:11<11:48, 54.53s/it]

Train wi info: {'gini': np.float64(0.0010554189146184184), 'ess': np.float64(14999.94479356704), 'max_wi': np.float64(1.0117951026076166), 'min_wi': np.float64(0.991037114500996)}
actual reward: [0.08647703]
{'gini': np.float64(0.0010814360764481226), 'ess': np.float64(9999.962116889163), 'max_wi': np.float64(1.0117951026076166), 'min_wi': np.float64(0.99185083618144)}
Estimated reward: 0.088435
Cross-validated error: 0.001133
Final score CI (reward +- 2*error): [0.086168, 0.090702]
Standard error: 0.000390
Final t_dist CI (reward +- t_0.975*se_hat): [0.087671, 0.089199]
[I 2025-10-29 23:02:31,371] Trial 6 finished with value: 0.086168079705336 and parameters: {'lr': 0.00018739219715828908, 'num_epochs': 9, 'batch_size': 128, 'num_neighbors': 11, 'lr_decay': 0.9900528307200647}. Best is trial 3 with value: 0.08728306779563991.

Trial 7 started


Best trial: 3. Best value: 0.0872831:  40%|████      | 8/20 [07:06<10:55, 54.63s/it]

Train wi info: {'gini': np.float64(0.04229570980829008), 'ess': np.float64(14906.577975639142), 'max_wi': np.float64(1.5500401407117104), 'min_wi': np.float64(0.25357330750074414)}
actual reward: [0.08603739]
{'gini': np.float64(0.05357175456401491), 'ess': np.float64(9904.33107567228), 'max_wi': np.float64(1.6731259024644471), 'min_wi': np.float64(0.4641902779247699)}
Estimated reward: 0.089227
Cross-validated error: 0.001149
Final score CI (reward +- 2*error): [0.086930, 0.091524]
Standard error: 0.000457
Final t_dist CI (reward +- t_0.975*se_hat): [0.088330, 0.090123]
[I 2025-10-29 23:03:26,218] Trial 7 finished with value: 0.08692953062541561 and parameters: {'lr': 0.013699032343028099, 'num_epochs': 9, 'batch_size': 256, 'num_neighbors': 5, 'lr_decay': 0.9117270053157481}. Best is trial 3 with value: 0.08728306779563991.

Trial 8 started


Best trial: 3. Best value: 0.0872831:  45%|████▌     | 9/20 [07:51<09:28, 51.69s/it]

Train wi info: {'gini': np.float64(0.00010373231953446894), 'ess': np.float64(14999.999446601392), 'max_wi': np.float64(1.0011433570951747), 'min_wi': np.float64(0.9985255832757662)}
actual reward: [0.08647621]
{'gini': np.float64(0.00010021776393212682), 'ess': np.float64(9999.999665718266), 'max_wi': np.float64(1.0009536343481997), 'min_wi': np.float64(0.9986235316634916)}
Estimated reward: 0.088756
Cross-validated error: 0.001112
Final score CI (reward +- 2*error): [0.086532, 0.090981]
Standard error: 0.000448
Final t_dist CI (reward +- t_0.975*se_hat): [0.087877, 0.089635]
[I 2025-10-29 23:04:11,444] Trial 8 finished with value: 0.08653189670457093 and parameters: {'lr': 0.00012467777941127056, 'num_epochs': 4, 'batch_size': 512, 'num_neighbors': 6, 'lr_decay': 0.8641494942226629}. Best is trial 3 with value: 0.08728306779563991.

Trial 9 started


Best trial: 3. Best value: 0.0872831:  50%|█████     | 10/20 [08:40<08:26, 50.68s/it]

Train wi info: {'gini': np.float64(0.0030961385331734806), 'ess': np.float64(14999.52933528341), 'max_wi': np.float64(1.0268043448392175), 'min_wi': np.float64(0.9703645197350663)}
actual reward: [0.0864774]
{'gini': np.float64(0.0036300302005328644), 'ess': np.float64(9999.573953743), 'max_wi': np.float64(1.0280164635140825), 'min_wi': np.float64(0.970676485836176)}
Estimated reward: 0.088557
Cross-validated error: 0.001036
Final score CI (reward +- 2*error): [0.086485, 0.090628]
Standard error: 0.000404
Final t_dist CI (reward +- t_0.975*se_hat): [0.087764, 0.089349]
[I 2025-10-29 23:04:59,846] Trial 9 finished with value: 0.08648537144555783 and parameters: {'lr': 0.0009913598420945924, 'num_epochs': 6, 'batch_size': 128, 'num_neighbors': 9, 'lr_decay': 0.8392575408529862}. Best is trial 3 with value: 0.08728306779563991.

Trial 10 started


Best trial: 3. Best value: 0.0872831:  55%|█████▌    | 11/20 [09:34<07:45, 51.77s/it]

Train wi info: {'gini': np.float64(0.11433373851533027), 'ess': np.float64(14348.07938311283), 'max_wi': np.float64(2.86897985736309), 'min_wi': np.float64(0.22530863296522738)}
actual reward: [0.08574286]
{'gini': np.float64(0.11544825274554699), 'ess': np.float64(9566.01772162426), 'max_wi': np.float64(2.895109704668287), 'min_wi': np.float64(0.4007595217368208)}
Estimated reward: 0.087912
Cross-validated error: 0.001155
Final score CI (reward +- 2*error): [0.085603, 0.090221]
Standard error: 0.000362
Final t_dist CI (reward +- t_0.975*se_hat): [0.087201, 0.088622]
[I 2025-10-29 23:05:54,107] Trial 10 finished with value: 0.08560268160735161 and parameters: {'lr': 0.015090953047244062, 'num_epochs': 2, 'batch_size': 128, 'num_neighbors': 15, 'lr_decay': 0.998267953818141}. Best is trial 3 with value: 0.08728306779563991.

Trial 11 started


Best trial: 11. Best value: 0.0874826:  60%|██████    | 12/20 [10:22<06:46, 50.77s/it]

Train wi info: {'gini': np.float64(0.006721367917896153), 'ess': np.float64(14997.458485325997), 'max_wi': np.float64(1.093863850832182), 'min_wi': np.float64(0.8768319682709)}
actual reward: [0.08644456]
{'gini': np.float64(0.011815422425389332), 'ess': np.float64(9995.22942449332), 'max_wi': np.float64(1.1398571872286156), 'min_wi': np.float64(0.8797022604541432)}
Estimated reward: 0.089712
Cross-validated error: 0.001115
Final score CI (reward +- 2*error): [0.087483, 0.091942]
Standard error: 0.000484
Final t_dist CI (reward +- t_0.975*se_hat): [0.088763, 0.090661]
[I 2025-10-29 23:06:42,577] Trial 11 finished with value: 0.08748258253041329 and parameters: {'lr': 0.008323377926373086, 'num_epochs': 10, 'batch_size': 512, 'num_neighbors': 3, 'lr_decay': 0.8014525161289329}. Best is trial 11 with value: 0.08748258253041329.

Trial 12 started


Best trial: 11. Best value: 0.0874826:  65%|██████▌   | 13/20 [11:17<06:02, 51.77s/it]

Train wi info: {'gini': np.float64(0.06895687976920223), 'ess': np.float64(14758.690954993279), 'max_wi': np.float64(2.1439104235683923), 'min_wi': np.float64(0.10558917789482014)}
actual reward: [0.08565288]
{'gini': np.float64(0.08410734270377368), 'ess': np.float64(9767.325921496755), 'max_wi': np.float64(2.356873182562045), 'min_wi': np.float64(0.13045605906078353)}
Estimated reward: 0.089394
Cross-validated error: 0.001286
Final score CI (reward +- 2*error): [0.086822, 0.091967]
Standard error: 0.000485
Final t_dist CI (reward +- t_0.975*se_hat): [0.088443, 0.090346]
[I 2025-10-29 23:07:36,636] Trial 12 finished with value: 0.08682153627242409 and parameters: {'lr': 0.02165356912456543, 'num_epochs': 10, 'batch_size': 512, 'num_neighbors': 3, 'lr_decay': 0.945500510299629}. Best is trial 11 with value: 0.08748258253041329.

Trial 13 started


Best trial: 11. Best value: 0.0874826:  70%|███████   | 14/20 [12:06<05:06, 51.15s/it]

Train wi info: {'gini': np.float64(0.021662575448692243), 'ess': np.float64(14973.0779170745), 'max_wi': np.float64(1.3876995195839403), 'min_wi': np.float64(0.3738420133237061)}
actual reward: [0.08627459]
{'gini': np.float64(0.03385633900791028), 'ess': np.float64(9960.643968807402), 'max_wi': np.float64(1.3183463607020973), 'min_wi': np.float64(0.42867498197474124)}
Estimated reward: 0.089133
Cross-validated error: 0.001142
Final score CI (reward +- 2*error): [0.086849, 0.091417]
Standard error: 0.000455
Final t_dist CI (reward +- t_0.975*se_hat): [0.088242, 0.090025]
[I 2025-10-29 23:08:26,351] Trial 13 finished with value: 0.08684935670520832 and parameters: {'lr': 0.006765721073662712, 'num_epochs': 8, 'batch_size': 64, 'num_neighbors': 5, 'lr_decay': 0.8064894745561798}. Best is trial 11 with value: 0.08748258253041329.

Trial 14 started


Best trial: 11. Best value: 0.0874826:  75%|███████▌  | 15/20 [13:02<04:22, 52.49s/it]

Train wi info: {'gini': np.float64(0.001673336128756357), 'ess': np.float64(14999.862548653513), 'max_wi': np.float64(1.0155370767966652), 'min_wi': np.float64(0.9846093041774231)}
actual reward: [0.08647624]
{'gini': np.float64(0.001840979642898114), 'ess': np.float64(9999.890719484116), 'max_wi': np.float64(1.0135750140703568), 'min_wi': np.float64(0.9830288879094293)}
Estimated reward: 0.089678
Cross-validated error: 0.001209
Final score CI (reward +- 2*error): [0.087259, 0.092096]
Standard error: 0.000487
Final t_dist CI (reward +- t_0.975*se_hat): [0.088724, 0.090631]
[I 2025-10-29 23:09:21,966] Trial 14 finished with value: 0.0872593432886565 and parameters: {'lr': 0.00043954896868426914, 'num_epochs': 10, 'batch_size': 128, 'num_neighbors': 3, 'lr_decay': 0.9378126336161909}. Best is trial 11 with value: 0.08748258253041329.

Trial 15 started


Best trial: 11. Best value: 0.0874826:  80%|████████  | 16/20 [13:56<03:31, 52.91s/it]

Train wi info: {'gini': np.float64(0.38915687939228555), 'ess': np.float64(6363.1041191891), 'max_wi': np.float64(65.0895569703682), 'min_wi': np.float64(0.010810240175141103)}
actual reward: [0.08355248]
{'gini': np.float64(0.3687446239390407), 'ess': np.float64(5995.6825095825925), 'max_wi': np.float64(12.69039954798442), 'min_wi': np.float64(0.010367343982937656)}
Estimated reward: 0.088367
Cross-validated error: 0.001990
Final score CI (reward +- 2*error): [0.084386, 0.092348]
Standard error: 0.000460
Final t_dist CI (reward +- t_0.975*se_hat): [0.087466, 0.089268]
[I 2025-10-29 23:10:15,838] Trial 15 finished with value: 0.08438632147250898 and parameters: {'lr': 0.09188296792457884, 'num_epochs': 1, 'batch_size': 512, 'num_neighbors': 5, 'lr_decay': 0.967194022406912}. Best is trial 11 with value: 0.08748258253041329.

Trial 16 started


Best trial: 11. Best value: 0.0874826:  85%|████████▌ | 17/20 [14:48<02:37, 52.65s/it]

Train wi info: {'gini': np.float64(0.02633492026636687), 'ess': np.float64(14965.054721046721), 'max_wi': np.float64(1.2902793767374525), 'min_wi': np.float64(0.7100271163493478)}
actual reward: [0.08636435]
{'gini': np.float64(0.030731879034147692), 'ess': np.float64(9969.135738472514), 'max_wi': np.float64(1.350883670922386), 'min_wi': np.float64(0.766920909454705)}
Estimated reward: 0.089597
Cross-validated error: 0.001232
Final score CI (reward +- 2*error): [0.087132, 0.092061]
Standard error: 0.000468
Final t_dist CI (reward +- t_0.975*se_hat): [0.088680, 0.090513]
[I 2025-10-29 23:11:07,887] Trial 16 finished with value: 0.0871317340055178 and parameters: {'lr': 0.005381249315249361, 'num_epochs': 8, 'batch_size': 128, 'num_neighbors': 4, 'lr_decay': 0.9242210435355172}. Best is trial 11 with value: 0.08748258253041329.

Trial 17 started


Best trial: 11. Best value: 0.0874826:  90%|█████████ | 18/20 [15:39<01:44, 52.13s/it]

Train wi info: {'gini': np.float64(0.1574835527122286), 'ess': np.float64(13786.082232956087), 'max_wi': np.float64(4.018167883976401), 'min_wi': np.float64(0.028934880253897968)}
actual reward: [0.08481131]
{'gini': np.float64(0.1713751076236398), 'ess': np.float64(9049.712206134942), 'max_wi': np.float64(4.8499690559102095), 'min_wi': np.float64(0.13531197847082568)}
Estimated reward: 0.088197
Cross-validated error: 0.001172
Final score CI (reward +- 2*error): [0.085853, 0.090542]
Standard error: 0.000426
Final t_dist CI (reward +- t_0.975*se_hat): [0.087362, 0.089032]
[I 2025-10-29 23:11:58,801] Trial 17 finished with value: 0.08585261887427564 and parameters: {'lr': 0.04213181937798037, 'num_epochs': 3, 'batch_size': 512, 'num_neighbors': 7, 'lr_decay': 0.8830922977662737}. Best is trial 11 with value: 0.08748258253041329.

Trial 18 started


Best trial: 11. Best value: 0.0874826:  95%|█████████▌| 19/20 [16:32<00:52, 52.44s/it]

Train wi info: {'gini': np.float64(0.0020114217834353108), 'ess': np.float64(14999.796199714254), 'max_wi': np.float64(1.030928651045261), 'min_wi': np.float64(0.9770325120085369)}
actual reward: [0.08647747]
{'gini': np.float64(0.0027865879534815434), 'ess': np.float64(9999.741067860808), 'max_wi': np.float64(1.028284917299221), 'min_wi': np.float64(0.9728758626315079)}
Estimated reward: 0.088517
Cross-validated error: 0.001139
Final score CI (reward +- 2*error): [0.086239, 0.090796]
Standard error: 0.000380
Final t_dist CI (reward +- t_0.975*se_hat): [0.087773, 0.089262]
[I 2025-10-29 23:12:51,956] Trial 18 finished with value: 0.08623857696289441 and parameters: {'lr': 0.0006867280325907604, 'num_epochs': 8, 'batch_size': 64, 'num_neighbors': 12, 'lr_decay': 0.8288813969065628}. Best is trial 11 with value: 0.08748258253041329.

Trial 19 started


Best trial: 11. Best value: 0.0874826: 100%|██████████| 20/20 [17:23<00:00, 52.18s/it]

Train wi info: {'gini': np.float64(0.04342838404880459), 'ess': np.float64(14905.348983379758), 'max_wi': np.float64(1.5194426043234959), 'min_wi': np.float64(0.299840195886946)}
actual reward: [0.08619026]
{'gini': np.float64(0.04977063774797568), 'ess': np.float64(9918.953528445223), 'max_wi': np.float64(1.5295831446853847), 'min_wi': np.float64(0.3173229878401552)}
Estimated reward: 0.089324
Cross-validated error: 0.001258
Final score CI (reward +- 2*error): [0.086808, 0.091840]
Standard error: 0.000466
Final t_dist CI (reward +- t_0.975*se_hat): [0.088410, 0.090237]
[I 2025-10-29 23:13:43,233] Trial 19 finished with value: 0.08680810788897474 and parameters: {'lr': 0.006233937728785646, 'num_epochs': 9, 'batch_size': 128, 'num_neighbors': 4, 'lr_decay': 0.9801058508596758}. Best is trial 11 with value: 0.08748258253041329.





Num samples is 10000
{'gini': np.float64(0.4652533554243185), 'ess': np.float64(3926.853063685224), 'max_wi': np.float64(51.029251933606915), 'min_wi': np.float64(0.005437645079418442)}


Unnamed: 0,policy_rewards,ipw,reg_dm,conv_dm,conv_dr,conv_sndr,action_diff_to_real,action_delta,context_diff_to_real,context_delta
0,0.08647581,0.0877,0.08769244,0.0906743,0.08747803,0.08747803,0.88083979,0.0,0.74725465,0.0
15000,0.08637521,0.0792184,0.08685918,0.0896319,0.07886415,0.07886773,0.87758895,0.02348538,0.74505549,0.02166221


In [None]:
dataset_params = dict(
                    n_actions= 500,
                    n_users = 500,
                    emb_dim = 16,
                    # sigma = 0.1,
                    eps = 0.6, # this is the epsilon for the noise in the ground truth policy representation
                    ctr = 0.1
                    )

train_dataset = generate_dataset(dataset_params, seed=30000)

Random Item CTR: 0.07069350185865088
Optimal greedy CTR: 0.09999918303816259
Optimal Stochastic CTR: 0.0999509448932121
Our Initial CTR: 0.08653966603258505


In [None]:
# Run the optimization
df7, best_hyperparams_by_size = trainer_trial(num_runs, num_neighbors, num_rounds_list, train_dataset, batch_size, val_size=10000, n_trials=n_trials_for_optuna, prev_best_params=best_params_to_use)

# Show the performance metrics
df7[['policy_rewards', 'ipw', 'reg_dm', 'conv_dm', 'conv_dr', 'conv_sndr', 'action_diff_to_real', 'action_delta', 'context_diff_to_real', 'context_delta']]

Num samples is 10000
{'gini': np.float64(0.4791613368557878), 'ess': np.float64(4347.254707099757), 'max_wi': np.float64(20.842543447049458), 'min_wi': np.float64(0.0053891659002369445)}


[I 2025-10-29 23:15:02,697] A new study created in memory with name: no-name-d2526723-dbdd-40a7-bcd1-db2f404e8992
  0%|          | 0/20 [00:00<?, ?it/s]


Trial 0 started


Best trial: 0. Best value: 0.0784675:   5%|▌         | 1/20 [00:40<12:44, 40.24s/it]

Train wi info: {'gini': np.float64(0.5140795775215294), 'ess': np.float64(2439.845254714719), 'max_wi': np.float64(116.05822770095907), 'min_wi': np.float64(2.4167185259894426e-07)}
actual reward: [0.07062138]
{'gini': np.float64(0.5410011860369925), 'ess': np.float64(1018.7810412118494), 'max_wi': np.float64(185.3093249170202), 'min_wi': np.float64(4.2872526867514565e-08)}
Estimated reward: 0.084684
Cross-validated error: 0.003108
Final score CI (reward +- 2*error): [0.078468, 0.090901]
Standard error: 0.000439
Final t_dist CI (reward +- t_0.975*se_hat): [0.083824, 0.085544]
[I 2025-10-29 23:15:42,939] Trial 0 finished with value: 0.07846753922494211 and parameters: {'lr': 0.096, 'num_epochs': 5, 'batch_size': 64, 'num_neighbors': 8, 'lr_decay': 0.85}. Best is trial 0 with value: 0.07846753922494211.

Trial 1 started


Best trial: 1. Best value: 0.0855405:  10%|█         | 2/20 [01:30<13:46, 45.92s/it]

Train wi info: {'gini': np.float64(0.0907790595718039), 'ess': np.float64(14580.094117733075), 'max_wi': np.float64(2.943424073606562), 'min_wi': np.float64(0.15011262979464624)}
actual reward: [0.08524246]
{'gini': np.float64(0.11072358314836944), 'ess': np.float64(9600.532193045115), 'max_wi': np.float64(2.5285280896468563), 'min_wi': np.float64(0.2613427769515475)}
Estimated reward: 0.088367
Cross-validated error: 0.001413
Final score CI (reward +- 2*error): [0.085541, 0.091194]
Standard error: 0.000390
Final t_dist CI (reward +- t_0.975*se_hat): [0.087603, 0.089132]
[I 2025-10-29 23:16:32,835] Trial 1 finished with value: 0.08554051488009469 and parameters: {'lr': 0.02874590647019037, 'num_epochs': 7, 'batch_size': 512, 'num_neighbors': 13, 'lr_decay': 0.9234989446023876}. Best is trial 1 with value: 0.08554051488009469.

Trial 2 started


Best trial: 2. Best value: 0.0860936:  15%|█▌        | 3/20 [02:22<13:49, 48.80s/it]

Train wi info: {'gini': np.float64(0.022038970634619508), 'ess': np.float64(14975.618547859114), 'max_wi': np.float64(1.2331677161373942), 'min_wi': np.float64(0.7956636694514688)}
actual reward: [0.086421]
{'gini': np.float64(0.027254301470760006), 'ess': np.float64(9975.689830099707), 'max_wi': np.float64(1.2798747827531765), 'min_wi': np.float64(0.7904022445607046)}
Estimated reward: 0.088768
Cross-validated error: 0.001337
Final score CI (reward +- 2*error): [0.086094, 0.091442]
Standard error: 0.000399
Final t_dist CI (reward +- t_0.975*se_hat): [0.087987, 0.089549]
[I 2025-10-29 23:17:25,052] Trial 2 finished with value: 0.08609360846174141 and parameters: {'lr': 0.004293362811580342, 'num_epochs': 5, 'batch_size': 64, 'num_neighbors': 12, 'lr_decay': 0.8113738276380127}. Best is trial 2 with value: 0.08609360846174141.

Trial 3 started


Best trial: 2. Best value: 0.0860936:  20%|██        | 4/20 [03:13<13:13, 49.57s/it]

Train wi info: {'gini': np.float64(0.39787184314004437), 'ess': np.float64(3434.7053344268843), 'max_wi': np.float64(117.06811865187723), 'min_wi': np.float64(0.006451905646078734)}
actual reward: [0.07294148]
{'gini': np.float64(0.47225352356369166), 'ess': np.float64(536.9822903283172), 'max_wi': np.float64(388.5507562311072), 'min_wi': np.float64(0.0026953448074185653)}
Estimated reward: 0.085144
Cross-validated error: 0.002251
Final score CI (reward +- 2*error): [0.080642, 0.089646]
Standard error: 0.000406
Final t_dist CI (reward +- t_0.975*se_hat): [0.084348, 0.085940]
[I 2025-10-29 23:18:15,795] Trial 3 finished with value: 0.08064179761343983 and parameters: {'lr': 0.0789737411749076, 'num_epochs': 8, 'batch_size': 256, 'num_neighbors': 11, 'lr_decay': 0.9810899366767283}. Best is trial 2 with value: 0.08609360846174141.

Trial 4 started


Best trial: 2. Best value: 0.0860936:  25%|██▌       | 5/20 [04:04<12:33, 50.22s/it]

Train wi info: {'gini': np.float64(0.011605552393093512), 'ess': np.float64(14993.215997785077), 'max_wi': np.float64(1.1258387683102373), 'min_wi': np.float64(0.8459298181094042)}
actual reward: [0.08648154]
{'gini': np.float64(0.014774922928484832), 'ess': np.float64(9992.832445271231), 'max_wi': np.float64(1.1341273025638148), 'min_wi': np.float64(0.8293053160020104)}
Estimated reward: 0.088459
Cross-validated error: 0.001276
Final score CI (reward +- 2*error): [0.085907, 0.091010]
Standard error: 0.000454
Final t_dist CI (reward +- t_0.975*se_hat): [0.087569, 0.089349]
[I 2025-10-29 23:19:07,176] Trial 4 finished with value: 0.08590737962407177 and parameters: {'lr': 0.003326520184796943, 'num_epochs': 8, 'batch_size': 128, 'num_neighbors': 6, 'lr_decay': 0.8833260752036249}. Best is trial 2 with value: 0.08609360846174141.

Trial 5 started


Best trial: 2. Best value: 0.0860936:  30%|███       | 6/20 [04:54<11:44, 50.31s/it]

Train wi info: {'gini': np.float64(0.04098034691812008), 'ess': np.float64(14899.730163722194), 'max_wi': np.float64(1.9745327665859782), 'min_wi': np.float64(0.15820105856834038)}
actual reward: [0.08584786]
{'gini': np.float64(0.0659420618517532), 'ess': np.float64(9854.304471125244), 'max_wi': np.float64(1.6106803831123067), 'min_wi': np.float64(0.27230351156570154)}
Estimated reward: 0.087475
Cross-validated error: 0.001214
Final score CI (reward +- 2*error): [0.085047, 0.089903]
Standard error: 0.000459
Final t_dist CI (reward +- t_0.975*se_hat): [0.086576, 0.088374]
[I 2025-10-29 23:19:57,662] Trial 5 finished with value: 0.08504666385804663 and parameters: {'lr': 0.028850461240114804, 'num_epochs': 9, 'batch_size': 512, 'num_neighbors': 5, 'lr_decay': 0.8264838284989823}. Best is trial 2 with value: 0.08609360846174141.

Trial 6 started


Best trial: 6. Best value: 0.0863448:  35%|███▌      | 7/20 [05:45<10:55, 50.43s/it]

Train wi info: {'gini': np.float64(0.005056445164845716), 'ess': np.float64(14998.7199079649), 'max_wi': np.float64(1.0510591081512553), 'min_wi': np.float64(0.9520503646611341)}
actual reward: [0.08653613]
{'gini': np.float64(0.006100216971215781), 'ess': np.float64(9998.78307020318), 'max_wi': np.float64(1.0549242094234543), 'min_wi': np.float64(0.9499925413141809)}
Estimated reward: 0.088945
Cross-validated error: 0.001300
Final score CI (reward +- 2*error): [0.086345, 0.091545]
Standard error: 0.000401
Final t_dist CI (reward +- t_0.975*se_hat): [0.088158, 0.089732]
[I 2025-10-29 23:20:48,347] Trial 6 finished with value: 0.08634480497047685 and parameters: {'lr': 0.0014501524070954692, 'num_epochs': 7, 'batch_size': 128, 'num_neighbors': 12, 'lr_decay': 0.8702739221488771}. Best is trial 6 with value: 0.08634480497047685.

Trial 7 started


Best trial: 7. Best value: 0.0864278:  40%|████      | 8/20 [06:35<10:03, 50.30s/it]

Train wi info: {'gini': np.float64(0.012450426564565058), 'ess': np.float64(14992.376365202244), 'max_wi': np.float64(1.139265211990763), 'min_wi': np.float64(0.8937114318795985)}
actual reward: [0.08652428]
{'gini': np.float64(0.012530740705698038), 'ess': np.float64(9994.970592718517), 'max_wi': np.float64(1.09493247767792), 'min_wi': np.float64(0.9077054974650959)}
Estimated reward: 0.089185
Cross-validated error: 0.001379
Final score CI (reward +- 2*error): [0.086428, 0.091943]
Standard error: 0.000402
Final t_dist CI (reward +- t_0.975*se_hat): [0.088398, 0.089972]
[I 2025-10-29 23:21:38,377] Trial 7 finished with value: 0.08642776478311383 and parameters: {'lr': 0.0038872393674783128, 'num_epochs': 3, 'batch_size': 512, 'num_neighbors': 12, 'lr_decay': 0.849197626215931}. Best is trial 7 with value: 0.08642776478311383.

Trial 8 started


Best trial: 7. Best value: 0.0864278:  45%|████▌     | 9/20 [07:26<09:13, 50.36s/it]

Train wi info: {'gini': np.float64(0.42261197563622077), 'ess': np.float64(3944.4567776849385), 'max_wi': np.float64(169.12346695194623), 'min_wi': np.float64(0.003512234459100401)}
actual reward: [0.08067806]
{'gini': np.float64(0.42240843040261045), 'ess': np.float64(4943.384343679189), 'max_wi': np.float64(23.99568019350411), 'min_wi': np.float64(0.0076704107289778)}
Estimated reward: 0.086966
Cross-validated error: 0.002196
Final score CI (reward +- 2*error): [0.082574, 0.091358]
Standard error: 0.000379
Final t_dist CI (reward +- t_0.975*se_hat): [0.086223, 0.087709]
[I 2025-10-29 23:22:28,844] Trial 8 finished with value: 0.08257415028464178 and parameters: {'lr': 0.08142670092735241, 'num_epochs': 1, 'batch_size': 256, 'num_neighbors': 15, 'lr_decay': 0.9064818384809649}. Best is trial 7 with value: 0.08642776478311383.

Trial 9 started


Best trial: 9. Best value: 0.0864702:  50%|█████     | 10/20 [08:17<08:25, 50.51s/it]

Train wi info: {'gini': np.float64(0.0021909533415959497), 'ess': np.float64(14999.764468319527), 'max_wi': np.float64(1.0302461359163877), 'min_wi': np.float64(0.9824867035365284)}
actual reward: [0.08654036]
{'gini': np.float64(0.002019794372748891), 'ess': np.float64(9999.868484458548), 'max_wi': np.float64(1.0176273491467958), 'min_wi': np.float64(0.9838470870244521)}
Estimated reward: 0.088965
Cross-validated error: 0.001248
Final score CI (reward +- 2*error): [0.086470, 0.091461]
Standard error: 0.000411
Final t_dist CI (reward +- t_0.975*se_hat): [0.088159, 0.089771]
[I 2025-10-29 23:23:19,696] Trial 9 finished with value: 0.0864701818888323 and parameters: {'lr': 0.00019340447511719208, 'num_epochs': 1, 'batch_size': 64, 'num_neighbors': 11, 'lr_decay': 0.9954952171720122}. Best is trial 9 with value: 0.0864701818888323.

Trial 10 started


Best trial: 9. Best value: 0.0864702:  55%|█████▌    | 11/20 [09:03<07:23, 49.23s/it]

Train wi info: {'gini': np.float64(0.0016862415324320754), 'ess': np.float64(14999.860406910404), 'max_wi': np.float64(1.0164549061664991), 'min_wi': np.float64(0.9852936457828504)}
actual reward: [0.08653993]
{'gini': np.float64(0.0015597677410222719), 'ess': np.float64(9999.921832007183), 'max_wi': np.float64(1.0114482906040123), 'min_wi': np.float64(0.9879130143984551)}
Estimated reward: 0.088650
Cross-validated error: 0.001276
Final score CI (reward +- 2*error): [0.086099, 0.091202]
Standard error: 0.000431
Final t_dist CI (reward +- t_0.975*se_hat): [0.087805, 0.089496]
[I 2025-10-29 23:24:06,027] Trial 10 finished with value: 0.0860991259327228 and parameters: {'lr': 0.00014919533847134576, 'num_epochs': 1, 'batch_size': 64, 'num_neighbors': 9, 'lr_decay': 0.9995989846488095}. Best is trial 9 with value: 0.0864701818888323.

Trial 11 started


Best trial: 9. Best value: 0.0864702:  60%|██████    | 12/20 [09:53<06:37, 49.65s/it]

Train wi info: {'gini': np.float64(8.81643281580694e-05), 'ess': np.float64(14999.999568820642), 'max_wi': np.float64(1.0015610609552745), 'min_wi': np.float64(0.9985007407205879)}
actual reward: [0.08654018]
{'gini': np.float64(8.199170096592351e-05), 'ess': np.float64(9999.999765935072), 'max_wi': np.float64(1.0007743883620674), 'min_wi': np.float64(0.9985007407205879)}
Estimated reward: 0.088865
Cross-validated error: 0.001363
Final score CI (reward +- 2*error): [0.086139, 0.091591]
Standard error: 0.000378
Final t_dist CI (reward +- t_0.975*se_hat): [0.088124, 0.089606]
[I 2025-10-29 23:24:56,649] Trial 11 finished with value: 0.08613941791790695 and parameters: {'lr': 0.00010234300702907791, 'num_epochs': 3, 'batch_size': 512, 'num_neighbors': 15, 'lr_decay': 0.9470832246387869}. Best is trial 9 with value: 0.0864701818888323.

Trial 12 started


Best trial: 9. Best value: 0.0864702:  65%|██████▌   | 13/20 [10:44<05:48, 49.79s/it]

Train wi info: {'gini': np.float64(0.001753286651094121), 'ess': np.float64(14999.845449427175), 'max_wi': np.float64(1.0172748123550541), 'min_wi': np.float64(0.9832292289414252)}
actual reward: [0.0865402]
{'gini': np.float64(0.0017046346692686804), 'ess': np.float64(9999.906048893692), 'max_wi': np.float64(1.0144622858338646), 'min_wi': np.float64(0.9844857550901964)}
Estimated reward: 0.089045
Cross-validated error: 0.001379
Final score CI (reward +- 2*error): [0.086287, 0.091804]
Standard error: 0.000418
Final t_dist CI (reward +- t_0.975*se_hat): [0.088226, 0.089865]
[I 2025-10-29 23:25:46,755] Trial 12 finished with value: 0.08628670947681273 and parameters: {'lr': 0.0005867896756242938, 'num_epochs': 3, 'batch_size': 512, 'num_neighbors': 10, 'lr_decay': 0.9387336706791596}. Best is trial 9 with value: 0.0864701818888323.

Trial 13 started


Best trial: 9. Best value: 0.0864702:  70%|███████   | 14/20 [11:35<05:00, 50.15s/it]

Train wi info: {'gini': np.float64(0.004003235898853517), 'ess': np.float64(14999.204453939908), 'max_wi': np.float64(1.0397524335778217), 'min_wi': np.float64(0.962025039599802)}
actual reward: [0.08653999]
{'gini': np.float64(0.004038666350039672), 'ess': np.float64(9999.47378417384), 'max_wi': np.float64(1.0360996177160449), 'min_wi': np.float64(0.9680443689463115)}
Estimated reward: 0.088639
Cross-validated error: 0.001240
Final score CI (reward +- 2*error): [0.086159, 0.091120]
Standard error: 0.000446
Final t_dist CI (reward +- t_0.975*se_hat): [0.087766, 0.089513]
[I 2025-10-29 23:26:37,746] Trial 13 finished with value: 0.08615864297734355 and parameters: {'lr': 0.0005219369943293421, 'num_epochs': 3, 'batch_size': 64, 'num_neighbors': 7, 'lr_decay': 0.8494568902936281}. Best is trial 9 with value: 0.0864701818888323.

Trial 14 started


Best trial: 9. Best value: 0.0864702:  75%|███████▌  | 15/20 [12:24<04:10, 50.09s/it]

Train wi info: {'gini': np.float64(0.03587511386737878), 'ess': np.float64(14936.563784693948), 'max_wi': np.float64(1.5500164982365416), 'min_wi': np.float64(0.7086012344158366)}
actual reward: [0.08647009]
{'gini': np.float64(0.03420691943698142), 'ess': np.float64(9962.580107894071), 'max_wi': np.float64(1.3076882781496078), 'min_wi': np.float64(0.7469603040176537)}
Estimated reward: 0.087665
Cross-validated error: 0.001153
Final score CI (reward +- 2*error): [0.085359, 0.089972]
Standard error: 0.000489
Final t_dist CI (reward +- t_0.975*se_hat): [0.086707, 0.088624]
[I 2025-10-29 23:27:27,675] Trial 14 finished with value: 0.0853587323240005 and parameters: {'lr': 0.008282266658467397, 'num_epochs': 2, 'batch_size': 512, 'num_neighbors': 3, 'lr_decay': 0.9656399858680333}. Best is trial 9 with value: 0.0864701818888323.

Trial 15 started


Best trial: 9. Best value: 0.0864702:  80%|████████  | 16/20 [13:16<03:21, 50.49s/it]

Train wi info: {'gini': np.float64(0.008231384712245783), 'ess': np.float64(14996.66337747038), 'max_wi': np.float64(1.074135637543298), 'min_wi': np.float64(0.9356747825275316)}
actual reward: [0.08653073]
{'gini': np.float64(0.008685063856250145), 'ess': np.float64(9997.57309356058), 'max_wi': np.float64(1.074148282702316), 'min_wi': np.float64(0.9199591286846236)}
Estimated reward: 0.088937
Cross-validated error: 0.001394
Final score CI (reward +- 2*error): [0.086149, 0.091726]
Standard error: 0.000390
Final t_dist CI (reward +- t_0.975*se_hat): [0.088172, 0.089702]
[I 2025-10-29 23:28:19,091] Trial 15 finished with value: 0.08614853758995487 and parameters: {'lr': 0.0010493632361863149, 'num_epochs': 4, 'batch_size': 64, 'num_neighbors': 13, 'lr_decay': 0.8989245424620307}. Best is trial 9 with value: 0.0864701818888323.

Trial 16 started


Best trial: 9. Best value: 0.0864702:  85%|████████▌ | 17/20 [14:07<02:31, 50.55s/it]

Train wi info: {'gini': np.float64(0.0885493060118231), 'ess': np.float64(14617.837283617426), 'max_wi': np.float64(2.052476144762335), 'min_wi': np.float64(0.4775874675682365)}
actual reward: [0.08638135]
{'gini': np.float64(0.08354289743111466), 'ess': np.float64(9776.978961864079), 'max_wi': np.float64(2.074907450668041), 'min_wi': np.float64(0.5284766533762602)}
Estimated reward: 0.088735
Cross-validated error: 0.001340
Final score CI (reward +- 2*error): [0.086055, 0.091415]
Standard error: 0.000418
Final t_dist CI (reward +- t_0.975*se_hat): [0.087915, 0.089555]
[I 2025-10-29 23:29:09,789] Trial 16 finished with value: 0.08605483012833882 and parameters: {'lr': 0.009896978013704575, 'num_epochs': 1, 'batch_size': 128, 'num_neighbors': 10, 'lr_decay': 0.8520988332787787}. Best is trial 9 with value: 0.0864701818888323.

Trial 17 started


Best trial: 9. Best value: 0.0864702:  90%|█████████ | 18/20 [14:57<01:40, 50.47s/it]

Train wi info: {'gini': np.float64(0.0005714880449025204), 'ess': np.float64(14999.983431531145), 'max_wi': np.float64(1.0071361225714512), 'min_wi': np.float64(0.9943226654503794)}
actual reward: [0.08653991]
{'gini': np.float64(0.0005342362457207771), 'ess': np.float64(9999.990649156907), 'max_wi': np.float64(1.0060219341051337), 'min_wi': np.float64(0.9952315289066649)}
Estimated reward: 0.088783
Cross-validated error: 0.001324
Final score CI (reward +- 2*error): [0.086135, 0.091432]
Standard error: 0.000385
Final t_dist CI (reward +- t_0.975*se_hat): [0.088028, 0.089538]
[I 2025-10-29 23:30:00,087] Trial 17 finished with value: 0.08613452989593826 and parameters: {'lr': 0.0002027524570197657, 'num_epochs': 2, 'batch_size': 256, 'num_neighbors': 14, 'lr_decay': 0.8289235851020151}. Best is trial 9 with value: 0.0864701818888323.

Trial 18 started


Best trial: 9. Best value: 0.0864702:  95%|█████████▌| 19/20 [15:47<00:50, 50.22s/it]

Train wi info: {'gini': np.float64(0.004258687408258315), 'ess': np.float64(14999.102603395944), 'max_wi': np.float64(1.0404459710137208), 'min_wi': np.float64(0.9645295059352461)}
actual reward: [0.08653811]
{'gini': np.float64(0.004284587532293961), 'ess': np.float64(9999.408803196207), 'max_wi': np.float64(1.0325169162414054), 'min_wi': np.float64(0.9642340017043597)}
Estimated reward: 0.088968
Cross-validated error: 0.001325
Final score CI (reward +- 2*error): [0.086318, 0.091618]
Standard error: 0.000411
Final t_dist CI (reward +- t_0.975*se_hat): [0.088163, 0.089773]
[I 2025-10-29 23:30:49,728] Trial 18 finished with value: 0.08631800079160452 and parameters: {'lr': 0.0015420045958438756, 'num_epochs': 4, 'batch_size': 512, 'num_neighbors': 11, 'lr_decay': 0.874724814856287}. Best is trial 9 with value: 0.0864701818888323.

Trial 19 started


Best trial: 9. Best value: 0.0864702: 100%|██████████| 20/20 [16:37<00:00, 49.89s/it]

Train wi info: {'gini': np.float64(0.0038997351414210984), 'ess': np.float64(14999.254789295452), 'max_wi': np.float64(1.043412333875427), 'min_wi': np.float64(0.9645372290810577)}
actual reward: [0.08653813]
{'gini': np.float64(0.003776951437468392), 'ess': np.float64(9999.543448183622), 'max_wi': np.float64(1.0338350171542912), 'min_wi': np.float64(0.9706413167283172)}
Estimated reward: 0.088738
Cross-validated error: 0.001460
Final score CI (reward +- 2*error): [0.085818, 0.091659]
Standard error: 0.000436
Final t_dist CI (reward +- t_0.975*se_hat): [0.087883, 0.089594]
[I 2025-10-29 23:31:40,550] Trial 19 finished with value: 0.08581828432786082 and parameters: {'lr': 0.00041123574827441777, 'num_epochs': 2, 'batch_size': 64, 'num_neighbors': 8, 'lr_decay': 0.915231414544563}. Best is trial 9 with value: 0.0864701818888323.





Num samples is 10000
{'gini': np.float64(0.47317254545286797), 'ess': np.float64(4208.429618140219), 'max_wi': np.float64(26.142866325336392), 'min_wi': np.float64(0.0062482056410791366)}


Unnamed: 0,policy_rewards,ipw,reg_dm,conv_dm,conv_dr,conv_sndr,action_diff_to_real,action_delta,context_diff_to_real,context_delta
0,0.08653967,0.0866,0.08656501,0.08669117,0.08551839,0.08551839,0.82469903,0.0,0.72168239,0.0
15000,0.08654042,0.0771011,0.08779074,0.0889153,0.07702781,0.07702758,0.82468488,0.00034299,0.721671,0.0004591


In [None]:
dataset_params = dict(
                    n_actions= 500,
                    n_users = 500,
                    emb_dim = 16,
                    # sigma = 0.1,
                    eps = 0.6, # this is the epsilon for the noise in the ground truth policy representation
                    ctr = 0.1
                    )

train_dataset = generate_dataset(dataset_params, seed=40000)

Random Item CTR: 0.07053370144999074
Optimal greedy CTR: 0.09999936716169436
Optimal Stochastic CTR: 0.09995563088920843
Our Initial CTR: 0.08622184481781218


In [None]:
# Run the optimization
df8, best_hyperparams_by_size = trainer_trial(num_runs, num_neighbors, num_rounds_list, train_dataset, batch_size, val_size=10000, n_trials=n_trials_for_optuna, prev_best_params=best_params_to_use)

# Show the performance metrics
df8[['policy_rewards', 'ipw', 'reg_dm', 'conv_dm', 'conv_dr', 'conv_sndr', 'action_diff_to_real', 'action_delta', 'context_diff_to_real', 'context_delta']]

Num samples is 10000
{'gini': np.float64(0.4619849198177038), 'ess': np.float64(4469.8281246147635), 'max_wi': np.float64(24.209173244022438), 'min_wi': np.float64(0.011780621871838282)}


[I 2025-10-29 23:32:51,382] A new study created in memory with name: no-name-a308de91-277c-443a-b6f3-741a57e32284
  0%|          | 0/20 [00:00<?, ?it/s]


Trial 0 started


Best trial: 0. Best value: 0.0766376:   5%|▌         | 1/20 [00:40<12:44, 40.24s/it]

Train wi info: {'gini': np.float64(0.5978836988482632), 'ess': np.float64(584.1645131553638), 'max_wi': np.float64(254.72112163748818), 'min_wi': np.float64(5.413791320068287e-08)}
actual reward: [0.07073242]
{'gini': np.float64(0.6326912024315681), 'ess': np.float64(361.12048268412855), 'max_wi': np.float64(275.55849406163674), 'min_wi': np.float64(4.4788291658694265e-05)}
Estimated reward: 0.084581
Cross-validated error: 0.003972
Final score CI (reward +- 2*error): [0.076638, 0.092524]
Standard error: 0.000374
Final t_dist CI (reward +- t_0.975*se_hat): [0.083848, 0.085314]
[I 2025-10-29 23:33:31,616] Trial 0 finished with value: 0.0766376312209124 and parameters: {'lr': 0.096, 'num_epochs': 5, 'batch_size': 64, 'num_neighbors': 8, 'lr_decay': 0.85}. Best is trial 0 with value: 0.0766376312209124.

Trial 1 started


Best trial: 1. Best value: 0.0863948:  10%|█         | 2/20 [01:30<13:54, 46.36s/it]

Train wi info: {'gini': np.float64(0.015659344801211596), 'ess': np.float64(14987.850092687553), 'max_wi': np.float64(1.2020596277298043), 'min_wi': np.float64(0.8488521136549332)}
actual reward: [0.08619571]
{'gini': np.float64(0.016238004678726364), 'ess': np.float64(9991.447565728718), 'max_wi': np.float64(1.1701002516010508), 'min_wi': np.float64(0.8665897338568643)}
Estimated reward: 0.089104
Cross-validated error: 0.001355
Final score CI (reward +- 2*error): [0.086395, 0.091813]
Standard error: 0.000365
Final t_dist CI (reward +- t_0.975*se_hat): [0.088388, 0.089819]
[I 2025-10-29 23:34:22,264] Trial 1 finished with value: 0.08639478311017765 and parameters: {'lr': 0.0032279139619765655, 'num_epochs': 4, 'batch_size': 256, 'num_neighbors': 5, 'lr_decay': 0.9360103428302388}. Best is trial 1 with value: 0.08639478311017765.

Trial 2 started


Best trial: 1. Best value: 0.0863948:  15%|█▌        | 3/20 [02:22<13:46, 48.60s/it]

Train wi info: {'gini': np.float64(0.4447216271590765), 'ess': np.float64(1487.6060580204396), 'max_wi': np.float64(322.7334268199771), 'min_wi': np.float64(7.606607671241612e-05)}
actual reward: [0.0728065]
{'gini': np.float64(0.47259621718520556), 'ess': np.float64(3549.5480598517197), 'max_wi': np.float64(59.645472080755006), 'min_wi': np.float64(1.173720977603584e-09)}
Estimated reward: 0.086521
Cross-validated error: 0.002625
Final score CI (reward +- 2*error): [0.081270, 0.091771]
Standard error: 0.000295
Final t_dist CI (reward +- t_0.975*se_hat): [0.085942, 0.087100]
[I 2025-10-29 23:35:13,522] Trial 2 finished with value: 0.08127015753186864 and parameters: {'lr': 0.058300554076307125, 'num_epochs': 4, 'batch_size': 64, 'num_neighbors': 15, 'lr_decay': 0.9040323827262895}. Best is trial 1 with value: 0.08639478311017765.

Trial 3 started


Best trial: 1. Best value: 0.0863948:  20%|██        | 4/20 [03:11<13:04, 49.03s/it]

Train wi info: {'gini': np.float64(0.0012041118993918055), 'ess': np.float64(14999.928532990007), 'max_wi': np.float64(1.0115908915691745), 'min_wi': np.float64(0.9818841446978082)}
actual reward: [0.08622235]
{'gini': np.float64(0.0012219781218071066), 'ess': np.float64(9999.951374457063), 'max_wi': np.float64(1.010824875016595), 'min_wi': np.float64(0.9897593359186633)}
Estimated reward: 0.088588
Cross-validated error: 0.001302
Final score CI (reward +- 2*error): [0.085984, 0.091191]
Standard error: 0.000291
Final t_dist CI (reward +- t_0.975*se_hat): [0.088018, 0.089157]
[I 2025-10-29 23:36:03,225] Trial 3 finished with value: 0.08598385697573967 and parameters: {'lr': 0.0006638403861705371, 'num_epochs': 4, 'batch_size': 512, 'num_neighbors': 14, 'lr_decay': 0.8058197564875161}. Best is trial 1 with value: 0.08639478311017765.

Trial 4 started


Best trial: 1. Best value: 0.0863948:  25%|██▌       | 5/20 [04:01<12:17, 49.14s/it]

Train wi info: {'gini': np.float64(0.011458738517940773), 'ess': np.float64(14993.504592007577), 'max_wi': np.float64(1.1100732108077471), 'min_wi': np.float64(0.9007584013510204)}
actual reward: [0.08620276]
{'gini': np.float64(0.01334787333142275), 'ess': np.float64(9994.197080558493), 'max_wi': np.float64(1.114511704288138), 'min_wi': np.float64(0.907917947794189)}
Estimated reward: 0.088850
Cross-validated error: 0.001292
Final score CI (reward +- 2*error): [0.086266, 0.091434]
Standard error: 0.000371
Final t_dist CI (reward +- t_0.975*se_hat): [0.088123, 0.089577]
[I 2025-10-29 23:36:52,562] Trial 4 finished with value: 0.08626552946267839 and parameters: {'lr': 0.006001912848819504, 'num_epochs': 6, 'batch_size': 512, 'num_neighbors': 4, 'lr_decay': 0.8201023792932916}. Best is trial 1 with value: 0.08639478311017765.

Trial 5 started


Best trial: 1. Best value: 0.0863948:  30%|███       | 6/20 [04:51<11:34, 49.63s/it]

Train wi info: {'gini': np.float64(0.15816996541589723), 'ess': np.float64(13759.239565747563), 'max_wi': np.float64(4.0521754916532675), 'min_wi': np.float64(0.0013037306008111017)}
actual reward: [0.08337927]
{'gini': np.float64(0.1982604226089482), 'ess': np.float64(8743.666500366728), 'max_wi': np.float64(6.421703774029895), 'min_wi': np.float64(0.0005216853704965476)}
Estimated reward: 0.088849
Cross-validated error: 0.001778
Final score CI (reward +- 2*error): [0.085293, 0.092404]
Standard error: 0.000302
Final t_dist CI (reward +- t_0.975*se_hat): [0.088258, 0.089440]
[I 2025-10-29 23:37:43,141] Trial 5 finished with value: 0.08529339079314192 and parameters: {'lr': 0.027713200784844533, 'num_epochs': 5, 'batch_size': 128, 'num_neighbors': 12, 'lr_decay': 0.8999045067396684}. Best is trial 1 with value: 0.08639478311017765.

Trial 6 started


Best trial: 1. Best value: 0.0863948:  35%|███▌      | 7/20 [05:43<10:55, 50.46s/it]

Train wi info: {'gini': np.float64(0.0021836214456540074), 'ess': np.float64(14999.762799076507), 'max_wi': np.float64(1.022879769739052), 'min_wi': np.float64(0.980229272853177)}
actual reward: [0.08622089]
{'gini': np.float64(0.0026243587371906174), 'ess': np.float64(9999.77369410286), 'max_wi': np.float64(1.0272188693896944), 'min_wi': np.float64(0.9791443198337692)}
Estimated reward: 0.088552
Cross-validated error: 0.001226
Final score CI (reward +- 2*error): [0.086100, 0.091004]
Standard error: 0.000294
Final t_dist CI (reward +- t_0.975*se_hat): [0.087976, 0.089129]
[I 2025-10-29 23:38:35,311] Trial 6 finished with value: 0.08610030658382503 and parameters: {'lr': 0.0008471873045741547, 'num_epochs': 7, 'batch_size': 128, 'num_neighbors': 13, 'lr_decay': 0.8312219319257825}. Best is trial 1 with value: 0.08639478311017765.

Trial 7 started


Best trial: 1. Best value: 0.0863948:  40%|████      | 8/20 [06:34<10:06, 50.53s/it]

Train wi info: {'gini': np.float64(0.06688670036192337), 'ess': np.float64(14769.315898798377), 'max_wi': np.float64(1.8058678453790125), 'min_wi': np.float64(0.23258792075377602)}
actual reward: [0.08579684]
{'gini': np.float64(0.0731823672177685), 'ess': np.float64(9819.123139487176), 'max_wi': np.float64(1.7462171551474026), 'min_wi': np.float64(0.21599073409348113)}
Estimated reward: 0.088894
Cross-validated error: 0.001427
Final score CI (reward +- 2*error): [0.086041, 0.091748]
Standard error: 0.000306
Final t_dist CI (reward +- t_0.975*se_hat): [0.088294, 0.089495]
[I 2025-10-29 23:39:25,974] Trial 7 finished with value: 0.08604051445511637 and parameters: {'lr': 0.01861945581614173, 'num_epochs': 5, 'batch_size': 512, 'num_neighbors': 11, 'lr_decay': 0.9041891761271436}. Best is trial 1 with value: 0.08639478311017765.

Trial 8 started


Best trial: 1. Best value: 0.0863948:  45%|████▌     | 9/20 [07:26<09:19, 50.83s/it]

Train wi info: {'gini': np.float64(0.017637601354046843), 'ess': np.float64(14983.596805734287), 'max_wi': np.float64(1.2165595001370215), 'min_wi': np.float64(0.6827001595750402)}
actual reward: [0.08618433]
{'gini': np.float64(0.020237948223251704), 'ess': np.float64(9985.849464811392), 'max_wi': np.float64(1.1808012137514943), 'min_wi': np.float64(0.7115168109099061)}
Estimated reward: 0.088815
Cross-validated error: 0.001357
Final score CI (reward +- 2*error): [0.086100, 0.091529]
Standard error: 0.000333
Final t_dist CI (reward +- t_0.975*se_hat): [0.088162, 0.089467]
[I 2025-10-29 23:40:17,481] Trial 8 finished with value: 0.08609989279659712 and parameters: {'lr': 0.003195117912023243, 'num_epochs': 5, 'batch_size': 64, 'num_neighbors': 8, 'lr_decay': 0.8056016019746369}. Best is trial 1 with value: 0.08639478311017765.

Trial 9 started


Best trial: 1. Best value: 0.0863948:  50%|█████     | 10/20 [08:08<08:01, 48.14s/it]

Train wi info: {'gini': np.float64(0.24605097299372783), 'ess': np.float64(12013.51820565028), 'max_wi': np.float64(7.426877305802824), 'min_wi': np.float64(0.03755889694441543)}
actual reward: [0.08502044]
{'gini': np.float64(0.2396716800067743), 'ess': np.float64(8161.790737824482), 'max_wi': np.float64(5.809916208121122), 'min_wi': np.float64(0.027551358157450502)}
Estimated reward: 0.089221
Cross-validated error: 0.001716
Final score CI (reward +- 2*error): [0.085789, 0.092653]
Standard error: 0.000365
Final t_dist CI (reward +- t_0.975*se_hat): [0.088505, 0.089937]
[I 2025-10-29 23:40:59,580] Trial 9 finished with value: 0.08578854562510081 and parameters: {'lr': 0.03930392450947298, 'num_epochs': 1, 'batch_size': 256, 'num_neighbors': 5, 'lr_decay': 0.8703485312314593}. Best is trial 1 with value: 0.08639478311017765.

Trial 10 started


Best trial: 1. Best value: 0.0863948:  55%|█████▌    | 11/20 [08:58<07:19, 48.82s/it]

Train wi info: {'gini': np.float64(0.0003156719919631676), 'ess': np.float64(14999.994977002916), 'max_wi': np.float64(1.0032776828626924), 'min_wi': np.float64(0.9970790873564669)}
actual reward: [0.08622212]
{'gini': np.float64(0.00030501194787555696), 'ess': np.float64(9999.996963373977), 'max_wi': np.float64(1.0028015464758657), 'min_wi': np.float64(0.9971017367339211)}
Estimated reward: 0.088934
Cross-validated error: 0.001324
Final score CI (reward +- 2*error): [0.086286, 0.091583]
Standard error: 0.000353
Final t_dist CI (reward +- t_0.975*se_hat): [0.088242, 0.089626]
[I 2025-10-29 23:41:49,967] Trial 10 finished with value: 0.08628553861587009 and parameters: {'lr': 0.00011399455455779964, 'num_epochs': 2, 'batch_size': 256, 'num_neighbors': 6, 'lr_decay': 0.9870648053763567}. Best is trial 1 with value: 0.08639478311017765.

Trial 11 started


Best trial: 1. Best value: 0.0863948:  60%|██████    | 12/20 [09:47<06:30, 48.83s/it]

Train wi info: {'gini': np.float64(0.00030962532154335634), 'ess': np.float64(14999.99493110626), 'max_wi': np.float64(1.0027339136598095), 'min_wi': np.float64(0.9956030346393706)}
actual reward: [0.08622217]
{'gini': np.float64(0.00029249646769466707), 'ess': np.float64(9999.9970812324), 'max_wi': np.float64(1.0026399475702104), 'min_wi': np.float64(0.9967977384064856)}
Estimated reward: 0.088949
Cross-validated error: 0.001286
Final score CI (reward +- 2*error): [0.086377, 0.091522]
Standard error: 0.000355
Final t_dist CI (reward +- t_0.975*se_hat): [0.088254, 0.089645]
[I 2025-10-29 23:42:38,805] Trial 11 finished with value: 0.08637711382101464 and parameters: {'lr': 0.00011807178543555344, 'num_epochs': 1, 'batch_size': 256, 'num_neighbors': 6, 'lr_decay': 0.9871666650350034}. Best is trial 1 with value: 0.08639478311017765.

Trial 12 started


Best trial: 1. Best value: 0.0863948:  65%|██████▌   | 13/20 [10:38<05:46, 49.51s/it]

Train wi info: {'gini': np.float64(0.00030613074427236027), 'ess': np.float64(14999.995300486007), 'max_wi': np.float64(1.0029258730766764), 'min_wi': np.float64(0.9968360384527031)}
actual reward: [0.08622218]
{'gini': np.float64(0.0003077449487509358), 'ess': np.float64(9999.996893756055), 'max_wi': np.float64(1.0028433567491293), 'min_wi': np.float64(0.9965096711371745)}
Estimated reward: 0.088756
Cross-validated error: 0.001343
Final score CI (reward +- 2*error): [0.086070, 0.091441]
Standard error: 0.000399
Final t_dist CI (reward +- t_0.975*se_hat): [0.087974, 0.089537]
[I 2025-10-29 23:43:29,880] Trial 12 finished with value: 0.08606993974573356 and parameters: {'lr': 0.00010954889072580442, 'num_epochs': 10, 'batch_size': 256, 'num_neighbors': 3, 'lr_decay': 0.9875229589587154}. Best is trial 1 with value: 0.08639478311017765.

Trial 13 started


Best trial: 13. Best value: 0.0865602:  70%|███████   | 14/20 [11:31<05:03, 50.51s/it]

Train wi info: {'gini': np.float64(0.004312931964225806), 'ess': np.float64(14999.094084294518), 'max_wi': np.float64(1.0377579645635286), 'min_wi': np.float64(0.960736169839096)}
actual reward: [0.08622079]
{'gini': np.float64(0.004204875267537807), 'ess': np.float64(9999.433996740643), 'max_wi': np.float64(1.0349504752721217), 'min_wi': np.float64(0.960736169839096)}
Estimated reward: 0.089065
Cross-validated error: 0.001252
Final score CI (reward +- 2*error): [0.086560, 0.091570]
Standard error: 0.000353
Final t_dist CI (reward +- t_0.975*se_hat): [0.088372, 0.089758]
[I 2025-10-29 23:44:22,713] Trial 13 finished with value: 0.08656017892556978 and parameters: {'lr': 0.0008313568117582568, 'num_epochs': 2, 'batch_size': 256, 'num_neighbors': 6, 'lr_decay': 0.9503969043212386}. Best is trial 13 with value: 0.08656017892556978.

Trial 14 started


Best trial: 13. Best value: 0.0865602:  75%|███████▌  | 15/20 [12:23<04:15, 51.08s/it]

Train wi info: {'gini': np.float64(0.0031065221419325703), 'ess': np.float64(14999.520526085118), 'max_wi': np.float64(1.0292452407879595), 'min_wi': np.float64(0.9623186456922703)}
actual reward: [0.08622059]
{'gini': np.float64(0.003025368824382033), 'ess': np.float64(9999.700723255126), 'max_wi': np.float64(1.0209943330981956), 'min_wi': np.float64(0.9715038512537182)}
Estimated reward: 0.089117
Cross-validated error: 0.001383
Final score CI (reward +- 2*error): [0.086351, 0.091883]
Standard error: 0.000315
Final t_dist CI (reward +- t_0.975*se_hat): [0.088499, 0.089735]
[I 2025-10-29 23:45:15,108] Trial 14 finished with value: 0.08635149679093262 and parameters: {'lr': 0.0006499768380259656, 'num_epochs': 3, 'batch_size': 256, 'num_neighbors': 10, 'lr_decay': 0.9492614311055781}. Best is trial 13 with value: 0.08656017892556978.

Trial 15 started


Best trial: 13. Best value: 0.0865602:  80%|████████  | 16/20 [13:19<03:30, 52.59s/it]

Train wi info: {'gini': np.float64(0.0072950703141422615), 'ess': np.float64(14997.35870028251), 'max_wi': np.float64(1.0815551562929713), 'min_wi': np.float64(0.927361411938522)}
actual reward: [0.08621398]
{'gini': np.float64(0.007901116710787474), 'ess': np.float64(9997.954061670904), 'max_wi': np.float64(1.0834541298872786), 'min_wi': np.float64(0.9277385910598004)}
Estimated reward: 0.088966
Cross-validated error: 0.001374
Final score CI (reward +- 2*error): [0.086219, 0.091713]
Standard error: 0.000339
Final t_dist CI (reward +- t_0.975*se_hat): [0.088301, 0.089631]
[I 2025-10-29 23:46:11,189] Trial 15 finished with value: 0.08621878569288853 and parameters: {'lr': 0.0018335335636432734, 'num_epochs': 8, 'batch_size': 256, 'num_neighbors': 7, 'lr_decay': 0.947158358463077}. Best is trial 13 with value: 0.08656017892556978.

Trial 16 started


Best trial: 13. Best value: 0.0865602:  85%|████████▌ | 17/20 [14:05<02:31, 50.36s/it]

Train wi info: {'gini': np.float64(0.04120958548855171), 'ess': np.float64(14915.351421749121), 'max_wi': np.float64(1.4584762758948002), 'min_wi': np.float64(0.5885947943530271)}
actual reward: [0.08612441]
{'gini': np.float64(0.04201270687493693), 'ess': np.float64(9942.500746571452), 'max_wi': np.float64(1.3654925859821403), 'min_wi': np.float64(0.634166971627863)}
Estimated reward: 0.088780
Cross-validated error: 0.001359
Final score CI (reward +- 2*error): [0.086063, 0.091498]
Standard error: 0.000397
Final t_dist CI (reward +- t_0.975*se_hat): [0.088001, 0.089559]
[I 2025-10-29 23:46:56,382] Trial 16 finished with value: 0.08606276204759057 and parameters: {'lr': 0.007556635345166662, 'num_epochs': 3, 'batch_size': 256, 'num_neighbors': 3, 'lr_decay': 0.9415715313648882}. Best is trial 13 with value: 0.08656017892556978.

Trial 17 started


Best trial: 13. Best value: 0.0865602:  90%|█████████ | 18/20 [14:54<01:40, 50.23s/it]

Train wi info: {'gini': np.float64(0.001713777423453065), 'ess': np.float64(14999.853570480946), 'max_wi': np.float64(1.0164549053989294), 'min_wi': np.float64(0.981898946801318)}
actual reward: [0.08622263]
{'gini': np.float64(0.0016520615648294348), 'ess': np.float64(9999.911227747732), 'max_wi': np.float64(1.0163301697110694), 'min_wi': np.float64(0.9884843729026995)}
Estimated reward: 0.089013
Cross-validated error: 0.001340
Final score CI (reward +- 2*error): [0.086334, 0.091693]
Standard error: 0.000363
Final t_dist CI (reward +- t_0.975*se_hat): [0.088301, 0.089726]
[I 2025-10-29 23:47:46,312] Trial 17 finished with value: 0.08633391679191237 and parameters: {'lr': 0.00040595181219876234, 'num_epochs': 2, 'batch_size': 256, 'num_neighbors': 5, 'lr_decay': 0.9300696208081568}. Best is trial 13 with value: 0.08656017892556978.

Trial 18 started


Best trial: 13. Best value: 0.0865602:  95%|█████████▌| 19/20 [15:45<00:50, 50.44s/it]

Train wi info: {'gini': np.float64(0.013976063662075258), 'ess': np.float64(14990.330270145507), 'max_wi': np.float64(1.143844832302929), 'min_wi': np.float64(0.8624492438853154)}
actual reward: [0.08621685]
{'gini': np.float64(0.014054671459171383), 'ess': np.float64(9993.611044745907), 'max_wi': np.float64(1.1136849109643847), 'min_wi': np.float64(0.8947909210474199)}
Estimated reward: 0.088821
Cross-validated error: 0.001455
Final score CI (reward +- 2*error): [0.085911, 0.091731]
Standard error: 0.000323
Final t_dist CI (reward +- t_0.975*se_hat): [0.088188, 0.089454]
[I 2025-10-29 23:48:37,240] Trial 18 finished with value: 0.0859105206622413 and parameters: {'lr': 0.0019558798762453027, 'num_epochs': 3, 'batch_size': 128, 'num_neighbors': 9, 'lr_decay': 0.9183017950207495}. Best is trial 13 with value: 0.08656017892556978.

Trial 19 started


Best trial: 13. Best value: 0.0865602: 100%|██████████| 20/20 [16:36<00:00, 49.85s/it]

Train wi info: {'gini': np.float64(0.0010793670251309823), 'ess': np.float64(14999.942065276055), 'max_wi': np.float64(1.0114085071882766), 'min_wi': np.float64(0.9864329664383559)}
actual reward: [0.08622249]
{'gini': np.float64(0.0010930030041018546), 'ess': np.float64(9999.960816091478), 'max_wi': np.float64(1.010312908777103), 'min_wi': np.float64(0.9876586393756517)}
Estimated reward: 0.089102
Cross-validated error: 0.001365
Final score CI (reward +- 2*error): [0.086372, 0.091832]
Standard error: 0.000342
Final t_dist CI (reward +- t_0.975*se_hat): [0.088432, 0.089772]
[I 2025-10-29 23:49:28,325] Trial 19 finished with value: 0.08637199034576998 and parameters: {'lr': 0.00029571771250444057, 'num_epochs': 7, 'batch_size': 256, 'num_neighbors': 7, 'lr_decay': 0.9732130603730548}. Best is trial 13 with value: 0.08656017892556978.





Num samples is 10000
{'gini': np.float64(0.4672203367029539), 'ess': np.float64(4337.54526331691), 'max_wi': np.float64(31.43189456680396), 'min_wi': np.float64(0.019415905542555838)}


Unnamed: 0,policy_rewards,ipw,reg_dm,conv_dm,conv_dr,conv_sndr,action_diff_to_real,action_delta,context_diff_to_real,context_delta
0,0.08622184,0.0884,0.08855226,0.09106993,0.08852082,0.08852082,0.92210476,0.0,0.83772226,0.0
15000,0.08622071,0.07708458,0.08807528,0.08898085,0.0757644,0.07576369,0.92197805,0.00253678,0.83771236,0.00303496


In [None]:
dataset_params = dict(
                    n_actions= 500,
                    n_users = 500,
                    emb_dim = 16,
                    # sigma = 0.1,
                    eps = 0.6, # this is the epsilon for the noise in the ground truth policy representation
                    ctr = 0.1
                    )

train_dataset = generate_dataset(dataset_params, seed=50000)

Random Item CTR: 0.0705882181025533
Optimal greedy CTR: 0.09999934164533562
Optimal Stochastic CTR: 0.09995498601895662
Our Initial CTR: 0.08647501952799874


In [None]:
# Run the optimization
df9, best_hyperparams_by_size = trainer_trial(num_runs, num_neighbors, num_rounds_list, train_dataset, batch_size, val_size=10000, n_trials=n_trials_for_optuna, prev_best_params=best_params_to_use)

# Show the performance metrics
df9[['policy_rewards', 'ipw', 'reg_dm', 'conv_dm', 'conv_dr', 'conv_sndr', 'action_diff_to_real', 'action_delta', 'context_diff_to_real', 'context_delta']]

Num samples is 10000
{'gini': np.float64(0.49847821557978683), 'ess': np.float64(3945.2567055809536), 'max_wi': np.float64(32.439454875391334), 'min_wi': np.float64(0.006670832088175685)}


[I 2025-10-29 23:50:46,492] A new study created in memory with name: no-name-4c07dae2-9632-4353-85aa-76944ab38e61
  0%|          | 0/20 [00:00<?, ?it/s]


Trial 0 started


Best trial: 0. Best value: 0.0803135:   5%|▌         | 1/20 [00:41<13:16, 41.90s/it]

Train wi info: {'gini': np.float64(0.5405386868433234), 'ess': np.float64(471.9726311635205), 'max_wi': np.float64(394.39940460067965), 'min_wi': np.float64(1.1884447622076383e-07)}
actual reward: [0.07110314]
{'gini': np.float64(0.6001936644532129), 'ess': np.float64(297.5981305282253), 'max_wi': np.float64(266.3065819300526), 'min_wi': np.float64(2.499720611856428e-06)}
Estimated reward: 0.087618
Cross-validated error: 0.003652
Final score CI (reward +- 2*error): [0.080313, 0.094923]
Standard error: 0.000298
Final t_dist CI (reward +- t_0.975*se_hat): [0.087034, 0.088202]
[I 2025-10-29 23:51:28,391] Trial 0 finished with value: 0.08031347792721162 and parameters: {'lr': 0.096, 'num_epochs': 5, 'batch_size': 64, 'num_neighbors': 8, 'lr_decay': 0.85}. Best is trial 0 with value: 0.08031347792721162.

Trial 1 started


Best trial: 1. Best value: 0.0866396:  10%|█         | 2/20 [01:25<12:52, 42.90s/it]

Train wi info: {'gini': np.float64(0.004321097216537783), 'ess': np.float64(14999.080828130871), 'max_wi': np.float64(1.043525849018201), 'min_wi': np.float64(0.9563259443118183)}
actual reward: [0.08647668]
{'gini': np.float64(0.0042337695627619664), 'ess': np.float64(9999.4206200111), 'max_wi': np.float64(1.03804995300518), 'min_wi': np.float64(0.966053617677594)}
Estimated reward: 0.089079
Cross-validated error: 0.001220
Final score CI (reward +- 2*error): [0.086640, 0.091518]
Standard error: 0.000243
Final t_dist CI (reward +- t_0.975*se_hat): [0.088602, 0.089556]
[I 2025-10-29 23:52:11,988] Trial 1 finished with value: 0.0866395763792468 and parameters: {'lr': 0.0014600371063330769, 'num_epochs': 4, 'batch_size': 512, 'num_neighbors': 15, 'lr_decay': 0.8890865609685429}. Best is trial 1 with value: 0.0866395763792468.

Trial 2 started


Best trial: 1. Best value: 0.0866396:  15%|█▌        | 3/20 [02:15<13:03, 46.08s/it]

Train wi info: {'gini': np.float64(0.026914281909151225), 'ess': np.float64(14964.215755400635), 'max_wi': np.float64(1.2545435412026917), 'min_wi': np.float64(0.7755654188133838)}
actual reward: [0.08647649]
{'gini': np.float64(0.02530337569427735), 'ess': np.float64(9979.193900193299), 'max_wi': np.float64(1.2040258662340402), 'min_wi': np.float64(0.7755654188133838)}
Estimated reward: 0.088628
Cross-validated error: 0.001065
Final score CI (reward +- 2*error): [0.086498, 0.090758]
Standard error: 0.000271
Final t_dist CI (reward +- t_0.975*se_hat): [0.088097, 0.089159]
[I 2025-10-29 23:53:01,851] Trial 2 finished with value: 0.08649820368449856 and parameters: {'lr': 0.0020544988244097094, 'num_epochs': 1, 'batch_size': 64, 'num_neighbors': 9, 'lr_decay': 0.8733893337605216}. Best is trial 1 with value: 0.0866395763792468.

Trial 3 started


Best trial: 1. Best value: 0.0866396:  20%|██        | 4/20 [03:05<12:40, 47.56s/it]

Train wi info: {'gini': np.float64(0.42420662649770274), 'ess': np.float64(4403.965221497911), 'max_wi': np.float64(128.73832582765812), 'min_wi': np.float64(0.0002392105465334949)}
actual reward: [0.08204853]
{'gini': np.float64(0.42492546265760767), 'ess': np.float64(4346.374024276518), 'max_wi': np.float64(43.09444122331084), 'min_wi': np.float64(5.278531783971909e-05)}
Estimated reward: 0.088514
Cross-validated error: 0.002387
Final score CI (reward +- 2*error): [0.083739, 0.093288]
Standard error: 0.000253
Final t_dist CI (reward +- t_0.975*se_hat): [0.088017, 0.089010]
[I 2025-10-29 23:53:51,675] Trial 3 finished with value: 0.08373906372756865 and parameters: {'lr': 0.05265003940349923, 'num_epochs': 1, 'batch_size': 128, 'num_neighbors': 13, 'lr_decay': 0.8139645983456698}. Best is trial 1 with value: 0.0866395763792468.

Trial 4 started


Best trial: 4. Best value: 0.0868192:  25%|██▌       | 5/20 [03:54<12:01, 48.12s/it]

Train wi info: {'gini': np.float64(0.0003505975737163031), 'ess': np.float64(14999.993531887692), 'max_wi': np.float64(1.003926098812466), 'min_wi': np.float64(0.9957349192759894)}
actual reward: [0.08647538]
{'gini': np.float64(0.0003434897859960761), 'ess': np.float64(9999.996012854863), 'max_wi': np.float64(1.003926098812466), 'min_wi': np.float64(0.9958072792136617)}
Estimated reward: 0.089378
Cross-validated error: 0.001279
Final score CI (reward +- 2*error): [0.086819, 0.091937]
Standard error: 0.000334
Final t_dist CI (reward +- t_0.975*se_hat): [0.088723, 0.090033]
[I 2025-10-29 23:54:40,780] Trial 4 finished with value: 0.08681922701747848 and parameters: {'lr': 0.00017890052462954633, 'num_epochs': 2, 'batch_size': 512, 'num_neighbors': 3, 'lr_decay': 0.9295908846744976}. Best is trial 4 with value: 0.08681922701747848.

Trial 5 started


Best trial: 4. Best value: 0.0868192:  30%|███       | 6/20 [04:48<11:44, 50.35s/it]

Train wi info: {'gini': np.float64(0.266774138936524), 'ess': np.float64(10877.78832611166), 'max_wi': np.float64(27.164421753350563), 'min_wi': np.float64(2.6682511240600634e-05)}
actual reward: [0.07917654]
{'gini': np.float64(0.32991355551166673), 'ess': np.float64(6637.784327479013), 'max_wi': np.float64(20.201438846672463), 'min_wi': np.float64(2.4272643036835295e-07)}
Estimated reward: 0.087627
Cross-validated error: 0.001935
Final score CI (reward +- 2*error): [0.083758, 0.091497]
Standard error: 0.000247
Final t_dist CI (reward +- t_0.975*se_hat): [0.087144, 0.088111]
[I 2025-10-29 23:55:35,465] Trial 5 finished with value: 0.08375765060293414 and parameters: {'lr': 0.026124087373182926, 'num_epochs': 9, 'batch_size': 64, 'num_neighbors': 13, 'lr_decay': 0.9870213421637644}. Best is trial 4 with value: 0.08681922701747848.

Trial 6 started


Best trial: 4. Best value: 0.0868192:  35%|███▌      | 7/20 [05:34<10:32, 48.69s/it]

Train wi info: {'gini': np.float64(0.019373589246858572), 'ess': np.float64(14980.51601782322), 'max_wi': np.float64(1.1957268284896594), 'min_wi': np.float64(0.7389755919182543)}
actual reward: [0.08639557]
{'gini': np.float64(0.022918515660519806), 'ess': np.float64(9982.46402011462), 'max_wi': np.float64(1.1800781027005995), 'min_wi': np.float64(0.7502091079634805)}
Estimated reward: 0.088979
Cross-validated error: 0.001281
Final score CI (reward +- 2*error): [0.086418, 0.091541]
Standard error: 0.000265
Final t_dist CI (reward +- t_0.975*se_hat): [0.088460, 0.089499]
[I 2025-10-29 23:56:20,737] Trial 6 finished with value: 0.08641802320607145 and parameters: {'lr': 0.006180323157307555, 'num_epochs': 8, 'batch_size': 256, 'num_neighbors': 10, 'lr_decay': 0.898683314806597}. Best is trial 4 with value: 0.08681922701747848.

Trial 7 started


Best trial: 4. Best value: 0.0868192:  40%|████      | 8/20 [06:27<09:59, 49.99s/it]

Train wi info: {'gini': np.float64(0.09869017859189486), 'ess': np.float64(14482.611319872869), 'max_wi': np.float64(3.0906133959094606), 'min_wi': np.float64(0.008818204240920073)}
actual reward: [0.08537239]
{'gini': np.float64(0.11625207181899987), 'ess': np.float64(9540.605749693743), 'max_wi': np.float64(3.2096970176523443), 'min_wi': np.float64(0.012900998398793037)}
Estimated reward: 0.089288
Cross-validated error: 0.001363
Final score CI (reward +- 2*error): [0.086562, 0.092015]
Standard error: 0.000277
Final t_dist CI (reward +- t_0.975*se_hat): [0.088745, 0.089831]
[I 2025-10-29 23:57:13,507] Trial 7 finished with value: 0.08656161056433324 and parameters: {'lr': 0.011129518070650704, 'num_epochs': 6, 'batch_size': 64, 'num_neighbors': 8, 'lr_decay': 0.9177275850869129}. Best is trial 4 with value: 0.08681922701747848.

Trial 8 started


Best trial: 8. Best value: 0.0869836:  45%|████▌     | 9/20 [07:18<09:15, 50.47s/it]

Train wi info: {'gini': np.float64(0.0003246621410081315), 'ess': np.float64(14999.994611167942), 'max_wi': np.float64(1.0038646979243788), 'min_wi': np.float64(0.9970172624330663)}
actual reward: [0.08647528]
{'gini': np.float64(0.00037350009483934235), 'ess': np.float64(9999.995354919101), 'max_wi': np.float64(1.0038646979243788), 'min_wi': np.float64(0.9952511029909924)}
Estimated reward: 0.089526
Cross-validated error: 0.001271
Final score CI (reward +- 2*error): [0.086984, 0.092069]
Standard error: 0.000335
Final t_dist CI (reward +- t_0.975*se_hat): [0.088871, 0.090182]
[I 2025-10-29 23:58:05,032] Trial 8 finished with value: 0.08698357938820662 and parameters: {'lr': 0.0002519534437193718, 'num_epochs': 10, 'batch_size': 256, 'num_neighbors': 3, 'lr_decay': 0.8966316258213975}. Best is trial 8 with value: 0.08698357938820662.

Trial 9 started


Best trial: 8. Best value: 0.0869836:  50%|█████     | 10/20 [08:09<08:26, 50.66s/it]

Train wi info: {'gini': np.float64(0.19371366793190234), 'ess': np.float64(13197.704571143719), 'max_wi': np.float64(5.237285804876578), 'min_wi': np.float64(0.0023176321956485603)}
actual reward: [0.08339717]
{'gini': np.float64(0.22619749808433412), 'ess': np.float64(8445.236271261088), 'max_wi': np.float64(5.672139104956475), 'min_wi': np.float64(0.0020508068788459757)}
Estimated reward: 0.089125
Cross-validated error: 0.001746
Final score CI (reward +- 2*error): [0.085633, 0.092618]
Standard error: 0.000279
Final t_dist CI (reward +- t_0.975*se_hat): [0.088579, 0.089672]
[I 2025-10-29 23:58:56,110] Trial 9 finished with value: 0.08563313308980049 and parameters: {'lr': 0.031152823807551817, 'num_epochs': 5, 'batch_size': 256, 'num_neighbors': 8, 'lr_decay': 0.9925438463490047}. Best is trial 8 with value: 0.08698357938820662.

Trial 10 started


Best trial: 10. Best value: 0.0871209:  55%|█████▌    | 11/20 [08:59<07:35, 50.57s/it]

Train wi info: {'gini': np.float64(0.00023217862677382253), 'ess': np.float64(14999.997253642347), 'max_wi': np.float64(1.0027316224890255), 'min_wi': np.float64(0.9972059934040124)}
actual reward: [0.08647545]
{'gini': np.float64(0.00024063278509661243), 'ess': np.float64(9999.998090637868), 'max_wi': np.float64(1.0027316224890255), 'min_wi': np.float64(0.997985661286202)}
Estimated reward: 0.089572
Cross-validated error: 0.001226
Final score CI (reward +- 2*error): [0.087121, 0.092024]
Standard error: 0.000334
Final t_dist CI (reward +- t_0.975*se_hat): [0.088918, 0.090227]
[I 2025-10-29 23:59:46,469] Trial 10 finished with value: 0.08712088636362494 and parameters: {'lr': 0.000125641767401922, 'num_epochs': 10, 'batch_size': 256, 'num_neighbors': 3, 'lr_decay': 0.952985330867308}. Best is trial 10 with value: 0.08712088636362494.

Trial 11 started


Best trial: 11. Best value: 0.0872957:  60%|██████    | 12/20 [09:50<06:45, 50.67s/it]

Train wi info: {'gini': np.float64(0.000166805015262277), 'ess': np.float64(14999.998584535131), 'max_wi': np.float64(1.0019233143301158), 'min_wi': np.float64(0.9979695311942451)}
actual reward: [0.08647553]
{'gini': np.float64(0.00017200638710346166), 'ess': np.float64(9999.999016371836), 'max_wi': np.float64(1.001669038000072), 'min_wi': np.float64(0.9980110906217312)}
Estimated reward: 0.089582
Cross-validated error: 0.001143
Final score CI (reward +- 2*error): [0.087296, 0.091868]
Standard error: 0.000335
Final t_dist CI (reward +- t_0.975*se_hat): [0.088926, 0.090237]
[I 2025-10-30 00:00:37,390] Trial 11 finished with value: 0.0872957294752143 and parameters: {'lr': 0.00010043902994098023, 'num_epochs': 10, 'batch_size': 256, 'num_neighbors': 3, 'lr_decay': 0.9531976773550543}. Best is trial 11 with value: 0.0872957294752143.

Trial 12 started


Best trial: 11. Best value: 0.0872957:  65%|██████▌   | 13/20 [10:41<05:55, 50.80s/it]

Train wi info: {'gini': np.float64(0.0018998752456117725), 'ess': np.float64(14999.821595102569), 'max_wi': np.float64(1.0160859565685534), 'min_wi': np.float64(0.9800510564683026)}
actual reward: [0.08647476]
{'gini': np.float64(0.0019788900679127757), 'ess': np.float64(9999.872513847171), 'max_wi': np.float64(1.0192017957278419), 'min_wi': np.float64(0.9835836011628316)}
Estimated reward: 0.089197
Cross-validated error: 0.001149
Final score CI (reward +- 2*error): [0.086900, 0.091494]
Standard error: 0.000304
Final t_dist CI (reward +- t_0.975*se_hat): [0.088601, 0.089793]
[I 2025-10-30 00:01:28,488] Trial 12 finished with value: 0.08689969880618965 and parameters: {'lr': 0.000515675336563925, 'num_epochs': 8, 'batch_size': 256, 'num_neighbors': 5, 'lr_decay': 0.9587997765031546}. Best is trial 11 with value: 0.0872957294752143.

Trial 13 started


Best trial: 11. Best value: 0.0872957:  70%|███████   | 14/20 [11:33<05:05, 50.90s/it]

Train wi info: {'gini': np.float64(0.0017269871370163103), 'ess': np.float64(14999.852666775007), 'max_wi': np.float64(1.0147600634812677), 'min_wi': np.float64(0.9839214511281156)}
actual reward: [0.0864742]
{'gini': np.float64(0.0018451478322954883), 'ess': np.float64(9999.890443594264), 'max_wi': np.float64(1.0147600634812677), 'min_wi': np.float64(0.9860685426206565)}
Estimated reward: 0.089255
Cross-validated error: 0.001226
Final score CI (reward +- 2*error): [0.086803, 0.091707]
Standard error: 0.000303
Final t_dist CI (reward +- t_0.975*se_hat): [0.088661, 0.089849]
[I 2025-10-30 00:02:19,598] Trial 13 finished with value: 0.08680307131481749 and parameters: {'lr': 0.0005531281768862965, 'num_epochs': 10, 'batch_size': 256, 'num_neighbors': 5, 'lr_decay': 0.951464835521896}. Best is trial 11 with value: 0.0872957294752143.

Trial 14 started


Best trial: 11. Best value: 0.0872957:  75%|███████▌  | 15/20 [12:24<04:15, 51.03s/it]

Train wi info: {'gini': np.float64(0.00039261648968448343), 'ess': np.float64(14999.992301439011), 'max_wi': np.float64(1.0033603049557995), 'min_wi': np.float64(0.9963223429281028)}
actual reward: [0.08647533]
{'gini': np.float64(0.00039727483267905963), 'ess': np.float64(9999.994833452398), 'max_wi': np.float64(1.0033538650333735), 'min_wi': np.float64(0.9969643027636744)}
Estimated reward: 0.089204
Cross-validated error: 0.001314
Final score CI (reward +- 2*error): [0.086575, 0.091833]
Standard error: 0.000306
Final t_dist CI (reward +- t_0.975*se_hat): [0.088605, 0.089804]
[I 2025-10-30 00:03:10,943] Trial 14 finished with value: 0.08657547789488285 and parameters: {'lr': 0.00010058690207099576, 'num_epochs': 7, 'batch_size': 128, 'num_neighbors': 5, 'lr_decay': 0.9527688406545092}. Best is trial 11 with value: 0.0872957294752143.

Trial 15 started


Best trial: 11. Best value: 0.0872957:  80%|████████  | 16/20 [13:14<03:23, 50.85s/it]

Train wi info: {'gini': np.float64(0.00295356248554486), 'ess': np.float64(14999.564565819837), 'max_wi': np.float64(1.0294544034205535), 'min_wi': np.float64(0.9707744838792892)}
actual reward: [0.0864743]
{'gini': np.float64(0.003111584665889985), 'ess': np.float64(9999.681997664204), 'max_wi': np.float64(1.0261010202705239), 'min_wi': np.float64(0.9735902516610155)}
Estimated reward: 0.088998
Cross-validated error: 0.001154
Final score CI (reward +- 2*error): [0.086691, 0.091305]
Standard error: 0.000292
Final t_dist CI (reward +- t_0.975*se_hat): [0.088426, 0.089569]
[I 2025-10-30 00:04:01,380] Trial 15 finished with value: 0.08669059463960739 and parameters: {'lr': 0.0007399173823907938, 'num_epochs': 10, 'batch_size': 256, 'num_neighbors': 6, 'lr_decay': 0.9687463669029919}. Best is trial 11 with value: 0.0872957294752143.

Trial 16 started


Best trial: 11. Best value: 0.0872957:  85%|████████▌ | 17/20 [13:57<02:24, 48.25s/it]

Train wi info: {'gini': np.float64(0.00015461642102315327), 'ess': np.float64(14999.998781802591), 'max_wi': np.float64(1.001656128999762), 'min_wi': np.float64(0.9982788171818029)}
actual reward: [0.0864754]
{'gini': np.float64(0.00016477590448724246), 'ess': np.float64(9999.99910806363), 'max_wi': np.float64(1.0019694447044774), 'min_wi': np.float64(0.9985373974031988)}
Estimated reward: 0.089565
Cross-validated error: 0.001267
Final score CI (reward +- 2*error): [0.087030, 0.092100]
Standard error: 0.000334
Final t_dist CI (reward +- t_0.975*se_hat): [0.088910, 0.090220]
[I 2025-10-30 00:04:43,576] Trial 16 finished with value: 0.08703011847482163 and parameters: {'lr': 0.00010690486128840005, 'num_epochs': 8, 'batch_size': 256, 'num_neighbors': 3, 'lr_decay': 0.9277481505916149}. Best is trial 11 with value: 0.0872957294752143.

Trial 17 started


Best trial: 11. Best value: 0.0872957:  90%|█████████ | 18/20 [14:48<01:38, 49.08s/it]

Train wi info: {'gini': np.float64(0.0009255675386077663), 'ess': np.float64(14999.957599526353), 'max_wi': np.float64(1.0094133391722646), 'min_wi': np.float64(0.9925276545204944)}
actual reward: [0.08647506]
{'gini': np.float64(0.0009584137391483161), 'ess': np.float64(9999.970023239917), 'max_wi': np.float64(1.0082652300004946), 'min_wi': np.float64(0.9914612353098216)}
Estimated reward: 0.088986
Cross-validated error: 0.001239
Final score CI (reward +- 2*error): [0.086508, 0.091464]
Standard error: 0.000293
Final t_dist CI (reward +- t_0.975*se_hat): [0.088412, 0.089560]
[I 2025-10-30 00:05:34,604] Trial 17 finished with value: 0.08650802093945432 and parameters: {'lr': 0.00027225300053085474, 'num_epochs': 9, 'batch_size': 256, 'num_neighbors': 6, 'lr_decay': 0.9743124349743256}. Best is trial 11 with value: 0.0872957294752143.

Trial 18 started


Best trial: 11. Best value: 0.0872957:  95%|█████████▌| 19/20 [15:38<00:49, 49.46s/it]

Train wi info: {'gini': np.float64(0.004242358079451612), 'ess': np.float64(14999.115156063288), 'max_wi': np.float64(1.0350545581678174), 'min_wi': np.float64(0.9560888271347018)}
actual reward: [0.08647202]
{'gini': np.float64(0.004179398382807423), 'ess': np.float64(9999.434529961114), 'max_wi': np.float64(1.0323227147710987), 'min_wi': np.float64(0.9656071095239178)}
Estimated reward: 0.089230
Cross-validated error: 0.001276
Final score CI (reward +- 2*error): [0.086678, 0.091781]
Standard error: 0.000317
Final t_dist CI (reward +- t_0.975*se_hat): [0.088608, 0.089852]
[I 2025-10-30 00:06:24,934] Trial 18 finished with value: 0.0866781603445874 and parameters: {'lr': 0.001211098479460515, 'num_epochs': 3, 'batch_size': 512, 'num_neighbors': 4, 'lr_decay': 0.9354736847647402}. Best is trial 11 with value: 0.0872957294752143.

Trial 19 started


Best trial: 11. Best value: 0.0872957: 100%|██████████| 20/20 [16:28<00:00, 49.44s/it]

Train wi info: {'gini': np.float64(0.026892551826398162), 'ess': np.float64(14963.437547063657), 'max_wi': np.float64(1.3463478538445457), 'min_wi': np.float64(0.643063360098534)}
actual reward: [0.08640608]
{'gini': np.float64(0.028284390029761717), 'ess': np.float64(9973.311320912575), 'max_wi': np.float64(1.2776541469697276), 'min_wi': np.float64(0.643063360098534)}
Estimated reward: 0.088900
Cross-validated error: 0.001192
Final score CI (reward +- 2*error): [0.086515, 0.091285]
Standard error: 0.000257
Final t_dist CI (reward +- t_0.975*se_hat): [0.088396, 0.089404]
[I 2025-10-30 00:07:15,368] Trial 19 finished with value: 0.08651518692572786 and parameters: {'lr': 0.003224914608873256, 'num_epochs': 7, 'batch_size': 128, 'num_neighbors': 11, 'lr_decay': 0.996574043659471}. Best is trial 11 with value: 0.0872957294752143.





Num samples is 10000
{'gini': np.float64(0.4940130647098055), 'ess': np.float64(2945.110828404994), 'max_wi': np.float64(95.5346006507542), 'min_wi': np.float64(0.008504698012894578)}


Unnamed: 0,policy_rewards,ipw,reg_dm,conv_dm,conv_dr,conv_sndr,action_diff_to_real,action_delta,context_diff_to_real,context_delta
0,0.08647502,0.0876,0.0875267,0.09086329,0.08530194,0.08530194,0.80232812,0.0,0.84032376,0.0
15000,0.08647562,0.07880134,0.08684865,0.08955903,0.08058818,0.0805882,0.80232088,9.224e-05,0.84031447,0.00013923


In [None]:
df4[['policy_rewards', 'ipw', 'reg_dm', 'conv_dm', 'conv_dr', 'conv_sndr', 'action_diff_to_real', 'action_delta', 'context_diff_to_real', 'context_delta']]

Unnamed: 0,policy_rewards,ipw,reg_dm,conv_dm,conv_dr,conv_sndr,action_diff_to_real,action_delta,context_diff_to_real,context_delta
0,0.08610747,0.086,0.08594797,0.08791196,0.08599389,0.08599389,0.7569287,0.0,0.87627132,0.0
15000,0.08610291,0.0778304,0.08748281,0.09035353,0.07790566,0.07790813,0.75684444,0.00513398,0.87628721,0.0074503


### Poicy Via argmax(r_hat - error_hat) through cross validation

In [None]:
df4[['policy_rewards', 'ipw', 'reg_dm', 'conv_dm', 'conv_dr', 'conv_sndr', 'action_diff_to_real', 'action_delta', 'context_diff_to_real', 'context_delta']]

Unnamed: 0,policy_rewards,ipw,reg_dm,conv_dm,conv_dr,conv_sndr,action_diff_to_real,action_delta,context_diff_to_real,context_delta
0,0.08610747,0.086,0.08594797,0.08791196,0.08599389,0.08599389,0.7569287,0.0,0.87627132,0.0
15000,0.08610291,0.0778304,0.08748281,0.09035353,0.07790566,0.07790813,0.75684444,0.00513398,0.87628721,0.0074503


### Policy Via using actual policy value

In [None]:
# Show the performance metrics
df4[['policy_rewards', 'ipw', 'reg_dm', 'conv_dm', 'conv_dr', 'conv_sndr', 'action_diff_to_real', 'action_delta', 'context_diff_to_real', 'context_delta']]


Unnamed: 0,policy_rewards,ipw,reg_dm,conv_dm,conv_dr,conv_sndr,action_diff_to_real,action_delta,context_diff_to_real,context_delta
0,0.08610747,0.086,0.08594797,0.08791196,0.08599389,0.08599389,0.7569287,0.0,0.87627132,0.0
15000,0.08610291,0.0778304,0.08748281,0.09035353,0.07790566,0.07790813,0.75684444,0.00513398,0.87628721,0.0074503
