In [1]:
import warnings
warnings.filterwarnings("ignore")
from copy import deepcopy
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
import sys

sys.path.append("/code")

from tqdm import tqdm
import torch
import time
# device = torch.device('cpu')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
# import gym
# import recogym

import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim

torch.backends.cudnn.benchmark = torch.cuda.is_available()
if torch.cuda.is_available():
    torch.set_float32_matmul_precision("high")  # TF32 = big speedup on Ada


from sklearn.utils import check_random_state

# implementing OPE of the IPWLearner using synthetic bandit data
from sklearn.linear_model import LogisticRegression

import matplotlib.pyplot as plt

from scipy.special import softmax
import optuna
# from memory_profiler import profile


from estimators import (
    DirectMethod as DM
)

from simulation_utils import (
    eval_policy,
    generate_dataset,
    create_simulation_data_from_pi,
    get_train_data,
    get_opl_results_dict,
    CustomCFDataset,
    calc_reward,
    get_weights_info
)

from models import (    
    LinearCFModel,
    NeighborhoodModel,
    BPRModel, 
    RegressionModel
)

from training_utils import (
    train,
    validation_loop, 
    cv_score_model
 )

from custom_losses import (
    SNDRPolicyLoss,
    IPWPolicyLoss, 
    KLPolicyLoss
    )

random_state=12345
random_ = check_random_state(random_state)

pd.options.display.float_format = '{:,.8f}'.format

Using device: cuda
Using device: cuda
Using device: cuda


In [2]:
def get_trial_results(
    our_x, 
    our_a, 
    emb_x, 
    emb_a, 
    original_x, 
    original_a, 
    dataset, 
    val_data, 
    original_policy_prob, 
    neighberhoodmodel, 
    regression_model, 
    dm
):
    t0 = time.time()
    policy = np.expand_dims(softmax(our_x @ our_a.T, axis=1), -1)
    policy_reward = calc_reward(dataset, policy)
    eval_metrics = eval_policy(neighberhoodmodel, val_data, original_policy_prob, policy)
    action_diff_to_real = np.sqrt(np.mean((emb_a - our_a) ** 2))
    action_delta = np.sqrt(np.mean((original_a - our_a) ** 2))
    context_diff_to_real = np.sqrt(np.mean((emb_x - our_x) ** 2))
    context_delta = np.sqrt(np.mean((original_x - our_x) ** 2))

    row = np.concatenate([
        np.atleast_1d(policy_reward),
        np.atleast_1d(eval_metrics),
        np.atleast_1d(action_diff_to_real),
        np.atleast_1d(action_delta),
        np.atleast_1d(context_diff_to_real),
        np.atleast_1d(context_delta)
    ])
    reg_dm = dm.estimate_policy_value(policy[val_data['x_idx']], regression_model.predict(val_data['x']))
    reg_results = np.array([reg_dm])
    conv_results = np.array([row])
    print(f"Evaluation total results time: {time.time() - t0} seconds")
    return get_opl_results_dict(reg_results, conv_results)

## `trainer_trial` Function

This function runs policy learning experiments using offline bandit data and evaluates various estimators.

### Parameters
- **num_runs** (int): Number of experimental runs per training size
- **num_neighbors** (int): Number of neighbors to consider in the neighborhood model
- **num_rounds_list** (list): List of training set sizes to evaluate
- **dataset** (dict): Contains dataset information including embeddings, action probabilities, and reward probabilities
- **batch_size** (int): Batch size for training the policy model
- **num_epochs** (int): Number of training epochs for each experiment
- **lr** (float, default=0.001): Learning rate for the optimizer

### Process Flow
1. Initializes result structures and retrieval models
2. For each training size in `num_rounds_list`:
   - Creates a uniform logging policy and simulates data
   - Generates training data for offline learning
   - Fits regression and neighborhood models for reward estimation
   - Initializes and trains a counterfactual policy model
   - Evaluates policy performance using various estimators
   - Collects metrics on policy reward and embedding quality

### Returns
- **DataFrame**: Results table with rows indexed by training size and columns for various metrics:
  - `policy_rewards`: True expected reward of the learned policy
  - Various estimator errors (`ipw`, `reg_dm`, `conv_dm`, `conv_dr`, `conv_sndr`)
  - Variance metrics for each estimator
  - Embedding quality metrics comparing learned representations to ground truth

### Implementation Notes
- Uses uniform random logging policy for collecting offline data
- Employs Self-Normalized Doubly Robust (SNDR) policy learning
- Measures embedding quality via RMSE to original/ground truth embeddings

In [3]:
def trainer_trial(
    num_runs,
    num_neighbors,
    train_sizes,
    dataset,
    batch_size,
    val_size=2000,
    n_trials=10,    
    prev_best_params=None
):

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    torch.backends.cudnn.benchmark = torch.cuda.is_available()
    if torch.cuda.is_available():
        torch.set_float32_matmul_precision("high")

    dm = DM()
    results = {}

    our_x, our_a = dataset["our_x"], dataset["our_a"]
    emb_x, emb_a = dataset["emb_x"], dataset["emb_a"]

    original_x, original_a = dataset["original_x"], dataset["original_a"]
    n_users, n_actions, emb_dim = dataset["n_users"], dataset["n_actions"], dataset["emb_dim"]

    all_user_indices = np.arange(n_users, dtype=np.int64)

    def T(x):
        return torch.as_tensor(x, device=device, dtype=torch.float32)

    def _mean_dict(dicts):
        """
        Robust mean over a list of dicts with numeric/scalar/1D-array values.
        Returns a single dict with elementwise means.
        """
        if not dicts:
            return {}
        keys = dicts[0].keys()
        out = {}
        for k in keys:
            vals = [d[k] for d in dicts if k in d]
            # try to convert each to np.array and average
            arrs = [np.asarray(v) for v in vals]
            # broadcast to same shape if scalars/1D
            stacked = np.stack(arrs, axis=0)
            out[k] = np.mean(stacked, axis=0)
        return out

    # ===== unpack dataset (keep originals safe) =====
    our_x_orig, our_a_orig = our_x, our_a
    emb_x, emb_a = emb_x, emb_a
    original_x, original_a = original_x, original_a
    n_users, n_actions, emb_dim = n_users, n_actions, emb_dim
    all_user_indices = np.arange(n_users, dtype=np.int64)

    dm = DM()
    results = {}
    best_hyperparams_by_size = {}
    last_best_params = prev_best_params if prev_best_params is not None else None

    # ===== baseline (sample size = 0) using get_trial_results =====
    pi_0 = softmax(our_x_orig @ our_a_orig.T, axis=1)
    original_policy_prob = np.expand_dims(pi_0, -1)

    simulation_data = create_simulation_data_from_pi(
        dataset, pi_0, val_size, random_state=0
    )

    # use same data for train/val just to generate the baseline row
    train_data = get_train_data(n_actions, val_size, simulation_data, np.arange(val_size), our_x_orig)
    val_data   = get_train_data(n_actions, val_size, simulation_data, np.arange(val_size), our_x_orig)
    t0 = time.time()
    regression_model = RegressionModel(
        n_actions=n_actions, action_context=our_x_orig,
        base_model=LogisticRegression(random_state=12345)
    )

    regression_model.fit(train_data['x'], train_data['a'], train_data['r'])
    print(f"Baseline regression model fit time: {time.time() - t0} seconds")

    t0 = time.time()
    neighberhoodmodel = NeighborhoodModel(
        train_data['x_idx'], train_data['a'],
        our_a_orig, our_x_orig, train_data['r'],
        num_neighbors=num_neighbors
    )
    print(f"Baseline neighborhood model fit time: {time.time() - t0} seconds")
    
    # baseline row produced via get_trial_results
    results[0] = get_trial_results(
        our_x_orig, our_a_orig, emb_x, emb_a, original_x, original_a,
        dataset, val_data, original_policy_prob,
        neighberhoodmodel, regression_model, dm
    )

    # ===== main loop over training sizes =====
    for train_size in train_sizes:

        # we’ll collect per-run trial dicts generated by get_trial_results
        trial_dicts_this_size = []
        best_hyperparams_by_size[train_size] = {}

        # --- prepare a resampling for Optuna’s objective (shared loaders built per-run inside objective) ---
        # We’ll do Optuna per-run (fresh resample + search), then final fit with best params, then get_trial_results.

        for run in range(num_runs):

            # --- resample for this run ---
            pi_0 = softmax(our_x_orig @ our_a_orig.T, axis=1)
            original_policy_prob = np.expand_dims(pi_0, -1)

            simulation_data = create_simulation_data_from_pi(
                dataset, pi_0, train_size + val_size,
                random_state=(run + 1) * (train_size + 17)
            )
                        
            idx_train = np.arange(train_size)
            train_data = get_train_data(n_actions, train_size, simulation_data, idx_train, our_x_orig)
            val_idx   = np.arange(val_size) + train_size
            val_data  = get_train_data(n_actions, val_size, simulation_data, val_idx, our_x_orig)

            num_workers = 4 if torch.cuda.is_available() else 0

            cf_dataset = CustomCFDataset(
                train_data['x_idx'], train_data['a'], train_data['r'], original_policy_prob
            )

            # val_loader = DataLoader(
            #     val_dataset, batch_size=val_size, shuffle=False,
            #     pin_memory=torch.cuda.is_available(),
            #     num_workers=num_workers, persistent_workers=bool(num_workers)
            # )


            # --- Optuna objective bound to this run's data ---
            def objective(trial):
                print()
                print(f"Trial {trial.number} started")
                lr = trial.suggest_float("lr", 1e-4, 1e-1, log=True)
                epochs = trial.suggest_int("num_epochs", 1, 10)
                trial_batch_size = trial.suggest_categorical("batch_size", [64, 128, 256, 512])
                trial_num_neighbors = trial.suggest_int("num_neighbors", 3, 15)
                lr_decay = trial.suggest_float("lr_decay", 0.8, 1.0)

                trial_neigh_model = NeighborhoodModel(
                    train_data['x_idx'], train_data['a'],
                    our_a_orig, our_x_orig, train_data['r'],
                    num_neighbors=trial_num_neighbors
                )

                trial_scores_all = torch.as_tensor(
                    trial_neigh_model.predict(all_user_indices),
                    device=device, dtype=torch.float32
                )

                trial_model = LinearCFModel(
                    n_users, n_actions, emb_dim,
                    initial_user_embeddings=T(our_x_orig),
                    initial_actions_embeddings=T(our_a_orig)
                ).to(device)

                assert (not torch.cuda.is_available()) or next(trial_model.parameters()).is_cuda

                final_train_loader = DataLoader(
                    cf_dataset, batch_size=trial_batch_size, shuffle=True,
                    pin_memory=torch.cuda.is_available(),
                    num_workers=num_workers, persistent_workers=bool(num_workers)
                )

                current_lr = lr
                for epoch in range(epochs):
                    if epoch > 0:
                        current_lr *= lr_decay
                        
                    train(
                        trial_model, final_train_loader, trial_scores_all,
                        criterion=KLPolicyLoss(), num_epochs=1, lr=current_lr, device=str(device)
                    )

                trial_x, trial_a = trial_model.get_params()
                trial_x = trial_x.detach().cpu().numpy()
                trial_a = trial_a.detach().cpu().numpy()

                pi_i = softmax(trial_x @ trial_a.T, axis=1)
                train_actions = train_data['a']
                train_users = train_data['x_idx']

                print("Train wi info: {}".format(get_weights_info(pi_i[train_users, train_actions], original_policy_prob[train_users, train_actions])))
                print(f"actual reward: {calc_reward(dataset, np.expand_dims(pi_i, -1))}")

                # print(get_weights_info(pi_i, original_policy_prob))
                # validation reward for selection
                return cv_score_model(val_data, trial_scores_all, pi_i)


            # --- run Optuna for this run ---
            study = optuna.create_study(direction="maximize")
            
            if last_best_params is not None:
                study.enqueue_trial(last_best_params)

            study.optimize(objective, n_trials=n_trials, show_progress_bar=True)

            best_params = study.best_params
            last_best_params = best_params  # optional warm-start to next run
            best_hyperparams_by_size[train_size][run] = {
                "params": best_params,
                "reward": study.best_value
            }


            # --- final training with best params on this run’s data ---
            regression_model = RegressionModel(
                n_actions=n_actions, action_context=our_x_orig,
                base_model=LogisticRegression(random_state=12345)
            )
            regression_model.fit(
                train_data['x'], train_data['a'], train_data['r'],
                original_policy_prob[train_data['x_idx'], train_data['a']].squeeze()
            )

            neighberhoodmodel = NeighborhoodModel(
                train_data['x_idx'], train_data['a'],
                our_a_orig, our_x_orig, train_data['r'],
                num_neighbors=best_params['num_neighbors']
            )
            scores_all = torch.as_tensor(
                neighberhoodmodel.predict(all_user_indices),
                device=device, dtype=torch.float32
            )

            model = LinearCFModel(
                n_users, n_actions, emb_dim,
                initial_user_embeddings=T(our_x_orig),
                initial_actions_embeddings=T(our_a_orig)
            ).to(device)
            assert (not torch.cuda.is_available()) or next(model.parameters()).is_cuda

            train_loader = DataLoader(
                cf_dataset, batch_size=batch_size, shuffle=True,
                pin_memory=torch.cuda.is_available(),
                num_workers=num_workers, persistent_workers=bool(num_workers)
            )

            current_lr = best_params['lr']
            for epoch in range(best_params['num_epochs']):
                if epoch > 0:
                    current_lr *= best_params['lr_decay']
                train(
                    model, train_loader, scores_all,
                    criterion=KLPolicyLoss(), num_epochs=1, lr=current_lr, device=str(device)
                )

            # learned embeddings (do NOT overwrite originals)
            learned_x_t, learned_a_t = model.get_params()
            learned_x = learned_x_t.detach().cpu().numpy()
            learned_a = learned_a_t.detach().cpu().numpy()

            # --- produce the per-run result via get_trial_results ---
            trial_res = get_trial_results(
                learned_x, learned_a,          # learned (policy) embeddings
                emb_x, emb_a,                  # ground-truth embedding refs
                original_x, original_a,        # original clean refs
                dataset,
                val_data,                      # use this run's val split
                original_policy_prob,
                neighberhoodmodel,
                regression_model,
                dm
            )

            trial_dicts_this_size.append(trial_res)

            # memory hygiene
            torch.cuda.empty_cache()

        # === aggregate per-run results (mean) and store under this train_size ===
        results[train_size] = _mean_dict(trial_dicts_this_size)

    return pd.DataFrame.from_dict(results, orient='index'), best_hyperparams_by_size

## Learning

We will run several simulations on a generated dataset, the dataset is generated like this:
$$ \text{We have users U and actions A } u_i \sim N(0, I_{emb_dim}) \ a_i \sim N(0, I_{emb_dim})$$
$$ p_{ij} = 1 / (5 + e^{-(u_i.T a_j)}) $$
$$r_{ij} \sim Bin(p_{ij})$$

We have a policy $\pi$
and it's ground truth reward is calculated by
$$R_{gt} = \sum_{i}{\sum_{j}{\pi_{ij} * p_{ij}}} $$

Our parameters for the dataset will be
$$EmbDim = 5$$
$$NumActions= 150$$
$$NumUsers = 150$$
$$NeighborhoodSize = 6$$

to learn a new policy from $\pi$ we will sample from:
$$\pi_{start} = (1-\epsilon)*\pi + \epsilon * \pi_{random}$$

In [4]:
dataset_params = dict(
                    n_actions= 500,
                    n_users = 500,
                    emb_dim = 16,
                    # sigma = 0.1,
                    eps = 0.6, # this is the epsilon for the noise in the ground truth policy representation
                    ctr = 0.1
                    )

train_dataset = generate_dataset(dataset_params)

Random Item CTR: 0.07066414727263938
Optimal greedy CTR: 0.09999926940951757
Second Best greedy CTR: 0.0980913477695915
Optimal Stochastic CTR: 0.09995326955796031
second Best Stochastic CTR: 0.08595012935428775
Our Initial CTR: 0.08610747363354625


In [5]:
num_runs = 1
batch_size = 200
num_neighbors = 6
n_trials_for_optuna = 20
# num_rounds_list = [500, 1000, 2000, 10000, 20000]
# num_rounds_list = [500, 1000, 2000]
num_rounds_list = [15000]


# Manually define your best parameters
best_params_to_use = {
    "lr": 0.096,  # Learning rate
    "num_epochs": 5,  # Number of training epochs
    "batch_size": 64,  # Batch size for training
    "num_neighbors": 8,  # Number of neighbors for neighborhood model
    "lr_decay": 0.85  # Learning rate decay factor
}

### 1

$$emb = 0.7 * gt + 0.3 * noise$$
$$lr = 0.005$$
$$n_{epochs} = 1$$
$$BatchSize=50$$

In [6]:
print("Value of num_rounds_list:", num_rounds_list)

# Run the optimization
df4, best_hyperparams_by_size = trainer_trial(num_runs, num_neighbors, num_rounds_list, train_dataset, batch_size, val_size=10000, n_trials=n_trials_for_optuna, prev_best_params=best_params_to_use)

# # Print best hyperparameters for each training size
# print("\n=== BEST HYPERPARAMETERS BY TRAINING SIZE ===")
# for train_size, params in best_hyperparams_by_size.items():
#     print(f"\nTraining Size: {train_size}")
#     # print(f"Best Reward: {params['reward']:.6f}")
#     print("Parameters:")
#     for param_name, value in params['params'].items():
#         print(f"  {param_name}: {value}")
# print("===========================\n")

# Show the performance metrics
df4[['policy_rewards', 'ipw', 'reg_dm', 'conv_dm', 'conv_dr', 'conv_sndr', 'action_diff_to_real', 'action_delta', 'context_diff_to_real', 'context_delta']]

Value of num_rounds_list: [15000]
Simulation time for 10000 samples: 0.026812314987182617 seconds
Baseline regression model fit time: 0.08083295822143555 seconds
Baseline neighborhood model fit time: 29.70861315727234 seconds
Num samples is 10000
{'gini': np.float64(0.48357763099887546), 'ess': np.float64(3891.817348806044), 'max_wi': np.float64(45.00658812185537), 'min_wi': np.float64(0.009939141556591315)}
Eval time: 0.14032912254333496 seconds


[I 2025-11-02 21:16:12,715] A new study created in memory with name: no-name-092da759-c53a-4fbb-ba31-23320c36d88f


Evaluation total results time: 0.48679256439208984 seconds
Simulation time for 25000 samples: 0.05918002128601074 seconds


  0%|          | 0/20 [00:00<?, ?it/s]


Trial 0 started


Best trial: 0. Best value: 0.0740002:   5%|▌         | 1/20 [00:48<15:20, 48.47s/it]

Train wi info: {'gini': np.float64(0.9793971738866526), 'ess': np.float64(333.372630446925), 'max_wi': np.float64(125.15212075584832), 'min_wi': np.float64(0.0)}
actual reward: [0.07925007]
{'gini': np.float64(0.9819466106136256), 'ess': np.float64(188.01820581525834), 'max_wi': np.float64(150.62402556631577), 'min_wi': np.float64(0.0)}
Estimated reward: 0.085377
Cross-validated error: 0.005688
Final score CI (reward +- 2*error): [0.074000, 0.096753]
Standard error: 0.018000
Final t_dist CI (reward +- t_0.975*se_hat): [0.050093, 0.120661]
[I 2025-11-02 21:17:01,179] Trial 0 finished with value: 0.07400019079960082 and parameters: {'lr': 0.096, 'num_epochs': 5, 'batch_size': 64, 'num_neighbors': 8, 'lr_decay': 0.85}. Best is trial 0 with value: 0.07400019079960082.

Trial 1 started


Best trial: 1. Best value: 0.0760622:  10%|█         | 2/20 [01:34<14:08, 47.14s/it]

Train wi info: {'gini': np.float64(0.17737837615189694), 'ess': np.float64(13357.798808032261), 'max_wi': np.float64(2.1064336578503227), 'min_wi': np.float64(0.45396182221710724)}
actual reward: [0.08622358]
{'gini': np.float64(0.17390570639824215), 'ess': np.float64(8953.512047798331), 'max_wi': np.float64(2.104711195715754), 'min_wi': np.float64(0.45590996812252915)}
Estimated reward: 0.078081
Cross-validated error: 0.001009
Final score CI (reward +- 2*error): [0.076062, 0.080100]
Standard error: 0.003026
Final t_dist CI (reward +- t_0.975*se_hat): [0.072149, 0.084013]
[I 2025-11-02 21:17:47,402] Trial 1 finished with value: 0.07606224650733553 and parameters: {'lr': 0.000203147885120411, 'num_epochs': 8, 'batch_size': 256, 'num_neighbors': 9, 'lr_decay': 0.970276082458785}. Best is trial 1 with value: 0.07606224650733553.

Trial 2 started


Best trial: 1. Best value: 0.0760622:  15%|█▌        | 3/20 [02:19<13:03, 46.09s/it]

Train wi info: {'gini': np.float64(0.994884652325549), 'ess': np.float64(29.806097584959147), 'max_wi': np.float64(679.9835476828017), 'min_wi': np.float64(6.03114702568029e-10)}
actual reward: [0.08584946]
{'gini': np.float64(0.996525943891357), 'ess': np.float64(13.075501523959316), 'max_wi': np.float64(1359.445831904793), 'min_wi': np.float64(7.836901796404198e-10)}
Estimated reward: 0.087901
Cross-validated error: 0.009150
Final score CI (reward +- 2*error): [0.069602, 0.106201]
Standard error: 0.017237
Final t_dist CI (reward +- t_0.975*se_hat): [0.054112, 0.121690]
[I 2025-11-02 21:18:32,243] Trial 2 finished with value: 0.06960166570328688 and parameters: {'lr': 0.0318992370553059, 'num_epochs': 4, 'batch_size': 512, 'num_neighbors': 10, 'lr_decay': 0.8645228111513739}. Best is trial 1 with value: 0.07606224650733553.

Trial 3 started


Best trial: 3. Best value: 0.0785964:  20%|██        | 4/20 [03:04<12:11, 45.74s/it]

Train wi info: {'gini': np.float64(0.9915221464712138), 'ess': np.float64(45.21605404624261), 'max_wi': np.float64(645.2898360659332), 'min_wi': np.float64(8.456532573302024e-19)}
actual reward: [0.08711094]
{'gini': np.float64(0.9954417441538072), 'ess': np.float64(10.349177273337641), 'max_wi': np.float64(1540.2565934275026), 'min_wi': np.float64(4.105368279879019e-19)}
Estimated reward: 0.096887
Cross-validated error: 0.009145
Final score CI (reward +- 2*error): [0.078596, 0.115177]
Standard error: 0.016766
Final t_dist CI (reward +- t_0.975*se_hat): [0.064022, 0.129751]
[I 2025-11-02 21:19:17,446] Trial 3 finished with value: 0.07859638752023185 and parameters: {'lr': 0.04375677880773217, 'num_epochs': 5, 'batch_size': 128, 'num_neighbors': 9, 'lr_decay': 0.8839056429392667}. Best is trial 3 with value: 0.07859638752023185.

Trial 4 started


Best trial: 4. Best value: 0.0860819:  25%|██▌       | 5/20 [04:00<12:21, 49.45s/it]

Train wi info: {'gini': np.float64(0.8321411374910748), 'ess': np.float64(1948.9526736273183), 'max_wi': np.float64(20.074169832157878), 'min_wi': np.float64(0.0017336741100679538)}
actual reward: [0.08762275]
{'gini': np.float64(0.8197969651054634), 'ess': np.float64(1421.7790200053373), 'max_wi': np.float64(20.074169832157878), 'min_wi': np.float64(0.0018656370927198221)}
Estimated reward: 0.091574
Cross-validated error: 0.002746
Final score CI (reward +- 2*error): [0.086082, 0.097066]
Standard error: 0.008118
Final t_dist CI (reward +- t_0.975*se_hat): [0.075661, 0.107487]
[I 2025-11-02 21:20:13,481] Trial 4 finished with value: 0.08608192327610659 and parameters: {'lr': 0.0038095991793125686, 'num_epochs': 4, 'batch_size': 256, 'num_neighbors': 14, 'lr_decay': 0.8385943194620185}. Best is trial 4 with value: 0.08608192327610659.

Trial 5 started


Best trial: 4. Best value: 0.0860819:  30%|███       | 6/20 [04:58<12:10, 52.17s/it]

Train wi info: {'gini': np.float64(0.9792978199813509), 'ess': np.float64(333.23011760768924), 'max_wi': np.float64(138.11334554209856), 'min_wi': np.float64(1.603016110565981e-38)}
actual reward: [0.08438878]
{'gini': np.float64(0.9938466602374377), 'ess': np.float64(3.7493102746049263), 'max_wi': np.float64(9905.614109242191), 'min_wi': np.float64(5.762679709468384e-36)}
Estimated reward: 0.039368
Cross-validated error: 0.005162
Final score CI (reward +- 2*error): [0.029043, 0.049693]
Standard error: 0.035687
Final t_dist CI (reward +- t_0.975*se_hat): [-0.030586, 0.109322]
[I 2025-11-02 21:21:10,928] Trial 5 finished with value: 0.0290428186608249 and parameters: {'lr': 0.08389984051920549, 'num_epochs': 2, 'batch_size': 64, 'num_neighbors': 9, 'lr_decay': 0.8630414106495686}. Best is trial 4 with value: 0.08608192327610659.

Trial 6 started


Best trial: 4. Best value: 0.0860819:  35%|███▌      | 7/20 [05:53<11:32, 53.28s/it]

Train wi info: {'gini': np.float64(0.08834247912355642), 'ess': np.float64(14573.470289752313), 'max_wi': np.float64(1.4710837331232836), 'min_wi': np.float64(0.7019665857425947)}
actual reward: [0.08616383]
{'gini': np.float64(0.08632367448279447), 'ess': np.float64(9730.78667682155), 'max_wi': np.float64(1.4769405394784691), 'min_wi': np.float64(0.697721997889819)}
Estimated reward: 0.077661
Cross-validated error: 0.000921
Final score CI (reward +- 2*error): [0.075819, 0.079504]
Standard error: 0.002839
Final t_dist CI (reward +- t_0.975*se_hat): [0.072097, 0.083225]
[I 2025-11-02 21:22:06,504] Trial 6 finished with value: 0.07581874439745225 and parameters: {'lr': 0.0004379014604202074, 'num_epochs': 3, 'batch_size': 512, 'num_neighbors': 14, 'lr_decay': 0.9244740028115817}. Best is trial 4 with value: 0.08608192327610659.

Trial 7 started


Best trial: 4. Best value: 0.0860819:  40%|████      | 8/20 [06:45<10:32, 52.72s/it]

Train wi info: {'gini': np.float64(0.3204623448593753), 'ess': np.float64(10418.865767399504), 'max_wi': np.float64(3.441624392201152), 'min_wi': np.float64(0.2472878013334334)}
actual reward: [0.08635584]
{'gini': np.float64(0.3133846240422901), 'ess': np.float64(7079.7898174104575), 'max_wi': np.float64(3.441624392201152), 'min_wi': np.float64(0.22847320199986018)}
Estimated reward: 0.080649
Cross-validated error: 0.001039
Final score CI (reward +- 2*error): [0.078570, 0.082728]
Standard error: 0.003712
Final t_dist CI (reward +- t_0.975*se_hat): [0.073373, 0.087926]
[I 2025-11-02 21:22:58,022] Trial 7 finished with value: 0.07857047596425024 and parameters: {'lr': 0.0017008289115198342, 'num_epochs': 3, 'batch_size': 512, 'num_neighbors': 4, 'lr_decay': 0.8307738917472164}. Best is trial 4 with value: 0.08608192327610659.

Trial 8 started


Best trial: 4. Best value: 0.0860819:  45%|████▌     | 9/20 [07:35<09:32, 52.01s/it]

Train wi info: {'gini': np.float64(0.06428942429864141), 'ess': np.float64(14773.3208761671), 'max_wi': np.float64(1.343710537198497), 'min_wi': np.float64(0.7647705278599469)}
actual reward: [0.08614781]
{'gini': np.float64(0.06289268776876344), 'ess': np.float64(9856.587807140026), 'max_wi': np.float64(1.343710537198497), 'min_wi': np.float64(0.7756148105679008)}
Estimated reward: 0.078311
Cross-validated error: 0.001038
Final score CI (reward +- 2*error): [0.076235, 0.080388]
Standard error: 0.002930
Final t_dist CI (reward +- t_0.975*se_hat): [0.072569, 0.084054]
[I 2025-11-02 21:23:48,469] Trial 8 finished with value: 0.07623470077689548 and parameters: {'lr': 0.00033379405361638374, 'num_epochs': 3, 'batch_size': 512, 'num_neighbors': 6, 'lr_decay': 0.8974555152342357}. Best is trial 4 with value: 0.08608192327610659.

Trial 9 started


Best trial: 4. Best value: 0.0860819:  50%|█████     | 10/20 [08:28<08:41, 52.19s/it]

Train wi info: {'gini': np.float64(0.17006807766906587), 'ess': np.float64(13496.24446162227), 'max_wi': np.float64(2.0495981123904805), 'min_wi': np.float64(0.45730164401625684)}
actual reward: [0.08621922]
{'gini': np.float64(0.1667646416876915), 'ess': np.float64(9039.68735213694), 'max_wi': np.float64(2.0495981123904805), 'min_wi': np.float64(0.4440326801624289)}
Estimated reward: 0.077756
Cross-validated error: 0.000827
Final score CI (reward +- 2*error): [0.076101, 0.079411]
Standard error: 0.002964
Final t_dist CI (reward +- t_0.975*se_hat): [0.071947, 0.083565]
[I 2025-11-02 21:24:41,055] Trial 9 finished with value: 0.07610128455551343 and parameters: {'lr': 0.0001420929232713414, 'num_epochs': 8, 'batch_size': 128, 'num_neighbors': 13, 'lr_decay': 0.9329222116673767}. Best is trial 4 with value: 0.08608192327610659.

Trial 10 started


Best trial: 10. Best value: 0.0903253:  55%|█████▌    | 11/20 [09:20<07:49, 52.17s/it]

Train wi info: {'gini': np.float64(0.984680028399322), 'ess': np.float64(51.370476355434164), 'max_wi': np.float64(1489.9293371727756), 'min_wi': np.float64(9.293805204136389e-08)}
actual reward: [0.08746998]
{'gini': np.float64(0.9823908954252903), 'ess': np.float64(34.07950710151196), 'max_wi': np.float64(1106.4624375698663), 'min_wi': np.float64(1.1376803436110621e-07)}
Estimated reward: 0.105869
Cross-validated error: 0.007772
Final score CI (reward +- 2*error): [0.090325, 0.121412]
Standard error: 0.019147
Final t_dist CI (reward +- t_0.975*se_hat): [0.068336, 0.143401]
[I 2025-11-02 21:25:33,179] Trial 10 finished with value: 0.09032526162384989 and parameters: {'lr': 0.008821804503828828, 'num_epochs': 10, 'batch_size': 256, 'num_neighbors': 15, 'lr_decay': 0.8115243802575336}. Best is trial 10 with value: 0.09032526162384989.

Trial 11 started


Best trial: 10. Best value: 0.0903253:  60%|██████    | 12/20 [10:07<06:44, 50.53s/it]

Train wi info: {'gini': np.float64(0.9625750161964288), 'ess': np.float64(64.64399588645695), 'max_wi': np.float64(1728.6486334344133), 'min_wi': np.float64(5.342873066350013e-06)}
actual reward: [0.0892708]
{'gini': np.float64(0.9542698121731772), 'ess': np.float64(57.444070948526786), 'max_wi': np.float64(1095.7601454436542), 'min_wi': np.float64(6.038385129098937e-06)}
Estimated reward: 0.091621
Cross-validated error: 0.004425
Final score CI (reward +- 2*error): [0.082771, 0.100471]
Standard error: 0.011318
Final t_dist CI (reward +- t_0.975*se_hat): [0.069436, 0.113806]
[I 2025-11-02 21:26:19,955] Trial 11 finished with value: 0.08277107706736264 and parameters: {'lr': 0.006517744229558574, 'num_epochs': 10, 'batch_size': 256, 'num_neighbors': 15, 'lr_decay': 0.8080669746922542}. Best is trial 10 with value: 0.09032526162384989.

Trial 12 started


Best trial: 10. Best value: 0.0903253:  65%|██████▌   | 13/20 [10:58<05:54, 50.67s/it]

Train wi info: {'gini': np.float64(0.9559061098286505), 'ess': np.float64(82.21423087507164), 'max_wi': np.float64(1481.423118703355), 'min_wi': np.float64(7.67127842317338e-06)}
actual reward: [0.08953504]
{'gini': np.float64(0.9475776321137733), 'ess': np.float64(89.15982565157692), 'max_wi': np.float64(857.1494999146395), 'min_wi': np.float64(8.479727984010711e-06)}
Estimated reward: 0.093867
Cross-validated error: 0.004284
Final score CI (reward +- 2*error): [0.085300, 0.102435]
Standard error: 0.011763
Final t_dist CI (reward +- t_0.975*se_hat): [0.070810, 0.116924]
[I 2025-11-02 21:27:10,941] Trial 12 finished with value: 0.08529975858280348 and parameters: {'lr': 0.007613624839786313, 'num_epochs': 7, 'batch_size': 256, 'num_neighbors': 12, 'lr_decay': 0.8007005284253748}. Best is trial 10 with value: 0.09032526162384989.

Trial 13 started


Best trial: 10. Best value: 0.0903253:  70%|███████   | 14/20 [11:51<05:07, 51.32s/it]

Train wi info: {'gini': np.float64(0.7779537922382772), 'ess': np.float64(2475.4250089358643), 'max_wi': np.float64(18.481116525269297), 'min_wi': np.float64(0.003675616348882142)}
actual reward: [0.08706712]
{'gini': np.float64(0.7661006919011201), 'ess': np.float64(1783.4722245054952), 'max_wi': np.float64(18.481116525269297), 'min_wi': np.float64(0.0033479311862302246)}
Estimated reward: 0.088632
Cross-validated error: 0.001924
Final score CI (reward +- 2*error): [0.084784, 0.092480]
Standard error: 0.007168
Final t_dist CI (reward +- t_0.975*se_hat): [0.074581, 0.102683]
[I 2025-11-02 21:28:03,783] Trial 13 finished with value: 0.08478370529243177 and parameters: {'lr': 0.001910837271873611, 'num_epochs': 10, 'batch_size': 256, 'num_neighbors': 12, 'lr_decay': 0.8269787027804186}. Best is trial 10 with value: 0.09032526162384989.

Trial 14 started


Best trial: 10. Best value: 0.0903253:  75%|███████▌  | 15/20 [12:42<04:16, 51.36s/it]

Train wi info: {'gini': np.float64(0.9944547041331975), 'ess': np.float64(5.833238242027258), 'max_wi': np.float64(6161.43280062773), 'min_wi': np.float64(5.781044918559182e-09)}
actual reward: [0.08588057]
{'gini': np.float64(0.9938903037779997), 'ess': np.float64(5.75908359053859), 'max_wi': np.float64(3239.2531051295573), 'min_wi': np.float64(6.03614195845434e-09)}
Estimated reward: 0.096673
Cross-validated error: 0.009233
Final score CI (reward +- 2*error): [0.078208, 0.115138]
Standard error: 0.025631
Final t_dist CI (reward +- t_0.975*se_hat): [0.046431, 0.146916]
[I 2025-11-02 21:28:55,234] Trial 14 finished with value: 0.07820780441234168 and parameters: {'lr': 0.01188331445759872, 'num_epochs': 7, 'batch_size': 256, 'num_neighbors': 15, 'lr_decay': 0.8340404277290911}. Best is trial 10 with value: 0.09032526162384989.

Trial 15 started


Best trial: 10. Best value: 0.0903253:  80%|████████  | 16/20 [13:33<03:24, 51.16s/it]

Train wi info: {'gini': np.float64(0.11796786429071536), 'ess': np.float64(14258.753407449987), 'max_wi': np.float64(1.7117894782950767), 'min_wi': np.float64(0.6049393660669814)}
actual reward: [0.08618319]
{'gini': np.float64(0.1157383642475569), 'ess': np.float64(9526.422562711632), 'max_wi': np.float64(1.7117894782950767), 'min_wi': np.float64(0.6066937044982997)}
Estimated reward: 0.077708
Cross-validated error: 0.000815
Final score CI (reward +- 2*error): [0.076078, 0.079338]
Standard error: 0.002896
Final t_dist CI (reward +- t_0.975*se_hat): [0.072030, 0.083385]
[I 2025-11-02 21:29:45,936] Trial 15 finished with value: 0.07607757714275123 and parameters: {'lr': 0.0009708034538829974, 'num_epochs': 1, 'batch_size': 256, 'num_neighbors': 11, 'lr_decay': 0.9955499363698681}. Best is trial 10 with value: 0.09032526162384989.

Trial 16 started


Best trial: 10. Best value: 0.0903253:  85%|████████▌ | 17/20 [14:24<02:33, 51.19s/it]

Train wi info: {'gini': np.float64(0.8656269189938661), 'ess': np.float64(1628.4545080724986), 'max_wi': np.float64(34.124882440271), 'min_wi': np.float64(0.00043597658473810464)}
actual reward: [0.08861768]
{'gini': np.float64(0.8531686554929504), 'ess': np.float64(1184.4779993971056), 'max_wi': np.float64(42.04804657885498), 'min_wi': np.float64(0.00042482743267548216)}
Estimated reward: 0.092442
Cross-validated error: 0.002690
Final score CI (reward +- 2*error): [0.087062, 0.097821]
Standard error: 0.008662
Final t_dist CI (reward +- t_0.975*se_hat): [0.075463, 0.109421]
[I 2025-11-02 21:30:37,198] Trial 16 finished with value: 0.08706245406087569 and parameters: {'lr': 0.003765668201688345, 'num_epochs': 7, 'batch_size': 256, 'num_neighbors': 13, 'lr_decay': 0.8140981105960752}. Best is trial 10 with value: 0.09032526162384989.

Trial 17 started


Best trial: 10. Best value: 0.0903253:  90%|█████████ | 18/20 [15:15<01:42, 51.23s/it]

Train wi info: {'gini': np.float64(0.99535360336638), 'ess': np.float64(19.50482599455688), 'max_wi': np.float64(1673.1444325066998), 'min_wi': np.float64(1.1172191946721006e-10)}
actual reward: [0.08642941]
{'gini': np.float64(0.9963948508417728), 'ess': np.float64(8.540393468055507), 'max_wi': np.float64(2165.9818701670315), 'min_wi': np.float64(1.6258193812858966e-10)}
Estimated reward: 0.086579
Cross-validated error: 0.007764
Final score CI (reward +- 2*error): [0.071051, 0.102108]
Standard error: 0.020005
Final t_dist CI (reward +- t_0.975*se_hat): [0.047365, 0.125794]
[I 2025-11-02 21:31:28,530] Trial 17 finished with value: 0.07105078535399333 and parameters: {'lr': 0.01693052907889805, 'num_epochs': 9, 'batch_size': 256, 'num_neighbors': 13, 'lr_decay': 0.8108819949203055}. Best is trial 10 with value: 0.09032526162384989.

Trial 18 started


Best trial: 18. Best value: 0.0931721:  95%|█████████▌| 19/20 [16:08<00:51, 51.67s/it]

Train wi info: {'gini': np.float64(0.9518297338774782), 'ess': np.float64(43.46950616018178), 'max_wi': np.float64(2242.0391936677065), 'min_wi': np.float64(4.324676004668218e-06)}
actual reward: [0.08943429]
{'gini': np.float64(0.9424889824845832), 'ess': np.float64(35.03038279707777), 'max_wi': np.float64(1572.6067255187463), 'min_wi': np.float64(4.324676004668218e-06)}
Estimated reward: 0.104737
Cross-validated error: 0.005783
Final score CI (reward +- 2*error): [0.093172, 0.116303]
Standard error: 0.011879
Final t_dist CI (reward +- t_0.975*se_hat): [0.081453, 0.128022]
[I 2025-11-02 21:32:21,210] Trial 18 finished with value: 0.0931720892674161 and parameters: {'lr': 0.00308032468803497, 'num_epochs': 7, 'batch_size': 128, 'num_neighbors': 15, 'lr_decay': 0.9230119525572414}. Best is trial 18 with value: 0.0931720892674161.

Trial 19 started


Best trial: 18. Best value: 0.0931721: 100%|██████████| 20/20 [17:01<00:00, 51.06s/it]

Train wi info: {'gini': np.float64(0.776764947013987), 'ess': np.float64(2561.3302782660808), 'max_wi': np.float64(21.24674187623402), 'min_wi': np.float64(0.0030063969800626865)}
actual reward: [0.08716433]
{'gini': np.float64(0.765499305841617), 'ess': np.float64(1825.685461694621), 'max_wi': np.float64(23.615959658617033), 'min_wi': np.float64(0.0032950065787286445)}
Estimated reward: 0.087751
Cross-validated error: 0.002122
Final score CI (reward +- 2*error): [0.083507, 0.091996]
Standard error: 0.006920
Final t_dist CI (reward +- t_0.975*se_hat): [0.074187, 0.101315]
[I 2025-11-02 21:33:13,919] Trial 19 finished with value: 0.08350693662902288 and parameters: {'lr': 0.0008971223503204559, 'num_epochs': 9, 'batch_size': 128, 'num_neighbors': 15, 'lr_decay': 0.9312294812405533}. Best is trial 18 with value: 0.0931720892674161.





Num samples is 10000
{'gini': np.float64(0.9360595847771348), 'ess': np.float64(307.40500003324627), 'max_wi': np.float64(185.78904343050002), 'min_wi': np.float64(2.0316160421014675e-05)}
Eval time: 0.11384701728820801 seconds
Evaluation total results time: 0.4126303195953369 seconds


Unnamed: 0,policy_rewards,ipw,reg_dm,conv_dm,conv_dr,conv_sndr,action_diff_to_real,action_delta,context_diff_to_real,context_delta
0,0.08610747,0.0858,0.08587958,0.09146488,0.0870202,0.0870202,0.7569287,0.0,0.87627132,0.0
15000,0.08986485,0.08938016,0.08942195,0.09146836,0.09230034,0.09230175,0.84992686,0.24763566,0.92641604,0.15059657


### Policy with delta function

In [7]:
dataset_params = dict(
                    n_actions= 500,
                    n_users = 500,
                    emb_dim = 16,
                    # sigma = 0.1,
                    eps = 0.6, # this is the epsilon for the noise in the ground truth policy representation
                    ctr = 0.1
                    )

train_dataset = generate_dataset(dataset_params, seed=10000)

Random Item CTR: 0.07083863592474163
Optimal greedy CTR: 0.09999916436977967
Second Best greedy CTR: 0.08797326118616329
Optimal Stochastic CTR: 0.0999493542444427
second Best Stochastic CTR: 0.0854530317781557
Our Initial CTR: 0.08557719469284641


In [8]:
# Run the optimization
df5, best_hyperparams_by_size = trainer_trial(num_runs, num_neighbors, num_rounds_list, train_dataset, batch_size, val_size=10000, n_trials=n_trials_for_optuna, prev_best_params=best_params_to_use)

# Show the performance metrics
df5[['policy_rewards', 'ipw', 'reg_dm', 'conv_dm', 'conv_dr', 'conv_sndr', 'action_diff_to_real', 'action_delta', 'context_diff_to_real', 'context_delta']]

Simulation time for 10000 samples: 0.026820898056030273 seconds
Baseline regression model fit time: 0.08495235443115234 seconds
Baseline neighborhood model fit time: 30.432530641555786 seconds
Num samples is 10000
{'gini': np.float64(0.4624242051067587), 'ess': np.float64(4432.5227373596235), 'max_wi': np.float64(37.468396402411294), 'min_wi': np.float64(0.02084098835003422)}
Eval time: 0.14079689979553223 seconds


[I 2025-11-02 21:34:30,448] A new study created in memory with name: no-name-6a0fbb1a-5c60-4c6d-bfdd-51e0b4be8e5f


Evaluation total results time: 0.4810309410095215 seconds
Simulation time for 25000 samples: 0.061788320541381836 seconds


  0%|          | 0/20 [00:00<?, ?it/s]


Trial 0 started


Best trial: 0. Best value: 0.0662385:   5%|▌         | 1/20 [00:47<15:09, 47.86s/it]

Train wi info: {'gini': np.float64(0.9741118598542415), 'ess': np.float64(32.515198013231355), 'max_wi': np.float64(2132.968547441097), 'min_wi': np.float64(0.0)}
actual reward: [0.07994104]
{'gini': np.float64(0.9765372229049938), 'ess': np.float64(15.359033689403239), 'max_wi': np.float64(2725.919317878957), 'min_wi': np.float64(0.0)}
Estimated reward: 0.073515
Cross-validated error: 0.003638
Final score CI (reward +- 2*error): [0.066238, 0.080792]
Standard error: 0.009834
Final t_dist CI (reward +- t_0.975*se_hat): [0.054240, 0.092791]
[I 2025-11-02 21:35:18,310] Trial 0 finished with value: 0.06623846768464366 and parameters: {'lr': 0.096, 'num_epochs': 5, 'batch_size': 64, 'num_neighbors': 8, 'lr_decay': 0.85}. Best is trial 0 with value: 0.06623846768464366.

Trial 1 started


Best trial: 0. Best value: 0.0662385:  10%|█         | 2/20 [01:37<14:38, 48.78s/it]

Train wi info: {'gini': np.float64(0.994920152832203), 'ess': np.float64(18.562521671195), 'max_wi': np.float64(1811.8910310149515), 'min_wi': np.float64(9.232650843605447e-09)}
actual reward: [0.08546245]
{'gini': np.float64(0.9941084868294919), 'ess': np.float64(20.667946770271882), 'max_wi': np.float64(1436.4340685930088), 'min_wi': np.float64(1.4854185463376623e-08)}
Estimated reward: 0.054560
Cross-validated error: 0.006333
Final score CI (reward +- 2*error): [0.041893, 0.067227]
Standard error: 0.027200
Final t_dist CI (reward +- t_0.975*se_hat): [0.001243, 0.107877]
[I 2025-11-02 21:36:07,736] Trial 1 finished with value: 0.04189317862103655 and parameters: {'lr': 0.012414590683834518, 'num_epochs': 4, 'batch_size': 256, 'num_neighbors': 15, 'lr_decay': 0.8202913674400782}. Best is trial 0 with value: 0.06623846768464366.

Trial 2 started


Best trial: 2. Best value: 0.0768703:  15%|█▌        | 3/20 [02:25<13:43, 48.44s/it]

Train wi info: {'gini': np.float64(0.8923664974924475), 'ess': np.float64(1436.639366573889), 'max_wi': np.float64(28.416994866161563), 'min_wi': np.float64(0.00024537317456320055)}
actual reward: [0.08477533]
{'gini': np.float64(0.8834420999114245), 'ess': np.float64(1049.7280204981807), 'max_wi': np.float64(35.73444537875986), 'min_wi': np.float64(0.00017664284487497003)}
Estimated reward: 0.081796
Cross-validated error: 0.002463
Final score CI (reward +- 2*error): [0.076870, 0.086722]
Standard error: 0.008729
Final t_dist CI (reward +- t_0.975*se_hat): [0.064685, 0.098908]
[I 2025-11-02 21:36:55,773] Trial 2 finished with value: 0.07687025565318523 and parameters: {'lr': 0.012971198623181187, 'num_epochs': 2, 'batch_size': 512, 'num_neighbors': 7, 'lr_decay': 0.8483220528282325}. Best is trial 2 with value: 0.07687025565318523.

Trial 3 started


Best trial: 3. Best value: 0.0769814:  20%|██        | 4/20 [03:18<13:26, 50.41s/it]

Train wi info: {'gini': np.float64(0.9982351430584516), 'ess': np.float64(6.402259782520546), 'max_wi': np.float64(3075.14270329811), 'min_wi': np.float64(1.3663119599950197e-23)}
actual reward: [0.08194345]
{'gini': np.float64(0.9971167624669582), 'ess': np.float64(9.361803002322326), 'max_wi': np.float64(1117.125149705426), 'min_wi': np.float64(5.356499165792386e-23)}
Estimated reward: 0.096639
Cross-validated error: 0.009829
Final score CI (reward +- 2*error): [0.076981, 0.116297]
Standard error: 0.014714
Final t_dist CI (reward +- t_0.975*se_hat): [0.067796, 0.125483]
[I 2025-11-02 21:37:49,211] Trial 3 finished with value: 0.07698138827524716 and parameters: {'lr': 0.08944063333621789, 'num_epochs': 6, 'batch_size': 512, 'num_neighbors': 11, 'lr_decay': 0.8536708390534476}. Best is trial 3 with value: 0.07698138827524716.

Trial 4 started


Best trial: 3. Best value: 0.0769814:  25%|██▌       | 5/20 [04:08<12:32, 50.16s/it]

Train wi info: {'gini': np.float64(0.06427138590689575), 'ess': np.float64(14765.075816581733), 'max_wi': np.float64(1.3894150000042884), 'min_wi': np.float64(0.7591490334165432)}
actual reward: [0.0854956]
{'gini': np.float64(0.06466170137553622), 'ess': np.float64(9844.309108460395), 'max_wi': np.float64(1.3935132708670206), 'min_wi': np.float64(0.7597789826058131)}
Estimated reward: 0.074110
Cross-validated error: 0.000837
Final score CI (reward +- 2*error): [0.072436, 0.075785]
Standard error: 0.003140
Final t_dist CI (reward +- t_0.975*se_hat): [0.067955, 0.080266]
[I 2025-11-02 21:38:38,933] Trial 4 finished with value: 0.0724360018237279 and parameters: {'lr': 0.00012635037242523983, 'num_epochs': 4, 'batch_size': 256, 'num_neighbors': 3, 'lr_decay': 0.9088228855294687}. Best is trial 3 with value: 0.07698138827524716.

Trial 5 started


Best trial: 3. Best value: 0.0769814:  30%|███       | 6/20 [05:00<11:52, 50.91s/it]

Train wi info: {'gini': np.float64(0.9952175622922212), 'ess': np.float64(12.834813704413847), 'max_wi': np.float64(2639.7505631454533), 'min_wi': np.float64(1.2197028889990023e-15)}
actual reward: [0.08858014]
{'gini': np.float64(0.9962343959644048), 'ess': np.float64(11.104356754959747), 'max_wi': np.float64(2785.3111864339257), 'min_wi': np.float64(1.2197028889990023e-15)}
Estimated reward: 0.069735
Cross-validated error: 0.007524
Final score CI (reward +- 2*error): [0.054688, 0.084782]
Standard error: 0.021007
Final t_dist CI (reward +- t_0.975*se_hat): [0.028557, 0.110914]
[I 2025-11-02 21:39:31,282] Trial 5 finished with value: 0.05468810793089637 and parameters: {'lr': 0.018122974317976333, 'num_epochs': 9, 'batch_size': 128, 'num_neighbors': 5, 'lr_decay': 0.9288531461193446}. Best is trial 3 with value: 0.07698138827524716.

Trial 6 started


Best trial: 3. Best value: 0.0769814:  35%|███▌      | 7/20 [05:52<11:03, 51.01s/it]

Train wi info: {'gini': np.float64(0.14706489663073102), 'ess': np.float64(13768.056387931307), 'max_wi': np.float64(2.034514853308525), 'min_wi': np.float64(0.5324715211074968)}
actual reward: [0.08537136]
{'gini': np.float64(0.14750446157155786), 'ess': np.float64(9194.03961422289), 'max_wi': np.float64(1.9988674347771251), 'min_wi': np.float64(0.5343635528912245)}
Estimated reward: 0.074964
Cross-validated error: 0.000957
Final score CI (reward +- 2*error): [0.073051, 0.076878]
Standard error: 0.003115
Final t_dist CI (reward +- t_0.975*se_hat): [0.068858, 0.081071]
[I 2025-11-02 21:40:22,491] Trial 6 finished with value: 0.07305132763092684 and parameters: {'lr': 0.000328185716418179, 'num_epochs': 8, 'batch_size': 512, 'num_neighbors': 4, 'lr_decay': 0.8652934669395477}. Best is trial 3 with value: 0.07698138827524716.

Trial 7 started


Best trial: 3. Best value: 0.0769814:  40%|████      | 8/20 [06:42<10:11, 50.94s/it]

Train wi info: {'gini': np.float64(0.18715796902216897), 'ess': np.float64(13044.747581859248), 'max_wi': np.float64(2.392855254826064), 'min_wi': np.float64(0.45042031999947924)}
actual reward: [0.08531359]
{'gini': np.float64(0.18765335774317363), 'ess': np.float64(8720.790901493798), 'max_wi': np.float64(2.392855254826064), 'min_wi': np.float64(0.4372296530331499)}
Estimated reward: 0.076468
Cross-validated error: 0.000819
Final score CI (reward +- 2*error): [0.074830, 0.078107]
Standard error: 0.002959
Final t_dist CI (reward +- t_0.975*se_hat): [0.070667, 0.082269]
[I 2025-11-02 21:41:13,282] Trial 7 finished with value: 0.07482972734245831 and parameters: {'lr': 0.0005861996752465763, 'num_epochs': 4, 'batch_size': 512, 'num_neighbors': 13, 'lr_decay': 0.9264110198732464}. Best is trial 3 with value: 0.07698138827524716.

Trial 8 started


Best trial: 3. Best value: 0.0769814:  45%|████▌     | 9/20 [07:34<09:22, 51.15s/it]

Train wi info: {'gini': np.float64(0.980233186425865), 'ess': np.float64(73.26405920371934), 'max_wi': np.float64(532.0221136425058), 'min_wi': np.float64(6.880388484252633e-07)}
actual reward: [0.0875046]
{'gini': np.float64(0.9784461850924151), 'ess': np.float64(75.76451241374721), 'max_wi': np.float64(519.9745858291391), 'min_wi': np.float64(6.880388484252633e-07)}
Estimated reward: 0.085702
Cross-validated error: 0.007283
Final score CI (reward +- 2*error): [0.071137, 0.100267]
Standard error: 0.024872
Final t_dist CI (reward +- t_0.975*se_hat): [0.036948, 0.134456]
[I 2025-11-02 21:42:04,908] Trial 8 finished with value: 0.07113680447142069 and parameters: {'lr': 0.004774991162107264, 'num_epochs': 10, 'batch_size': 512, 'num_neighbors': 8, 'lr_decay': 0.9878899108575975}. Best is trial 3 with value: 0.07698138827524716.

Trial 9 started


Best trial: 3. Best value: 0.0769814:  50%|█████     | 10/20 [08:24<08:26, 50.68s/it]

Train wi info: {'gini': np.float64(0.8224567525053781), 'ess': np.float64(1821.5873242198036), 'max_wi': np.float64(17.267396102524227), 'min_wi': np.float64(0.001944757209403515)}
actual reward: [0.08204495]
{'gini': np.float64(0.8186353896770724), 'ess': np.float64(1283.4230670856418), 'max_wi': np.float64(24.83730697468385), 'min_wi': np.float64(0.00205334842322096)}
Estimated reward: 0.078891
Cross-validated error: 0.002384
Final score CI (reward +- 2*error): [0.074122, 0.083660]
Standard error: 0.008357
Final t_dist CI (reward +- t_0.975*se_hat): [0.062510, 0.095271]
[I 2025-11-02 21:42:54,538] Trial 9 finished with value: 0.07412182285931056 and parameters: {'lr': 0.0036019166263182075, 'num_epochs': 2, 'batch_size': 256, 'num_neighbors': 4, 'lr_decay': 0.9849428566372112}. Best is trial 3 with value: 0.07698138827524716.

Trial 10 started


Best trial: 3. Best value: 0.0769814:  55%|█████▌    | 11/20 [09:15<07:37, 50.83s/it]

Train wi info: {'gini': np.float64(0.9918736942150935), 'ess': np.float64(2.897133923700074), 'max_wi': np.float64(7619.410770367967), 'min_wi': np.float64(4.788692230239379e-39)}
actual reward: [0.08114858]
{'gini': np.float64(0.9929558841813472), 'ess': np.float64(9.650046122026904), 'max_wi': np.float64(1646.069492634317), 'min_wi': np.float64(6.871564940850039e-39)}
Estimated reward: 0.049963
Cross-validated error: 0.004322
Final score CI (reward +- 2*error): [0.041319, 0.058607]
Standard error: 0.019015
Final t_dist CI (reward +- t_0.975*se_hat): [0.012690, 0.087236]
[I 2025-11-02 21:43:45,698] Trial 10 finished with value: 0.04131945868667698 and parameters: {'lr': 0.09330244676963051, 'num_epochs': 7, 'batch_size': 128, 'num_neighbors': 12, 'lr_decay': 0.8824197245279752}. Best is trial 3 with value: 0.07698138827524716.

Trial 11 started


Best trial: 3. Best value: 0.0769814:  60%|██████    | 12/20 [10:06<06:47, 50.95s/it]

Train wi info: {'gini': np.float64(0.9160111537566022), 'ess': np.float64(1184.0131279873235), 'max_wi': np.float64(27.703065148627005), 'min_wi': np.float64(8.122705939416716e-05)}
actual reward: [0.07993098]
{'gini': np.float64(0.909884442577868), 'ess': np.float64(851.160773234211), 'max_wi': np.float64(29.73973548936245), 'min_wi': np.float64(8.821571738712259e-05)}
Estimated reward: 0.076356
Cross-validated error: 0.002968
Final score CI (reward +- 2*error): [0.070419, 0.082292]
Standard error: 0.009349
Final t_dist CI (reward +- t_0.975*se_hat): [0.058030, 0.094681]
[I 2025-11-02 21:44:36,921] Trial 11 finished with value: 0.07041935659865878 and parameters: {'lr': 0.021994368937810767, 'num_epochs': 1, 'batch_size': 512, 'num_neighbors': 10, 'lr_decay': 0.8006999392369891}. Best is trial 3 with value: 0.07698138827524716.

Trial 12 started


Best trial: 3. Best value: 0.0769814:  65%|██████▌   | 13/20 [10:51<05:44, 49.15s/it]

Train wi info: {'gini': np.float64(0.9985946291431608), 'ess': np.float64(8.765639469362204), 'max_wi': np.float64(2404.176070468512), 'min_wi': np.float64(2.866938958660025e-12)}
actual reward: [0.08028875]
{'gini': np.float64(0.9991588061348201), 'ess': np.float64(4.284868049882274), 'max_wi': np.float64(7236.557973207273), 'min_wi': np.float64(2.866938958660025e-12)}
Estimated reward: 0.024080
Cross-validated error: 0.003634
Final score CI (reward +- 2*error): [0.016812, 0.031348]
Standard error: 0.024696
Final t_dist CI (reward +- t_0.975*se_hat): [-0.024329, 0.072490]
[I 2025-11-02 21:45:21,931] Trial 12 finished with value: 0.01681209876025308 and parameters: {'lr': 0.028324199583789803, 'num_epochs': 6, 'batch_size': 512, 'num_neighbors': 10, 'lr_decay': 0.8385897631279723}. Best is trial 3 with value: 0.07698138827524716.

Trial 13 started


Best trial: 3. Best value: 0.0769814:  70%|███████   | 14/20 [11:42<04:58, 49.80s/it]

Train wi info: {'gini': np.float64(0.9761964897401556), 'ess': np.float64(13.546089100669864), 'max_wi': np.float64(3041.3617586052055), 'min_wi': np.float64(1.7702509620638117e-20)}
actual reward: [0.08646409]
{'gini': np.float64(0.9659322366351057), 'ess': np.float64(359.72897979260716), 'max_wi': np.float64(64.48319484582538), 'min_wi': np.float64(1.7702509620638117e-20)}
Estimated reward: 0.079897
Cross-validated error: 0.004839
Final score CI (reward +- 2*error): [0.070218, 0.089576]
Standard error: 0.008116
Final t_dist CI (reward +- t_0.975*se_hat): [0.063988, 0.095806]
[I 2025-11-02 21:46:13,250] Trial 13 finished with value: 0.0702181648258626 and parameters: {'lr': 0.044298028704981174, 'num_epochs': 2, 'batch_size': 64, 'num_neighbors': 6, 'lr_decay': 0.8866695591834365}. Best is trial 3 with value: 0.07698138827524716.

Trial 14 started


Best trial: 14. Best value: 0.0770537:  75%|███████▌  | 15/20 [12:33<04:10, 50.12s/it]

Train wi info: {'gini': np.float64(0.9307201777995682), 'ess': np.float64(449.8791742359363), 'max_wi': np.float64(235.00621251732156), 'min_wi': np.float64(2.6842192899385594e-05)}
actual reward: [0.08804588]
{'gini': np.float64(0.9270016840980667), 'ess': np.float64(328.60044084463806), 'max_wi': np.float64(220.7806938975361), 'min_wi': np.float64(2.6842192899385594e-05)}
Estimated reward: 0.087657
Cross-validated error: 0.005302
Final score CI (reward +- 2*error): [0.077054, 0.098260]
Standard error: 0.016322
Final t_dist CI (reward +- t_0.975*se_hat): [0.055663, 0.119651]
[I 2025-11-02 21:47:04,116] Trial 14 finished with value: 0.07705365655814808 and parameters: {'lr': 0.008194797415104297, 'num_epochs': 6, 'batch_size': 512, 'num_neighbors': 7, 'lr_decay': 0.8366680141396603}. Best is trial 14 with value: 0.07705365655814808.

Trial 15 started


Best trial: 14. Best value: 0.0770537:  80%|████████  | 16/20 [13:25<03:22, 50.65s/it]

Train wi info: {'gini': np.float64(0.4072217486596382), 'ess': np.float64(7877.800839527366), 'max_wi': np.float64(5.355320333749217), 'min_wi': np.float64(0.13470259022229997)}
actual reward: [0.08479021]
{'gini': np.float64(0.407257579578481), 'ess': np.float64(5342.291179282974), 'max_wi': np.float64(5.689683297699452), 'min_wi': np.float64(0.1406717716346172)}
Estimated reward: 0.077989
Cross-validated error: 0.001149
Final score CI (reward +- 2*error): [0.075691, 0.080288]
Standard error: 0.003807
Final t_dist CI (reward +- t_0.975*se_hat): [0.070526, 0.085452]
[I 2025-11-02 21:47:55,985] Trial 15 finished with value: 0.07569103952394328 and parameters: {'lr': 0.0010735123990010238, 'num_epochs': 7, 'batch_size': 512, 'num_neighbors': 12, 'lr_decay': 0.8188143722781407}. Best is trial 14 with value: 0.07705365655814808.

Trial 16 started


Best trial: 14. Best value: 0.0770537:  85%|████████▌ | 17/20 [14:17<02:32, 50.99s/it]

Train wi info: {'gini': np.float64(0.6593681257561729), 'ess': np.float64(3402.208551760961), 'max_wi': np.float64(11.020995404500486), 'min_wi': np.float64(0.021840679018222696)}
actual reward: [0.08349727]
{'gini': np.float64(0.6580374366626692), 'ess': np.float64(2362.3168303590055), 'max_wi': np.float64(14.12188846316629), 'min_wi': np.float64(0.022030879361332044)}
Estimated reward: 0.077999
Cross-validated error: 0.001593
Final score CI (reward +- 2*error): [0.074813, 0.081184]
Standard error: 0.005725
Final t_dist CI (reward +- t_0.975*se_hat): [0.066777, 0.089220]
[I 2025-11-02 21:48:47,772] Trial 16 finished with value: 0.07481342395683073 and parameters: {'lr': 0.0017701718319250795, 'num_epochs': 6, 'batch_size': 512, 'num_neighbors': 10, 'lr_decay': 0.8683828861436919}. Best is trial 14 with value: 0.07705365655814808.

Trial 17 started


Best trial: 14. Best value: 0.0770537:  90%|█████████ | 18/20 [15:11<01:43, 51.93s/it]

Train wi info: {'gini': np.float64(0.9862479206887896), 'ess': np.float64(51.901839255042574), 'max_wi': np.float64(644.272391629149), 'min_wi': np.float64(4.275656418950523e-12)}
actual reward: [0.09001216]
{'gini': np.float64(0.9932126286322399), 'ess': np.float64(16.35954588229248), 'max_wi': np.float64(1835.3523990270892), 'min_wi': np.float64(4.275656418950523e-12)}
Estimated reward: 0.096064
Cross-validated error: 0.009928
Final score CI (reward +- 2*error): [0.076208, 0.115919]
Standard error: 0.025018
Final t_dist CI (reward +- t_0.975*se_hat): [0.047023, 0.145104]
[I 2025-11-02 21:49:41,898] Trial 17 finished with value: 0.07620805477696152 and parameters: {'lr': 0.009610202112412924, 'num_epochs': 8, 'batch_size': 64, 'num_neighbors': 15, 'lr_decay': 0.826499433817364}. Best is trial 14 with value: 0.07705365655814808.

Trial 18 started


Best trial: 14. Best value: 0.0770537:  95%|█████████▌| 19/20 [16:02<00:51, 51.52s/it]

Train wi info: {'gini': np.float64(0.9880517063091835), 'ess': np.float64(75.47153963179706), 'max_wi': np.float64(413.98828557969875), 'min_wi': np.float64(3.341185367083889e-09)}
actual reward: [0.0861028]
{'gini': np.float64(0.9880013893001773), 'ess': np.float64(61.25499798451042), 'max_wi': np.float64(287.7370713369324), 'min_wi': np.float64(3.341185367083889e-09)}
Estimated reward: 0.078673
Cross-validated error: 0.007799
Final score CI (reward +- 2*error): [0.063076, 0.094270]
Standard error: 0.013554
Final t_dist CI (reward +- t_0.975*se_hat): [0.052104, 0.105242]
[I 2025-11-02 21:50:32,472] Trial 18 finished with value: 0.06307571039476496 and parameters: {'lr': 0.006830366072451752, 'num_epochs': 5, 'batch_size': 128, 'num_neighbors': 9, 'lr_decay': 0.9408773811114716}. Best is trial 14 with value: 0.07705365655814808.

Trial 19 started


Best trial: 14. Best value: 0.0770537: 100%|██████████| 20/20 [16:52<00:00, 50.63s/it]

Train wi info: {'gini': np.float64(0.9991137711877944), 'ess': np.float64(9.941047089300197), 'max_wi': np.float64(1922.7868040482715), 'min_wi': np.float64(4.373249415554434e-15)}
actual reward: [0.0820328]
{'gini': np.float64(0.9993146384114582), 'ess': np.float64(5.4865715573957266), 'max_wi': np.float64(7035.41729969867), 'min_wi': np.float64(5.768955090271751e-15)}
Estimated reward: 0.011054
Cross-validated error: 0.002692
Final score CI (reward +- 2*error): [0.005671, 0.016437]
Standard error: 0.024755
Final t_dist CI (reward +- t_0.975*se_hat): [-0.037470, 0.059578]
[I 2025-11-02 21:51:23,098] Trial 19 finished with value: 0.0056705500254202994 and parameters: {'lr': 0.04922187681474791, 'num_epochs': 7, 'batch_size': 512, 'num_neighbors': 12, 'lr_decay': 0.863423725647143}. Best is trial 14 with value: 0.07705365655814808.





Num samples is 10000
{'gini': np.float64(0.9931155080650808), 'ess': np.float64(34.09260205722292), 'max_wi': np.float64(756.3882061438497), 'min_wi': np.float64(2.758324709017361e-09)}
Eval time: 0.1286313533782959 seconds
Evaluation total results time: 0.4351184368133545 seconds


Unnamed: 0,policy_rewards,ipw,reg_dm,conv_dm,conv_dr,conv_sndr,action_diff_to_real,action_delta,context_diff_to_real,context_delta
0,0.08557719,0.0866,0.08650364,0.08909426,0.08686976,0.08686976,0.82618217,0.0,0.99950468,0.0
15000,0.08668432,0.06059182,0.08585125,0.10038258,0.07450582,0.07469615,0.96631088,0.37342811,1.10425372,0.21374903


In [9]:
dataset_params = dict(
                    n_actions= 500,
                    n_users = 500,
                    emb_dim = 16,
                    # sigma = 0.1,
                    eps = 0.6, # this is the epsilon for the noise in the ground truth policy representation
                    ctr = 0.1
                    )

train_dataset = generate_dataset(dataset_params, seed=20000)

Random Item CTR: 0.07042251854546815
Optimal greedy CTR: 0.09999934264692525
Second Best greedy CTR: 0.09938443255799592
Optimal Stochastic CTR: 0.09996075464321043
second Best Stochastic CTR: 0.08632684639469405
Our Initial CTR: 0.08647580588501355


In [10]:
# Run the optimization
df6, best_hyperparams_by_size = trainer_trial(num_runs, num_neighbors, num_rounds_list, train_dataset, batch_size, val_size=10000, n_trials=n_trials_for_optuna, prev_best_params=best_params_to_use)

# Show the performance metrics
df6[['policy_rewards', 'ipw', 'reg_dm', 'conv_dm', 'conv_dr', 'conv_sndr', 'action_diff_to_real', 'action_delta', 'context_diff_to_real', 'context_delta']]

Simulation time for 10000 samples: 0.0241241455078125 seconds
Baseline regression model fit time: 0.08198142051696777 seconds
Baseline neighborhood model fit time: 25.81819438934326 seconds
Num samples is 10000
{'gini': np.float64(0.4768924046597257), 'ess': np.float64(4272.9908177404395), 'max_wi': np.float64(20.503708318829123), 'min_wi': np.float64(0.008806514039618792)}
Eval time: 0.1249399185180664 seconds


[I 2025-11-02 21:52:42,608] A new study created in memory with name: no-name-27b11f15-9667-4b35-9646-1a0f881b1380


Evaluation total results time: 0.42638134956359863 seconds
Simulation time for 25000 samples: 0.05681633949279785 seconds


  0%|          | 0/20 [00:00<?, ?it/s]


Trial 0 started


Best trial: 0. Best value: -0.0137531:   5%|▌         | 1/20 [00:40<12:54, 40.77s/it]

Train wi info: {'gini': np.float64(0.9872776902829662), 'ess': np.float64(207.8056304368554), 'max_wi': np.float64(251.05881426478646), 'min_wi': np.float64(0.0)}
actual reward: [0.08599232]
{'gini': np.float64(0.9946432011884521), 'ess': np.float64(34.350357309740915), 'max_wi': np.float64(1251.9795457044888), 'min_wi': np.float64(0.0)}
Estimated reward: 0.008164
Cross-validated error: 0.010958
Final score CI (reward +- 2*error): [-0.013753, 0.030081]
Standard error: 0.038005
Final t_dist CI (reward +- t_0.975*se_hat): [-0.066333, 0.082661]
[I 2025-11-02 21:53:23,382] Trial 0 finished with value: -0.01375312958728705 and parameters: {'lr': 0.096, 'num_epochs': 5, 'batch_size': 64, 'num_neighbors': 8, 'lr_decay': 0.85}. Best is trial 0 with value: -0.01375312958728705.

Trial 1 started


Best trial: 1. Best value: 0.0734966:  10%|█         | 2/20 [01:24<12:42, 42.35s/it] 

Train wi info: {'gini': np.float64(0.15120224698802953), 'ess': np.float64(13878.171425737793), 'max_wi': np.float64(2.4301218898783072), 'min_wi': np.float64(0.4613917346872692)}
actual reward: [0.08681025]
{'gini': np.float64(0.14392189823316415), 'ess': np.float64(9325.375733517149), 'max_wi': np.float64(2.4301218898783072), 'min_wi': np.float64(0.44886885503343055)}
Estimated reward: 0.075388
Cross-validated error: 0.000945
Final score CI (reward +- 2*error): [0.073497, 0.077279]
Standard error: 0.003020
Final t_dist CI (reward +- t_0.975*se_hat): [0.069467, 0.081308]
[I 2025-11-02 21:54:06,837] Trial 1 finished with value: 0.07349660945824209 and parameters: {'lr': 0.0003045362013114571, 'num_epochs': 6, 'batch_size': 128, 'num_neighbors': 5, 'lr_decay': 0.9325464059811139}. Best is trial 1 with value: 0.07349660945824209.

Trial 2 started


Best trial: 1. Best value: 0.0734966:  15%|█▌        | 3/20 [02:17<13:25, 47.40s/it]

Train wi info: {'gini': np.float64(0.9851103779853466), 'ess': np.float64(242.7269407800724), 'max_wi': np.float64(258.4109893277948), 'min_wi': np.float64(0.0)}
actual reward: [0.08259217]
{'gini': np.float64(0.9916231843045875), 'ess': np.float64(72.88795621459231), 'max_wi': np.float64(518.4628279516306), 'min_wi': np.float64(0.0)}
Estimated reward: 0.083251
Cross-validated error: 0.009752
Final score CI (reward +- 2*error): [0.063747, 0.102755]
Standard error: 0.030997
Final t_dist CI (reward +- t_0.975*se_hat): [0.022491, 0.144011]
[I 2025-11-02 21:55:00,252] Trial 2 finished with value: 0.0637470738390947 and parameters: {'lr': 0.09064503394821835, 'num_epochs': 6, 'batch_size': 64, 'num_neighbors': 6, 'lr_decay': 0.8649126532303534}. Best is trial 1 with value: 0.07349660945824209.

Trial 3 started


Best trial: 1. Best value: 0.0734966:  20%|██        | 4/20 [03:09<13:06, 49.14s/it]

Train wi info: {'gini': np.float64(0.3398451570930614), 'ess': np.float64(9875.720270187174), 'max_wi': np.float64(6.891912306613285), 'min_wi': np.float64(0.1521448955072331)}
actual reward: [0.08726173]
{'gini': np.float64(0.32298215368980654), 'ess': np.float64(6902.574125850877), 'max_wi': np.float64(6.891912306613285), 'min_wi': np.float64(0.136524547725294)}
Estimated reward: 0.074087
Cross-validated error: 0.001073
Final score CI (reward +- 2*error): [0.071941, 0.076234]
Standard error: 0.003260
Final t_dist CI (reward +- t_0.975*se_hat): [0.067697, 0.080478]
[I 2025-11-02 21:55:52,054] Trial 3 finished with value: 0.07194144028910689 and parameters: {'lr': 0.000731920292401472, 'num_epochs': 4, 'batch_size': 64, 'num_neighbors': 14, 'lr_decay': 0.8886668091588624}. Best is trial 1 with value: 0.07349660945824209.

Trial 4 started


Best trial: 1. Best value: 0.0734966:  25%|██▌       | 5/20 [04:00<12:25, 49.71s/it]

Train wi info: {'gini': np.float64(0.44236850380374526), 'ess': np.float64(7457.954241841094), 'max_wi': np.float64(10.349280351511252), 'min_wi': np.float64(0.09043553182387948)}
actual reward: [0.0875711]
{'gini': np.float64(0.41996534068278046), 'ess': np.float64(5401.702501591517), 'max_wi': np.float64(10.349280351511252), 'min_wi': np.float64(0.0742951429518546)}
Estimated reward: 0.072353
Cross-validated error: 0.001143
Final score CI (reward +- 2*error): [0.070067, 0.074638]
Standard error: 0.003961
Final t_dist CI (reward +- t_0.975*se_hat): [0.064589, 0.080117]
[I 2025-11-02 21:56:42,786] Trial 4 finished with value: 0.0700672831164115 and parameters: {'lr': 0.006013281797335775, 'num_epochs': 2, 'batch_size': 512, 'num_neighbors': 5, 'lr_decay': 0.8092574479513778}. Best is trial 1 with value: 0.07349660945824209.

Trial 5 started


Best trial: 5. Best value: 0.0750294:  30%|███       | 6/20 [04:51<11:42, 50.21s/it]

Train wi info: {'gini': np.float64(0.051035807833184524), 'ess': np.float64(14872.605682154246), 'max_wi': np.float64(1.3552432470091427), 'min_wi': np.float64(0.7815325770430255)}
actual reward: [0.08658467]
{'gini': np.float64(0.048769473877843), 'ess': np.float64(9922.848986217405), 'max_wi': np.float64(1.3552432470091427), 'min_wi': np.float64(0.7943577321873003)}
Estimated reward: 0.076630
Cross-validated error: 0.000800
Final score CI (reward +- 2*error): [0.075029, 0.078230]
Standard error: 0.002894
Final t_dist CI (reward +- t_0.975*se_hat): [0.070956, 0.082303]
[I 2025-11-02 21:57:33,966] Trial 5 finished with value: 0.07502935898331826 and parameters: {'lr': 0.00039731463238558965, 'num_epochs': 2, 'batch_size': 256, 'num_neighbors': 6, 'lr_decay': 0.9924839563031487}. Best is trial 5 with value: 0.07502935898331826.

Trial 6 started


Best trial: 5. Best value: 0.0750294:  35%|███▌      | 7/20 [05:43<10:59, 50.70s/it]

Train wi info: {'gini': np.float64(0.17252158047739427), 'ess': np.float64(13553.61602147084), 'max_wi': np.float64(2.763062600076801), 'min_wi': np.float64(0.4085611408353566)}
actual reward: [0.08687096]
{'gini': np.float64(0.16384581814145677), 'ess': np.float64(9134.26826463974), 'max_wi': np.float64(2.763062600076801), 'min_wi': np.float64(0.426173273401828)}
Estimated reward: 0.075642
Cross-validated error: 0.001049
Final score CI (reward +- 2*error): [0.073544, 0.077740]
Standard error: 0.002914
Final t_dist CI (reward +- t_0.975*se_hat): [0.069929, 0.081355]
[I 2025-11-02 21:58:25,682] Trial 6 finished with value: 0.07354351232363704 and parameters: {'lr': 0.00023576414725853017, 'num_epochs': 6, 'batch_size': 64, 'num_neighbors': 9, 'lr_decay': 0.9475967312442687}. Best is trial 5 with value: 0.07502935898331826.

Trial 7 started


Best trial: 5. Best value: 0.0750294:  40%|████      | 8/20 [06:34<10:11, 50.93s/it]

Train wi info: {'gini': np.float64(0.6004355060636193), 'ess': np.float64(3656.0859576098796), 'max_wi': np.float64(25.677591963976976), 'min_wi': np.float64(0.0200030175954902)}
actual reward: [0.08803418]
{'gini': np.float64(0.569851822944156), 'ess': np.float64(2956.5024908536643), 'max_wi': np.float64(25.677591963976976), 'min_wi': np.float64(0.016014138445824635)}
Estimated reward: 0.069218
Cross-validated error: 0.001642
Final score CI (reward +- 2*error): [0.065934, 0.072501]
Standard error: 0.004847
Final t_dist CI (reward +- t_0.975*se_hat): [0.059717, 0.078718]
[I 2025-11-02 21:59:17,087] Trial 7 finished with value: 0.06593449524016826 and parameters: {'lr': 0.0009434778481879289, 'num_epochs': 8, 'batch_size': 128, 'num_neighbors': 13, 'lr_decay': 0.9798277678195761}. Best is trial 5 with value: 0.07502935898331826.

Trial 8 started


Best trial: 5. Best value: 0.0750294:  45%|████▌     | 9/20 [07:25<09:20, 50.99s/it]

Train wi info: {'gini': np.float64(0.520247765738695), 'ess': np.float64(5511.519782748848), 'max_wi': np.float64(15.137872723246037), 'min_wi': np.float64(0.043394603980027896)}
actual reward: [0.08776466]
{'gini': np.float64(0.4942704720379706), 'ess': np.float64(4166.217670107148), 'max_wi': np.float64(15.137872723246037), 'min_wi': np.float64(0.02978738143375777)}
Estimated reward: 0.072901
Cross-validated error: 0.001362
Final score CI (reward +- 2*error): [0.070177, 0.075625]
Standard error: 0.004596
Final t_dist CI (reward +- t_0.975*se_hat): [0.063891, 0.081911]
[I 2025-11-02 22:00:08,222] Trial 8 finished with value: 0.07017675390469097 and parameters: {'lr': 0.0008416613494500933, 'num_epochs': 5, 'batch_size': 64, 'num_neighbors': 4, 'lr_decay': 0.977478258620087}. Best is trial 5 with value: 0.07502935898331826.

Trial 9 started


Best trial: 9. Best value: 0.0752703:  50%|█████     | 10/20 [08:17<08:33, 51.30s/it]

Train wi info: {'gini': np.float64(0.03103191629686257), 'ess': np.float64(14953.241841961157), 'max_wi': np.float64(1.1926517911973449), 'min_wi': np.float64(0.8575234917790638)}
actual reward: [0.08654243]
{'gini': np.float64(0.029596279119522933), 'ess': np.float64(9971.761762650649), 'max_wi': np.float64(1.1926517911973449), 'min_wi': np.float64(0.8725430419801092)}
Estimated reward: 0.077023
Cross-validated error: 0.000876
Final score CI (reward +- 2*error): [0.075270, 0.078776]
Standard error: 0.003118
Final t_dist CI (reward +- t_0.975*se_hat): [0.070910, 0.083136]
[I 2025-11-02 22:01:00,219] Trial 9 finished with value: 0.07527034086424444 and parameters: {'lr': 0.00011269631289699925, 'num_epochs': 8, 'batch_size': 256, 'num_neighbors': 3, 'lr_decay': 0.8130723706578883}. Best is trial 9 with value: 0.07527034086424444.

Trial 10 started


Best trial: 9. Best value: 0.0752703:  55%|█████▌    | 11/20 [09:09<07:44, 51.58s/it]

Train wi info: {'gini': np.float64(0.031125166716326168), 'ess': np.float64(14952.79771554345), 'max_wi': np.float64(1.2119969483467068), 'min_wi': np.float64(0.8638771070137926)}
actual reward: [0.08654199]
{'gini': np.float64(0.0296791164676478), 'ess': np.float64(9971.531354902154), 'max_wi': np.float64(1.2119969483467068), 'min_wi': np.float64(0.8674124149038795)}
Estimated reward: 0.076604
Cross-validated error: 0.000814
Final score CI (reward +- 2*error): [0.074975, 0.078232]
Standard error: 0.002781
Final t_dist CI (reward +- t_0.975*se_hat): [0.071152, 0.082055]
[I 2025-11-02 22:01:52,437] Trial 10 finished with value: 0.0749747535516183 and parameters: {'lr': 0.00010843918557668589, 'num_epochs': 10, 'batch_size': 256, 'num_neighbors': 11, 'lr_decay': 0.8023682127114253}. Best is trial 9 with value: 0.07527034086424444.

Trial 11 started


Best trial: 9. Best value: 0.0752703:  60%|██████    | 12/20 [09:58<06:46, 50.83s/it]

Train wi info: {'gini': np.float64(0.3862133150570798), 'ess': np.float64(8801.220416510167), 'max_wi': np.float64(7.319585253994588), 'min_wi': np.float64(0.12344140267728011)}
actual reward: [0.08734521]
{'gini': np.float64(0.36715141168252974), 'ess': np.float64(6220.664630701786), 'max_wi': np.float64(7.319585253994588), 'min_wi': np.float64(0.11022590227008665)}
Estimated reward: 0.075193
Cross-validated error: 0.001040
Final score CI (reward +- 2*error): [0.073113, 0.077273]
Standard error: 0.003943
Final t_dist CI (reward +- t_0.975*se_hat): [0.067463, 0.082923]
[I 2025-11-02 22:02:41,561] Trial 11 finished with value: 0.07311331755076031 and parameters: {'lr': 0.005820259627809203, 'num_epochs': 1, 'batch_size': 256, 'num_neighbors': 3, 'lr_decay': 0.9285240095400468}. Best is trial 9 with value: 0.07527034086424444.

Trial 12 started


Best trial: 9. Best value: 0.0752703:  65%|██████▌   | 13/20 [10:49<05:55, 50.74s/it]

Train wi info: {'gini': np.float64(0.034456892664665846), 'ess': np.float64(14942.209025737344), 'max_wi': np.float64(1.2171746239432197), 'min_wi': np.float64(0.8479089403661345)}
actual reward: [0.08654964]
{'gini': np.float64(0.032844390624844906), 'ess': np.float64(9965.144371609562), 'max_wi': np.float64(1.2171746239432197), 'min_wi': np.float64(0.8583966469468761)}
Estimated reward: 0.076802
Cross-validated error: 0.000853
Final score CI (reward +- 2*error): [0.075096, 0.078507]
Standard error: 0.002827
Final t_dist CI (reward +- t_0.975*se_hat): [0.071259, 0.082344]
[I 2025-11-02 22:03:32,098] Trial 12 finished with value: 0.07509642967019285 and parameters: {'lr': 0.00010575674858820635, 'num_epochs': 9, 'batch_size': 256, 'num_neighbors': 8, 'lr_decay': 0.8500962903782668}. Best is trial 9 with value: 0.07527034086424444.

Trial 13 started


Best trial: 9. Best value: 0.0752703:  70%|███████   | 14/20 [11:33<04:51, 48.65s/it]

Train wi info: {'gini': np.float64(0.04086588415005579), 'ess': np.float64(14918.542931140784), 'max_wi': np.float64(1.2659141981462738), 'min_wi': np.float64(0.8228625796467184)}
actual reward: [0.08656452]
{'gini': np.float64(0.03893203182058425), 'ess': np.float64(9950.958123815493), 'max_wi': np.float64(1.2659141981462738), 'min_wi': np.float64(0.8306430115175023)}
Estimated reward: 0.076549
Cross-validated error: 0.000911
Final score CI (reward +- 2*error): [0.074726, 0.078372]
Standard error: 0.002783
Final t_dist CI (reward +- t_0.975*se_hat): [0.071094, 0.082004]
[I 2025-11-02 22:04:15,916] Trial 13 finished with value: 0.07472582265254801 and parameters: {'lr': 0.00012992697536654655, 'num_epochs': 9, 'batch_size': 256, 'num_neighbors': 11, 'lr_decay': 0.8379670639609008}. Best is trial 9 with value: 0.07527034086424444.

Trial 14 started


Best trial: 9. Best value: 0.0752703:  75%|███████▌  | 15/20 [12:23<04:05, 49.17s/it]

Train wi info: {'gini': np.float64(0.575932189277524), 'ess': np.float64(4155.02168995155), 'max_wi': np.float64(22.58354023705615), 'min_wi': np.float64(0.028304506356949848)}
actual reward: [0.08794336]
{'gini': np.float64(0.5464598554731104), 'ess': np.float64(3284.3952710081444), 'max_wi': np.float64(22.58354023705615), 'min_wi': np.float64(0.02178823999724639)}
Estimated reward: 0.068904
Cross-validated error: 0.001681
Final score CI (reward +- 2*error): [0.065542, 0.072265]
Standard error: 0.004760
Final t_dist CI (reward +- t_0.975*se_hat): [0.059573, 0.078235]
[I 2025-11-02 22:05:06,273] Trial 14 finished with value: 0.06554242061670744 and parameters: {'lr': 0.00217082958296085, 'num_epochs': 8, 'batch_size': 256, 'num_neighbors': 7, 'lr_decay': 0.828743460121567}. Best is trial 9 with value: 0.07527034086424444.

Trial 15 started


Best trial: 9. Best value: 0.0752703:  80%|████████  | 16/20 [13:13<03:17, 49.44s/it]

Train wi info: {'gini': np.float64(0.9918042422225802), 'ess': np.float64(23.736435175196778), 'max_wi': np.float64(2263.676497729847), 'min_wi': np.float64(3.2254322246056033e-10)}
actual reward: [0.08466198]
{'gini': np.float64(0.9903478383395476), 'ess': np.float64(73.65426830935213), 'max_wi': np.float64(333.2170854267618), 'min_wi': np.float64(3.782882651822962e-09)}
Estimated reward: 0.068032
Cross-validated error: 0.005727
Final score CI (reward +- 2*error): [0.056579, 0.079485]
Standard error: 0.012409
Final t_dist CI (reward +- t_0.975*se_hat): [0.043708, 0.092356]
[I 2025-11-02 22:05:56,360] Trial 15 finished with value: 0.05657868094344207 and parameters: {'lr': 0.02938423918860678, 'num_epochs': 8, 'batch_size': 512, 'num_neighbors': 11, 'lr_decay': 0.8770942116034263}. Best is trial 9 with value: 0.07527034086424444.

Trial 16 started


Best trial: 9. Best value: 0.0752703:  85%|████████▌ | 17/20 [14:04<02:29, 49.90s/it]

Train wi info: {'gini': np.float64(0.03338849371599091), 'ess': np.float64(14945.629904861675), 'max_wi': np.float64(1.2161692018784247), 'min_wi': np.float64(0.8504218656968617)}
actual reward: [0.08654657]
{'gini': np.float64(0.03180133165755951), 'ess': np.float64(9967.25470997719), 'max_wi': np.float64(1.2161692018784247), 'min_wi': np.float64(0.8583485065210654)}
Estimated reward: 0.077031
Cross-validated error: 0.000983
Final score CI (reward +- 2*error): [0.075064, 0.078998]
Standard error: 0.003119
Final t_dist CI (reward +- t_0.975*se_hat): [0.070917, 0.083145]
[I 2025-11-02 22:06:47,307] Trial 16 finished with value: 0.07506443946329701 and parameters: {'lr': 0.0001055484097291783, 'num_epochs': 10, 'batch_size': 256, 'num_neighbors': 3, 'lr_decay': 0.8280938994445377}. Best is trial 9 with value: 0.07527034086424444.

Trial 17 started


Best trial: 9. Best value: 0.0752703:  90%|█████████ | 18/20 [14:55<01:40, 50.29s/it]

Train wi info: {'gini': np.float64(0.4907023908738779), 'ess': np.float64(5988.785494736139), 'max_wi': np.float64(15.361490174463148), 'min_wi': np.float64(0.05645799244083824)}
actual reward: [0.0876926]
{'gini': np.float64(0.4651837211557492), 'ess': np.float64(4508.614537120008), 'max_wi': np.float64(15.361490174463148), 'min_wi': np.float64(0.042438918786138514)}
Estimated reward: 0.072252
Cross-validated error: 0.001315
Final score CI (reward +- 2*error): [0.069622, 0.074882]
Standard error: 0.004083
Final t_dist CI (reward +- t_0.975*se_hat): [0.064248, 0.080256]
[I 2025-11-02 22:07:38,503] Trial 17 finished with value: 0.06962193319453018 and parameters: {'lr': 0.0015442861762715973, 'num_epochs': 7, 'batch_size': 256, 'num_neighbors': 9, 'lr_decay': 0.9015759910584403}. Best is trial 9 with value: 0.07527034086424444.

Trial 18 started


Best trial: 9. Best value: 0.0752703:  95%|█████████▌| 19/20 [15:46<00:50, 50.46s/it]

Train wi info: {'gini': np.float64(0.08662460758596519), 'ess': np.float64(14631.636290668011), 'max_wi': np.float64(1.6440226130681312), 'min_wi': np.float64(0.6509547233844817)}
actual reward: [0.08666602]
{'gini': np.float64(0.0824681587850956), 'ess': np.float64(9778.51053340677), 'max_wi': np.float64(1.6440226130681312), 'min_wi': np.float64(0.6726316284171544)}
Estimated reward: 0.075997
Cross-validated error: 0.000795
Final score CI (reward +- 2*error): [0.074406, 0.077587]
Standard error: 0.002878
Final t_dist CI (reward +- t_0.975*se_hat): [0.070355, 0.081639]
[I 2025-11-02 22:08:29,355] Trial 18 finished with value: 0.0744060798066481 and parameters: {'lr': 0.00025589434366040563, 'num_epochs': 9, 'batch_size': 256, 'num_neighbors': 7, 'lr_decay': 0.8569639395793818}. Best is trial 9 with value: 0.07527034086424444.

Trial 19 started


Best trial: 9. Best value: 0.0752703: 100%|██████████| 20/20 [16:38<00:00, 49.95s/it]

Train wi info: {'gini': np.float64(0.9705187450561152), 'ess': np.float64(328.84510558226236), 'max_wi': np.float64(94.83859658763726), 'min_wi': np.float64(2.1708930197552057e-06)}
actual reward: [0.09179006]
{'gini': np.float64(0.9603075831133824), 'ess': np.float64(252.50512751695004), 'max_wi': np.float64(106.52646096458814), 'min_wi': np.float64(1.128372166231466e-06)}
Estimated reward: 0.069584
Cross-validated error: 0.004720
Final score CI (reward +- 2*error): [0.060144, 0.079023]
Standard error: 0.015913
Final t_dist CI (reward +- t_0.975*se_hat): [0.038392, 0.100775]
[I 2025-11-02 22:09:21,543] Trial 19 finished with value: 0.06014435308839608 and parameters: {'lr': 0.015414231825338109, 'num_epochs': 9, 'batch_size': 512, 'num_neighbors': 12, 'lr_decay': 0.8148879849976876}. Best is trial 9 with value: 0.07527034086424444.





Num samples is 10000
{'gini': np.float64(0.48388758204171756), 'ess': np.float64(3955.3919197282758), 'max_wi': np.float64(39.46599916552396), 'min_wi': np.float64(0.013699470010061249)}
Eval time: 0.13395237922668457 seconds
Evaluation total results time: 0.45945310592651367 seconds


Unnamed: 0,policy_rewards,ipw,reg_dm,conv_dm,conv_dr,conv_sndr,action_diff_to_real,action_delta,context_diff_to_real,context_delta
0,0.08647581,0.087,0.08702125,0.09039638,0.08857681,0.08857681,0.88083979,0.0,0.74725465,0.0
15000,0.08655371,0.07801639,0.08690984,0.08939873,0.07697208,0.07696609,0.88327901,0.00874626,0.74708647,0.00417677


In [11]:
dataset_params = dict(
                    n_actions= 500,
                    n_users = 500,
                    emb_dim = 16,
                    # sigma = 0.1,
                    eps = 0.6, # this is the epsilon for the noise in the ground truth policy representation
                    ctr = 0.1
                    )

train_dataset = generate_dataset(dataset_params, seed=30000)

Random Item CTR: 0.07069350185865088
Optimal greedy CTR: 0.09999918303816259
Second Best greedy CTR: 0.09988806364453348
Optimal Stochastic CTR: 0.0999509448932121
second Best Stochastic CTR: 0.0863879153702632
Our Initial CTR: 0.08653966603258505


In [None]:
# Run the optimization
df7, best_hyperparams_by_size = trainer_trial(num_runs, num_neighbors, num_rounds_list, train_dataset, batch_size, val_size=10000, n_trials=n_trials_for_optuna, prev_best_params=best_params_to_use)

# Show the performance metrics
df7[['policy_rewards', 'ipw', 'reg_dm', 'conv_dm', 'conv_dr', 'conv_sndr', 'action_diff_to_real', 'action_delta', 'context_diff_to_real', 'context_delta']]

Simulation time for 10000 samples: 0.031176328659057617 seconds
Baseline regression model fit time: 0.0773158073425293 seconds
Baseline neighborhood model fit time: 28.948025465011597 seconds
Num samples is 10000
{'gini': np.float64(0.48828470612959685), 'ess': np.float64(4030.997503021888), 'max_wi': np.float64(26.91439561207676), 'min_wi': np.float64(0.012952797711514308)}
Eval time: 0.13135600090026855 seconds


[I 2025-11-02 22:10:48,484] A new study created in memory with name: no-name-e6f8a2c3-8e38-453c-a7c0-bd241be2de62


Evaluation total results time: 0.46826839447021484 seconds
Simulation time for 25000 samples: 0.05841255187988281 seconds


  0%|          | 0/20 [00:00<?, ?it/s]


Trial 0 started


Best trial: 0. Best value: 0.107449:   5%|▌         | 1/20 [00:45<14:33, 46.00s/it]

Train wi info: {'gini': np.float64(0.988980790854492), 'ess': np.float64(180.27680009282193), 'max_wi': np.float64(264.098491568967), 'min_wi': np.float64(0.0)}
actual reward: [0.08437358]
{'gini': np.float64(0.9946821726377266), 'ess': np.float64(40.70282750449641), 'max_wi': np.float64(931.3444355746129), 'min_wi': np.float64(0.0)}
Estimated reward: 0.132892
Cross-validated error: 0.012721
Final score CI (reward +- 2*error): [0.107449, 0.158334]
Standard error: 0.038127
Final t_dist CI (reward +- t_0.975*se_hat): [0.058155, 0.207628]
[I 2025-11-02 22:11:34,481] Trial 0 finished with value: 0.10744872990421701 and parameters: {'lr': 0.096, 'num_epochs': 5, 'batch_size': 64, 'num_neighbors': 8, 'lr_decay': 0.85}. Best is trial 0 with value: 0.10744872990421701.

Trial 1 started


Best trial: 0. Best value: 0.107449:  10%|█         | 2/20 [01:33<14:00, 46.69s/it]

Train wi info: {'gini': np.float64(0.8365404243901994), 'ess': np.float64(1128.874030946434), 'max_wi': np.float64(114.02907033449958), 'min_wi': np.float64(0.0009032113650532207)}
actual reward: [0.09018413]
{'gini': np.float64(0.8201184048298633), 'ess': np.float64(295.24002557159395), 'max_wi': np.float64(484.69982333574035), 'min_wi': np.float64(0.0009032113650532207)}
Estimated reward: 0.081052
Cross-validated error: 0.003223
Final score CI (reward +- 2*error): [0.074606, 0.087497]
Standard error: 0.012018
Final t_dist CI (reward +- t_0.975*se_hat): [0.057495, 0.104609]
[I 2025-11-02 22:12:21,661] Trial 1 finished with value: 0.07460615361326069 and parameters: {'lr': 0.010131288988147595, 'num_epochs': 1, 'batch_size': 64, 'num_neighbors': 4, 'lr_decay': 0.8218244099936143}. Best is trial 0 with value: 0.10744872990421701.

Trial 2 started


In [None]:
dataset_params = dict(
                    n_actions= 500,
                    n_users = 500,
                    emb_dim = 16,
                    # sigma = 0.1,
                    eps = 0.6, # this is the epsilon for the noise in the ground truth policy representation
                    ctr = 0.1
                    )

train_dataset = generate_dataset(dataset_params, seed=40000)

Random Item CTR: 0.07053370144999074
Optimal greedy CTR: 0.09999936716169436
Second Best greedy CTR: 0.09676800930842865
Optimal Stochastic CTR: 0.09995563088920843
second Best Stochastic CTR: 0.08606322612964991
Our Initial CTR: 0.08622184481781218


In [None]:
# Run the optimization
df8, best_hyperparams_by_size = trainer_trial(num_runs, num_neighbors, num_rounds_list, train_dataset, batch_size, val_size=10000, n_trials=n_trials_for_optuna, prev_best_params=best_params_to_use)

# Show the performance metrics
df8[['policy_rewards', 'ipw', 'reg_dm', 'conv_dm', 'conv_dr', 'conv_sndr', 'action_diff_to_real', 'action_delta', 'context_diff_to_real', 'context_delta']]

Num samples is 10000
{'gini': np.float64(0.48297294790935064), 'ess': np.float64(4206.88542516398), 'max_wi': np.float64(26.68727224829075), 'min_wi': np.float64(0.014262757298096463)}


[I 2025-10-30 10:03:03,724] A new study created in memory with name: no-name-31bed130-c819-466f-b871-4219b0b0ee45
  0%|          | 0/20 [00:00<?, ?it/s]


Trial 0 started


Best trial: 0. Best value: 0.0689699:   5%|▌         | 1/20 [00:42<13:23, 42.27s/it]

Train wi info: {'gini': np.float64(0.5626145283674258), 'ess': np.float64(866.5206756290892), 'max_wi': np.float64(221.1270920127721), 'min_wi': np.float64(1.582570511292046e-08)}
actual reward: [0.07098819]
{'gini': np.float64(0.626748524747854), 'ess': np.float64(309.31416297110025), 'max_wi': np.float64(306.5124985861054), 'min_wi': np.float64(4.203406264184544e-09)}
Estimated reward: 0.075058
Cross-validated error: 0.003044
Final score CI (reward +- 2*error): [0.068970, 0.081146]
Standard error: 0.002512
Final t_dist CI (reward +- t_0.975*se_hat): [0.070135, 0.079982]
[I 2025-10-30 10:03:45,988] Trial 0 finished with value: 0.06896991747601121 and parameters: {'lr': 0.096, 'num_epochs': 5, 'batch_size': 64, 'num_neighbors': 8, 'lr_decay': 0.85}. Best is trial 0 with value: 0.06896991747601121.

Trial 1 started


Best trial: 1. Best value: 0.0779331:  10%|█         | 2/20 [01:33<14:15, 47.53s/it]

Train wi info: {'gini': np.float64(0.018766340773414485), 'ess': np.float64(14982.282509149807), 'max_wi': np.float64(1.2069352774494837), 'min_wi': np.float64(0.8074356924703069)}
actual reward: [0.08620356]
{'gini': np.float64(0.01900316519284223), 'ess': np.float64(9988.162406995285), 'max_wi': np.float64(1.2268891534112225), 'min_wi': np.float64(0.8125276250682063)}
Estimated reward: 0.079787
Cross-validated error: 0.000927
Final score CI (reward +- 2*error): [0.077933, 0.081641]
Standard error: 0.002871
Final t_dist CI (reward +- t_0.975*se_hat): [0.074159, 0.085416]
[I 2025-10-30 10:04:37,197] Trial 1 finished with value: 0.07793305925902404 and parameters: {'lr': 0.0018339184123787446, 'num_epochs': 3, 'batch_size': 64, 'num_neighbors': 12, 'lr_decay': 0.9216182229315077}. Best is trial 1 with value: 0.07793305925902404.

Trial 2 started


Best trial: 1. Best value: 0.0779331:  15%|█▌        | 3/20 [02:22<13:42, 48.40s/it]

Train wi info: {'gini': np.float64(0.32375104665138843), 'ess': np.float64(5799.714703155753), 'max_wi': np.float64(119.1956982182383), 'min_wi': np.float64(1.1416325530196559e-06)}
actual reward: [0.08074397]
{'gini': np.float64(0.3430638233701673), 'ess': np.float64(6269.015165612613), 'max_wi': np.float64(15.039554954784048), 'min_wi': np.float64(5.304374968807524e-07)}
Estimated reward: 0.073047
Cross-validated error: 0.001079
Final score CI (reward +- 2*error): [0.070888, 0.075206]
Standard error: 0.002929
Final t_dist CI (reward +- t_0.975*se_hat): [0.067306, 0.078788]
[I 2025-10-30 10:05:26,631] Trial 2 finished with value: 0.07088820436915118 and parameters: {'lr': 0.0449282439183618, 'num_epochs': 2, 'batch_size': 128, 'num_neighbors': 7, 'lr_decay': 0.9172641129335655}. Best is trial 1 with value: 0.07793305925902404.

Trial 3 started


Best trial: 1. Best value: 0.0779331:  20%|██        | 4/20 [03:13<13:05, 49.07s/it]

Train wi info: {'gini': np.float64(0.011563312659372767), 'ess': np.float64(14993.212063330926), 'max_wi': np.float64(1.1142040913073572), 'min_wi': np.float64(0.830928094326997)}
actual reward: [0.08621564]
{'gini': np.float64(0.012148176026425966), 'ess': np.float64(9995.026667762475), 'max_wi': np.float64(1.114011532014882), 'min_wi': np.float64(0.8212038739208789)}
Estimated reward: 0.078228
Cross-validated error: 0.000855
Final score CI (reward +- 2*error): [0.076517, 0.079939]
Standard error: 0.003075
Final t_dist CI (reward +- t_0.975*se_hat): [0.072200, 0.084256]
[I 2025-10-30 10:06:16,729] Trial 3 finished with value: 0.07651744115656046 and parameters: {'lr': 0.0018877149141972736, 'num_epochs': 3, 'batch_size': 128, 'num_neighbors': 4, 'lr_decay': 0.809225007101649}. Best is trial 1 with value: 0.07793305925902404.

Trial 4 started


Best trial: 1. Best value: 0.0779331:  25%|██▌       | 5/20 [04:04<12:28, 49.92s/it]

Train wi info: {'gini': np.float64(0.0006222208921782702), 'ess': np.float64(14999.980866977814), 'max_wi': np.float64(1.0051049666629475), 'min_wi': np.float64(0.9940680281894182)}
actual reward: [0.08622227]
{'gini': np.float64(0.0006552235635649682), 'ess': np.float64(9999.985962194687), 'max_wi': np.float64(1.0065801422412468), 'min_wi': np.float64(0.9939350390130208)}
Estimated reward: 0.078244
Cross-validated error: 0.000728
Final score CI (reward +- 2*error): [0.076788, 0.079701]
Standard error: 0.002844
Final t_dist CI (reward +- t_0.975*se_hat): [0.072669, 0.083819]
[I 2025-10-30 10:07:08,168] Trial 4 finished with value: 0.07678789612382773 and parameters: {'lr': 0.00014814191662014246, 'num_epochs': 4, 'batch_size': 128, 'num_neighbors': 13, 'lr_decay': 0.8078678072677036}. Best is trial 1 with value: 0.07793305925902404.

Trial 5 started


Best trial: 1. Best value: 0.0779331:  30%|███       | 6/20 [04:55<11:43, 50.27s/it]

Train wi info: {'gini': np.float64(0.012151502301542327), 'ess': np.float64(14992.6233781622), 'max_wi': np.float64(1.1464692374958272), 'min_wi': np.float64(0.8706424430144234)}
actual reward: [0.08620678]
{'gini': np.float64(0.012592470163084606), 'ess': np.float64(9994.757192461035), 'max_wi': np.float64(1.1210597339392145), 'min_wi': np.float64(0.8756869040073209)}
Estimated reward: 0.078737
Cross-validated error: 0.000835
Final score CI (reward +- 2*error): [0.077067, 0.080406]
Standard error: 0.002840
Final t_dist CI (reward +- t_0.975*se_hat): [0.073169, 0.084304]
[I 2025-10-30 10:07:59,120] Trial 5 finished with value: 0.07706744210318989 and parameters: {'lr': 0.0014491610922470572, 'num_epochs': 3, 'batch_size': 64, 'num_neighbors': 13, 'lr_decay': 0.8278407899863048}. Best is trial 1 with value: 0.07793305925902404.

Trial 6 started


Best trial: 1. Best value: 0.0779331:  35%|███▌      | 7/20 [05:47<10:59, 50.71s/it]

Train wi info: {'gini': np.float64(0.40695570352736987), 'ess': np.float64(3255.168195627654), 'max_wi': np.float64(132.4251112624082), 'min_wi': np.float64(5.005070496808405e-06)}
actual reward: [0.07439395]
{'gini': np.float64(0.45039479864510223), 'ess': np.float64(4085.735493854486), 'max_wi': np.float64(37.07828027151728), 'min_wi': np.float64(2.5599702605936796e-06)}
Estimated reward: 0.076120
Cross-validated error: 0.001544
Final score CI (reward +- 2*error): [0.073032, 0.079208]
Standard error: 0.003211
Final t_dist CI (reward +- t_0.975*se_hat): [0.069826, 0.082414]
[I 2025-10-30 10:08:50,728] Trial 6 finished with value: 0.0730318105653954 and parameters: {'lr': 0.06403116375904482, 'num_epochs': 5, 'batch_size': 128, 'num_neighbors': 5, 'lr_decay': 0.9500958312039237}. Best is trial 1 with value: 0.07793305925902404.

Trial 7 started


Best trial: 1. Best value: 0.0779331:  40%|████      | 8/20 [06:37<10:09, 50.77s/it]

Train wi info: {'gini': np.float64(0.39933346329196445), 'ess': np.float64(7397.9707588671345), 'max_wi': np.float64(35.862372699362), 'min_wi': np.float64(3.1544657966428245e-08)}
actual reward: [0.07773382]
{'gini': np.float64(0.44104817916861105), 'ess': np.float64(4437.693643338176), 'max_wi': np.float64(31.43531943609729), 'min_wi': np.float64(1.9713591455718733e-07)}
Estimated reward: 0.074148
Cross-validated error: 0.001277
Final score CI (reward +- 2*error): [0.071594, 0.076703]
Standard error: 0.002978
Final t_dist CI (reward +- t_0.975*se_hat): [0.068310, 0.079986]
[I 2025-10-30 10:09:41,617] Trial 7 finished with value: 0.07159395647726953 and parameters: {'lr': 0.04664894618595769, 'num_epochs': 2, 'batch_size': 64, 'num_neighbors': 15, 'lr_decay': 0.8266725453103071}. Best is trial 1 with value: 0.07793305925902404.

Trial 8 started


Best trial: 1. Best value: 0.0779331:  45%|████▌     | 9/20 [07:29<09:22, 51.14s/it]

Train wi info: {'gini': np.float64(0.0217850185539466), 'ess': np.float64(14972.345434671932), 'max_wi': np.float64(1.3861994951419545), 'min_wi': np.float64(0.5109598834018069)}
actual reward: [0.08615332]
{'gini': np.float64(0.024924158561347394), 'ess': np.float64(9976.075474114255), 'max_wi': np.float64(1.2406972675537988), 'min_wi': np.float64(0.5171485820908754)}
Estimated reward: 0.078788
Cross-validated error: 0.000961
Final score CI (reward +- 2*error): [0.076865, 0.080710]
Standard error: 0.002849
Final t_dist CI (reward +- t_0.975*se_hat): [0.073202, 0.084373]
[I 2025-10-30 10:10:33,569] Trial 8 finished with value: 0.07686497027009669 and parameters: {'lr': 0.0032008520474201504, 'num_epochs': 8, 'batch_size': 128, 'num_neighbors': 14, 'lr_decay': 0.9598088419933585}. Best is trial 1 with value: 0.07793305925902404.

Trial 9 started


Best trial: 1. Best value: 0.0779331:  50%|█████     | 10/20 [08:20<08:28, 50.87s/it]

Train wi info: {'gini': np.float64(0.09586548054924315), 'ess': np.float64(14537.362407828445), 'max_wi': np.float64(2.8363202513715766), 'min_wi': np.float64(0.14228855074262123)}
actual reward: [0.08580252]
{'gini': np.float64(0.09734243785274227), 'ess': np.float64(9690.493371222676), 'max_wi': np.float64(2.247399517413616), 'min_wi': np.float64(0.19777471880401368)}
Estimated reward: 0.077474
Cross-validated error: 0.000824
Final score CI (reward +- 2*error): [0.075826, 0.079121]
Standard error: 0.002861
Final t_dist CI (reward +- t_0.975*se_hat): [0.071866, 0.083081]
[I 2025-10-30 10:11:23,844] Trial 9 finished with value: 0.07582636511786038 and parameters: {'lr': 0.021271523926813765, 'num_epochs': 2, 'batch_size': 512, 'num_neighbors': 13, 'lr_decay': 0.8636822967440527}. Best is trial 1 with value: 0.07793305925902404.

Trial 10 started


Best trial: 1. Best value: 0.0779331:  55%|█████▌    | 11/20 [09:11<07:40, 51.17s/it]

Train wi info: {'gini': np.float64(0.0019627694257448687), 'ess': np.float64(14999.809611828372), 'max_wi': np.float64(1.020773561757272), 'min_wi': np.float64(0.9812329609566296)}
actual reward: [0.08622176]
{'gini': np.float64(0.0020287126826277987), 'ess': np.float64(9999.866847678757), 'max_wi': np.float64(1.0153706688954136), 'min_wi': np.float64(0.9818212939663759)}
Estimated reward: 0.077637
Cross-validated error: 0.000932
Final score CI (reward +- 2*error): [0.075773, 0.079501]
Standard error: 0.002880
Final t_dist CI (reward +- t_0.975*se_hat): [0.071992, 0.083282]
[I 2025-10-30 10:12:15,689] Trial 10 finished with value: 0.07577259151691519 and parameters: {'lr': 0.00033504309700883747, 'num_epochs': 8, 'batch_size': 256, 'num_neighbors': 10, 'lr_decay': 0.9964712394063868}. Best is trial 1 with value: 0.07793305925902404.

Trial 11 started


Best trial: 1. Best value: 0.0779331:  60%|██████    | 12/20 [10:02<06:48, 51.11s/it]

Train wi info: {'gini': np.float64(0.018772009026133643), 'ess': np.float64(14982.445983005202), 'max_wi': np.float64(1.1785836584967377), 'min_wi': np.float64(0.837031458981825)}
actual reward: [0.08622484]
{'gini': np.float64(0.0180209140464448), 'ess': np.float64(9989.460925285184), 'max_wi': np.float64(1.161966751297773), 'min_wi': np.float64(0.8566546474972128)}
Estimated reward: 0.079208
Cross-validated error: 0.000905
Final score CI (reward +- 2*error): [0.077397, 0.081019]
Standard error: 0.002871
Final t_dist CI (reward +- t_0.975*se_hat): [0.073580, 0.084835]
[I 2025-10-30 10:13:06,652] Trial 11 finished with value: 0.07739693345865614 and parameters: {'lr': 0.0014000935534255394, 'num_epochs': 1, 'batch_size': 64, 'num_neighbors': 11, 'lr_decay': 0.8926129062252365}. Best is trial 1 with value: 0.07793305925902404.

Trial 12 started


Best trial: 1. Best value: 0.0779331:  65%|██████▌   | 13/20 [10:54<05:57, 51.13s/it]

Train wi info: {'gini': np.float64(0.10262833920792347), 'ess': np.float64(14474.476444571845), 'max_wi': np.float64(2.4144513537854753), 'min_wi': np.float64(0.28555150715592453)}
actual reward: [0.08599942]
{'gini': np.float64(0.09742599294954257), 'ess': np.float64(9689.426058444275), 'max_wi': np.float64(2.4380624197584977), 'min_wi': np.float64(0.2983470056902843)}
Estimated reward: 0.078493
Cross-validated error: 0.000903
Final score CI (reward +- 2*error): [0.076686, 0.080300]
Standard error: 0.002922
Final t_dist CI (reward +- t_0.975*se_hat): [0.072766, 0.084220]
[I 2025-10-30 10:13:57,823] Trial 12 finished with value: 0.07668647031591758 and parameters: {'lr': 0.0077601605769491416, 'num_epochs': 1, 'batch_size': 64, 'num_neighbors': 10, 'lr_decay': 0.8951672541814738}. Best is trial 1 with value: 0.07793305925902404.

Trial 13 started


Best trial: 1. Best value: 0.0779331:  70%|███████   | 14/20 [11:48<05:12, 52.10s/it]

Train wi info: {'gini': np.float64(0.0026466025711986483), 'ess': np.float64(14999.636140652603), 'max_wi': np.float64(1.0328334249683202), 'min_wi': np.float64(0.9589782006094174)}
actual reward: [0.0862219]
{'gini': np.float64(0.003307434605092946), 'ess': np.float64(9999.629569526845), 'max_wi': np.float64(1.0307008997571439), 'min_wi': np.float64(0.9539960633267828)}
Estimated reward: 0.079300
Cross-validated error: 0.000794
Final score CI (reward +- 2*error): [0.077711, 0.080889]
Standard error: 0.002877
Final t_dist CI (reward +- t_0.975*se_hat): [0.073659, 0.084940]
[I 2025-10-30 10:14:52,180] Trial 13 finished with value: 0.07771065130905619 and parameters: {'lr': 0.0005227210268073445, 'num_epochs': 10, 'batch_size': 64, 'num_neighbors': 11, 'lr_decay': 0.8988040535510764}. Best is trial 1 with value: 0.07793305925902404.

Trial 14 started


Best trial: 14. Best value: 0.0779466:  75%|███████▌  | 15/20 [12:39<04:19, 51.91s/it]

Train wi info: {'gini': np.float64(0.0014595416933363967), 'ess': np.float64(14999.892521908634), 'max_wi': np.float64(1.0188983649464713), 'min_wi': np.float64(0.9748367082330842)}
actual reward: [0.0862227]
{'gini': np.float64(0.0016928516551467598), 'ess': np.float64(9999.904705145384), 'max_wi': np.float64(1.0174573157869282), 'min_wi': np.float64(0.9816564161186769)}
Estimated reward: 0.079387
Cross-validated error: 0.000720
Final score CI (reward +- 2*error): [0.077947, 0.080828]
Standard error: 0.002880
Final t_dist CI (reward +- t_0.975*se_hat): [0.073741, 0.085033]
[I 2025-10-30 10:15:43,654] Trial 14 finished with value: 0.07794662480323412 and parameters: {'lr': 0.00047362075403384926, 'num_epochs': 10, 'batch_size': 256, 'num_neighbors': 11, 'lr_decay': 0.9279765929746707}. Best is trial 14 with value: 0.07794662480323412.

Trial 15 started


Best trial: 14. Best value: 0.0779466:  80%|████████  | 16/20 [13:30<03:26, 51.59s/it]

Train wi info: {'gini': np.float64(0.0026064096612951086), 'ess': np.float64(14999.661968784947), 'max_wi': np.float64(1.0346385918812864), 'min_wi': np.float64(0.9696524980337196)}
actual reward: [0.08621898]
{'gini': np.float64(0.0028485006122293266), 'ess': np.float64(9999.737930586973), 'max_wi': np.float64(1.0224725118111304), 'min_wi': np.float64(0.9730137107820415)}
Estimated reward: 0.078683
Cross-validated error: 0.000889
Final score CI (reward +- 2*error): [0.076906, 0.080460]
Standard error: 0.002944
Final t_dist CI (reward +- t_0.975*se_hat): [0.072913, 0.084453]
[I 2025-10-30 10:16:34,490] Trial 15 finished with value: 0.07690553315631501 and parameters: {'lr': 0.0006365287803347342, 'num_epochs': 7, 'batch_size': 256, 'num_neighbors': 7, 'lr_decay': 0.9311423018676094}. Best is trial 14 with value: 0.07794662480323412.

Trial 16 started


Best trial: 14. Best value: 0.0779466:  85%|████████▌ | 17/20 [14:21<02:34, 51.37s/it]

Train wi info: {'gini': np.float64(0.0005369485713765359), 'ess': np.float64(14999.985778107408), 'max_wi': np.float64(1.005005348323797), 'min_wi': np.float64(0.9945239409279155)}
actual reward: [0.08622223]
{'gini': np.float64(0.0005740540838746651), 'ess': np.float64(9999.98926652536), 'max_wi': np.float64(1.0048694185759686), 'min_wi': np.float64(0.9945984195517271)}
Estimated reward: 0.079367
Cross-validated error: 0.000942
Final score CI (reward +- 2*error): [0.077482, 0.081252]
Standard error: 0.002866
Final t_dist CI (reward +- t_0.975*se_hat): [0.073748, 0.084985]
[I 2025-10-30 10:17:25,342] Trial 16 finished with value: 0.07748167382865498 and parameters: {'lr': 0.00013024500822024718, 'num_epochs': 10, 'batch_size': 256, 'num_neighbors': 12, 'lr_decay': 0.9747602591616251}. Best is trial 14 with value: 0.07794662480323412.

Trial 17 started


Best trial: 14. Best value: 0.0779466:  90%|█████████ | 18/20 [15:12<01:42, 51.20s/it]

Train wi info: {'gini': np.float64(0.020527302557703537), 'ess': np.float64(14978.084466093875), 'max_wi': np.float64(1.20851713962907), 'min_wi': np.float64(0.655360585919064)}
actual reward: [0.08617091]
{'gini': np.float64(0.02317122595308122), 'ess': np.float64(9981.50562361515), 'max_wi': np.float64(1.1939141703560845), 'min_wi': np.float64(0.6474505997874405)}
Estimated reward: 0.077046
Cross-validated error: 0.000885
Final score CI (reward +- 2*error): [0.075275, 0.078817]
Standard error: 0.002887
Final t_dist CI (reward +- t_0.975*se_hat): [0.071387, 0.082705]
[I 2025-10-30 10:18:16,164] Trial 17 finished with value: 0.07527519640613758 and parameters: {'lr': 0.006071807939305066, 'num_epochs': 7, 'batch_size': 512, 'num_neighbors': 9, 'lr_decay': 0.9233462181577409}. Best is trial 14 with value: 0.07794662480323412.

Trial 18 started


Best trial: 14. Best value: 0.0779466:  95%|█████████▌| 19/20 [16:03<00:51, 51.02s/it]

Train wi info: {'gini': np.float64(0.0007888421843980483), 'ess': np.float64(14999.969367799215), 'max_wi': np.float64(1.0071680557999327), 'min_wi': np.float64(0.9926170286304189)}
actual reward: [0.08622169]
{'gini': np.float64(0.000857575285339545), 'ess': np.float64(9999.976165835476), 'max_wi': np.float64(1.0061927260081673), 'min_wi': np.float64(0.9924287481271912)}
Estimated reward: 0.078685
Cross-validated error: 0.000882
Final score CI (reward +- 2*error): [0.076921, 0.080449]
Standard error: 0.002837
Final t_dist CI (reward +- t_0.975*se_hat): [0.073124, 0.084245]
[I 2025-10-30 10:19:06,760] Trial 18 finished with value: 0.0769209633382369 and parameters: {'lr': 0.0002752642380955787, 'num_epochs': 6, 'batch_size': 256, 'num_neighbors': 15, 'lr_decay': 0.8732271743681641}. Best is trial 14 with value: 0.07794662480323412.

Trial 19 started


Best trial: 14. Best value: 0.0779466: 100%|██████████| 20/20 [16:50<00:00, 50.55s/it]

Train wi info: {'gini': np.float64(0.0034579803629473343), 'ess': np.float64(14999.40919634976), 'max_wi': np.float64(1.0320677621018406), 'min_wi': np.float64(0.9626869805921305)}
actual reward: [0.0862182]
{'gini': np.float64(0.003892809550713829), 'ess': np.float64(9999.503779316083), 'max_wi': np.float64(1.0294673562624606), 'min_wi': np.float64(0.967206852483284)}
Estimated reward: 0.078547
Cross-validated error: 0.000974
Final score CI (reward +- 2*error): [0.076600, 0.080494]
Standard error: 0.002874
Final t_dist CI (reward +- t_0.975*se_hat): [0.072913, 0.084182]
[I 2025-10-30 10:19:54,628] Trial 19 finished with value: 0.07660004628349852 and parameters: {'lr': 0.0008804230446990676, 'num_epochs': 9, 'batch_size': 256, 'num_neighbors': 11, 'lr_decay': 0.9411507255309078}. Best is trial 14 with value: 0.07794662480323412.





Num samples is 10000
{'gini': np.float64(0.4668149495753226), 'ess': np.float64(4672.823503627641), 'max_wi': np.float64(16.962707883902738), 'min_wi': np.float64(0.014015010024919842)}


Unnamed: 0,policy_rewards,ipw,reg_dm,conv_dm,conv_dr,conv_sndr,action_diff_to_real,action_delta,context_diff_to_real,context_delta
0,0.08622184,0.0892,0.08927464,0.09322973,0.08955775,0.08955775,0.92210476,0.0,0.83772226,0.0
15000,0.08622244,0.07799825,0.08640752,0.08976593,0.07853542,0.07853573,0.92212387,0.00159145,0.83767829,0.00168795


In [None]:
dataset_params = dict(
                    n_actions= 500,
                    n_users = 500,
                    emb_dim = 16,
                    # sigma = 0.1,
                    eps = 0.6, # this is the epsilon for the noise in the ground truth policy representation
                    ctr = 0.1
                    )

train_dataset = generate_dataset(dataset_params, seed=50000)

Random Item CTR: 0.0705882181025533
Optimal greedy CTR: 0.09999934164533562
Second Best greedy CTR: 0.09924496289352924
Optimal Stochastic CTR: 0.09995498601895662
second Best Stochastic CTR: 0.08629000824986369
Our Initial CTR: 0.08647501952799874


In [None]:
# Run the optimization
df9, best_hyperparams_by_size = trainer_trial(num_runs, num_neighbors, num_rounds_list, train_dataset, batch_size, val_size=10000, n_trials=n_trials_for_optuna, prev_best_params=best_params_to_use)

# Show the performance metrics
df9[['policy_rewards', 'ipw', 'reg_dm', 'conv_dm', 'conv_dr', 'conv_sndr', 'action_diff_to_real', 'action_delta', 'context_diff_to_real', 'context_delta']]

Num samples is 10000
{'gini': np.float64(0.48655661699878366), 'ess': np.float64(4239.612735801623), 'max_wi': np.float64(18.4166650696614), 'min_wi': np.float64(0.019148344088431003)}


[I 2025-10-30 10:21:07,302] A new study created in memory with name: no-name-24f4bb67-00af-4afd-b9a6-22d92411903a
  0%|          | 0/20 [00:00<?, ?it/s]


Trial 0 started


Best trial: 0. Best value: 0.0649131:   5%|▌         | 1/20 [00:40<12:48, 40.46s/it]

Train wi info: {'gini': np.float64(0.5758156274969405), 'ess': np.float64(582.4190650954737), 'max_wi': np.float64(278.28867653625656), 'min_wi': np.float64(6.081468907580241e-05)}
actual reward: [0.07089183]
{'gini': np.float64(0.6483447842288079), 'ess': np.float64(114.6669352290496), 'max_wi': np.float64(581.304013527663), 'min_wi': np.float64(0.00011427346244316298)}
Estimated reward: 0.070481
Cross-validated error: 0.002784
Final score CI (reward +- 2*error): [0.064913, 0.076050]
Standard error: 0.002412
Final t_dist CI (reward +- t_0.975*se_hat): [0.065754, 0.075209]
[I 2025-10-30 10:21:47,756] Trial 0 finished with value: 0.06491312135708051 and parameters: {'lr': 0.096, 'num_epochs': 5, 'batch_size': 64, 'num_neighbors': 8, 'lr_decay': 0.85}. Best is trial 0 with value: 0.06491312135708051.

Trial 1 started


Best trial: 1. Best value: 0.0774171:  10%|█         | 2/20 [01:24<12:45, 42.55s/it]

Train wi info: {'gini': np.float64(0.0014155532577596007), 'ess': np.float64(14999.900881040963), 'max_wi': np.float64(1.0137838130806598), 'min_wi': np.float64(0.9870197700691672)}
actual reward: [0.08647558]
{'gini': np.float64(0.001689587765181684), 'ess': np.float64(9999.906133898925), 'max_wi': np.float64(1.0218538196079832), 'min_wi': np.float64(0.9866714188162989)}
Estimated reward: 0.079189
Cross-validated error: 0.000886
Final score CI (reward +- 2*error): [0.077417, 0.080961]
Standard error: 0.003120
Final t_dist CI (reward +- t_0.975*se_hat): [0.073074, 0.085305]
[I 2025-10-30 10:22:31,779] Trial 1 finished with value: 0.07741714283561792 and parameters: {'lr': 0.000491251440654244, 'num_epochs': 10, 'batch_size': 256, 'num_neighbors': 4, 'lr_decay': 0.9220919932661721}. Best is trial 1 with value: 0.07741714283561792.

Trial 2 started


Best trial: 2. Best value: 0.0791962:  15%|█▌        | 3/20 [02:05<11:52, 41.92s/it]

Train wi info: {'gini': np.float64(0.007698173406693068), 'ess': np.float64(14996.983988644142), 'max_wi': np.float64(1.082571753103216), 'min_wi': np.float64(0.8990063295514424)}
actual reward: [0.08646463]
{'gini': np.float64(0.009236280540994636), 'ess': np.float64(9997.13329609779), 'max_wi': np.float64(1.0818110461319685), 'min_wi': np.float64(0.8799961256053783)}
Estimated reward: 0.080836
Cross-validated error: 0.000820
Final score CI (reward +- 2*error): [0.079196, 0.082477]
Standard error: 0.002930
Final t_dist CI (reward +- t_0.975*se_hat): [0.075094, 0.086579]
[I 2025-10-30 10:23:12,943] Trial 2 finished with value: 0.0791962370252024 and parameters: {'lr': 0.0016873700874667644, 'num_epochs': 8, 'batch_size': 128, 'num_neighbors': 11, 'lr_decay': 0.8999672784116262}. Best is trial 2 with value: 0.0791962370252024.

Trial 3 started


Best trial: 2. Best value: 0.0791962:  20%|██        | 4/20 [02:55<12:01, 45.08s/it]

Train wi info: {'gini': np.float64(0.003843090528717715), 'ess': np.float64(14999.276370153977), 'max_wi': np.float64(1.0311499324634943), 'min_wi': np.float64(0.9656979957752422)}
actual reward: [0.08647506]
{'gini': np.float64(0.003965662205604547), 'ess': np.float64(9999.491545686162), 'max_wi': np.float64(1.0298739204279677), 'min_wi': np.float64(0.9681454136812789)}
Estimated reward: 0.078191
Cross-validated error: 0.000950
Final score CI (reward +- 2*error): [0.076292, 0.080090]
Standard error: 0.003064
Final t_dist CI (reward +- t_0.975*se_hat): [0.072184, 0.084198]
[I 2025-10-30 10:24:02,865] Trial 3 finished with value: 0.07629177262419132 and parameters: {'lr': 0.0006161740657775684, 'num_epochs': 7, 'batch_size': 256, 'num_neighbors': 5, 'lr_decay': 0.9931771034141167}. Best is trial 2 with value: 0.0791962370252024.

Trial 4 started


Best trial: 2. Best value: 0.0791962:  25%|██▌       | 5/20 [03:45<11:41, 46.78s/it]

Train wi info: {'gini': np.float64(0.00022619083912245143), 'ess': np.float64(14999.997481421155), 'max_wi': np.float64(1.00210857517366), 'min_wi': np.float64(0.9977062518107375)}
actual reward: [0.08647539]
{'gini': np.float64(0.0002892467174024528), 'ess': np.float64(9999.997275084976), 'max_wi': np.float64(1.0024527840391315), 'min_wi': np.float64(0.9974200622050627)}
Estimated reward: 0.079554
Cross-validated error: 0.000949
Final score CI (reward +- 2*error): [0.077655, 0.081452]
Standard error: 0.002929
Final t_dist CI (reward +- t_0.975*se_hat): [0.073812, 0.085295]
[I 2025-10-30 10:24:52,668] Trial 4 finished with value: 0.07765546177915458 and parameters: {'lr': 0.0002310396707104062, 'num_epochs': 8, 'batch_size': 512, 'num_neighbors': 10, 'lr_decay': 0.8183544076964042}. Best is trial 2 with value: 0.0791962370252024.

Trial 5 started


Best trial: 2. Best value: 0.0791962:  30%|███       | 6/20 [04:35<11:10, 47.92s/it]

Train wi info: {'gini': np.float64(0.03281281848089148), 'ess': np.float64(14903.201602329296), 'max_wi': np.float64(1.5754615638512335), 'min_wi': np.float64(0.04912627370426946)}
actual reward: [0.08598708]
{'gini': np.float64(0.05450500081226987), 'ess': np.float64(9874.874951244965), 'max_wi': np.float64(1.5716692075936933), 'min_wi': np.float64(0.030187536899554872)}
Estimated reward: 0.078586
Cross-validated error: 0.000967
Final score CI (reward +- 2*error): [0.076653, 0.080519]
Standard error: 0.002997
Final t_dist CI (reward +- t_0.975*se_hat): [0.072711, 0.084461]
[I 2025-10-30 10:25:42,785] Trial 5 finished with value: 0.0766526210132884 and parameters: {'lr': 0.024520480148203926, 'num_epochs': 10, 'batch_size': 512, 'num_neighbors': 7, 'lr_decay': 0.8067765878496744}. Best is trial 2 with value: 0.0791962370252024.

Trial 6 started


Best trial: 2. Best value: 0.0791962:  35%|███▌      | 7/20 [05:18<10:03, 46.41s/it]

Train wi info: {'gini': np.float64(0.06369988768024623), 'ess': np.float64(14790.426247465986), 'max_wi': np.float64(1.7326713618155203), 'min_wi': np.float64(0.17611687918779717)}
actual reward: [0.08612305]
{'gini': np.float64(0.06906853705468191), 'ess': np.float64(9839.03140066994), 'max_wi': np.float64(1.7481121863717228), 'min_wi': np.float64(0.12261051265537525)}
Estimated reward: 0.078528
Cross-validated error: 0.000770
Final score CI (reward +- 2*error): [0.076988, 0.080068]
Standard error: 0.002942
Final t_dist CI (reward +- t_0.975*se_hat): [0.072761, 0.084295]
[I 2025-10-30 10:26:26,094] Trial 6 finished with value: 0.07698837413890192 and parameters: {'lr': 0.014608711346120228, 'num_epochs': 6, 'batch_size': 512, 'num_neighbors': 9, 'lr_decay': 0.9529268738561352}. Best is trial 2 with value: 0.0791962370252024.

Trial 7 started


Best trial: 2. Best value: 0.0791962:  40%|████      | 8/20 [06:08<09:28, 47.35s/it]

Train wi info: {'gini': np.float64(0.44054048698808174), 'ess': np.float64(2995.6807600320612), 'max_wi': np.float64(106.93973870308403), 'min_wi': np.float64(0.0005737130891935812)}
actual reward: [0.08201236]
{'gini': np.float64(0.4183623795451703), 'ess': np.float64(2886.1547492449336), 'max_wi': np.float64(106.11498487585041), 'min_wi': np.float64(0.0008577074577815674)}
Estimated reward: 0.073387
Cross-validated error: 0.001357
Final score CI (reward +- 2*error): [0.070673, 0.076100]
Standard error: 0.002966
Final t_dist CI (reward +- t_0.975*se_hat): [0.067574, 0.079200]
[I 2025-10-30 10:27:15,471] Trial 7 finished with value: 0.07067278333625031 and parameters: {'lr': 0.03874674842088091, 'num_epochs': 1, 'batch_size': 64, 'num_neighbors': 8, 'lr_decay': 0.9608232173513035}. Best is trial 2 with value: 0.0791962370252024.

Trial 8 started


Best trial: 2. Best value: 0.0791962:  45%|████▌     | 9/20 [06:59<08:52, 48.45s/it]

Train wi info: {'gini': np.float64(0.11431720844537446), 'ess': np.float64(14271.072669915493), 'max_wi': np.float64(4.681452676155234), 'min_wi': np.float64(0.00037834576660828817)}
actual reward: [0.08401748]
{'gini': np.float64(0.16499316257365024), 'ess': np.float64(9096.25763866748), 'max_wi': np.float64(3.9256730411733973), 'min_wi': np.float64(0.0006381934983722461)}
Estimated reward: 0.075799
Cross-validated error: 0.000861
Final score CI (reward +- 2*error): [0.074077, 0.077522]
Standard error: 0.003076
Final t_dist CI (reward +- t_0.975*se_hat): [0.069770, 0.081829]
[I 2025-10-30 10:28:06,342] Trial 8 finished with value: 0.07407672083199793 and parameters: {'lr': 0.044242980694741044, 'num_epochs': 10, 'batch_size': 512, 'num_neighbors': 5, 'lr_decay': 0.9064610264347351}. Best is trial 2 with value: 0.0791962370252024.

Trial 9 started


Best trial: 2. Best value: 0.0791962:  50%|█████     | 10/20 [07:41<07:45, 46.55s/it]

Train wi info: {'gini': np.float64(0.29404769869237174), 'ess': np.float64(10790.957961439346), 'max_wi': np.float64(10.549487147501296), 'min_wi': np.float64(0.0006604007709820701)}
actual reward: [0.08236919]
{'gini': np.float64(0.3055429655197705), 'ess': np.float64(7158.519603455943), 'max_wi': np.float64(11.675602705072762), 'min_wi': np.float64(0.00013407708386028194)}
Estimated reward: 0.076237
Cross-validated error: 0.001062
Final score CI (reward +- 2*error): [0.074113, 0.078360]
Standard error: 0.003205
Final t_dist CI (reward +- t_0.975*se_hat): [0.069954, 0.082520]
[I 2025-10-30 10:28:48,619] Trial 9 finished with value: 0.07411317567428873 and parameters: {'lr': 0.07223663462898158, 'num_epochs': 2, 'batch_size': 512, 'num_neighbors': 4, 'lr_decay': 0.8196091595748776}. Best is trial 2 with value: 0.0791962370252024.

Trial 10 started


Best trial: 2. Best value: 0.0791962:  55%|█████▌    | 11/20 [08:32<07:11, 47.95s/it]

Train wi info: {'gini': np.float64(0.026055723894313623), 'ess': np.float64(14965.277653764617), 'max_wi': np.float64(1.379156926269588), 'min_wi': np.float64(0.691954467718099)}
actual reward: [0.086399]
{'gini': np.float64(0.028095833670848624), 'ess': np.float64(9973.3005750225), 'max_wi': np.float64(1.310975997999493), 'min_wi': np.float64(0.6191759123916923)}
Estimated reward: 0.077752
Cross-validated error: 0.000890
Final score CI (reward +- 2*error): [0.075971, 0.079533]
Standard error: 0.002849
Final t_dist CI (reward +- t_0.975*se_hat): [0.072166, 0.083337]
[I 2025-10-30 10:29:39,735] Trial 10 finished with value: 0.07597068391944893 and parameters: {'lr': 0.004502719915588924, 'num_epochs': 4, 'batch_size': 128, 'num_neighbors': 15, 'lr_decay': 0.8622497012874704}. Best is trial 2 with value: 0.0791962370252024.

Trial 11 started


Best trial: 2. Best value: 0.0791962:  60%|██████    | 12/20 [09:24<06:34, 49.31s/it]

Train wi info: {'gini': np.float64(0.000288757457238771), 'ess': np.float64(14999.995920458176), 'max_wi': np.float64(1.0023861899911692), 'min_wi': np.float64(0.9973922758011496)}
actual reward: [0.08647592]
{'gini': np.float64(0.00033804967375848985), 'ess': np.float64(9999.99630880492), 'max_wi': np.float64(1.0029666576029908), 'min_wi': np.float64(0.9972222090094722)}
Estimated reward: 0.078962
Cross-validated error: 0.000798
Final score CI (reward +- 2*error): [0.077365, 0.080559]
Standard error: 0.002888
Final t_dist CI (reward +- t_0.975*se_hat): [0.073300, 0.084624]
[I 2025-10-30 10:30:32,165] Trial 11 finished with value: 0.07736547916195939 and parameters: {'lr': 0.000111247961464746, 'num_epochs': 8, 'batch_size': 128, 'num_neighbors': 13, 'lr_decay': 0.8646667164986167}. Best is trial 2 with value: 0.0791962370252024.

Trial 12 started


Best trial: 2. Best value: 0.0791962:  65%|██████▌   | 13/20 [10:16<05:49, 49.90s/it]

Train wi info: {'gini': np.float64(0.002373572420186302), 'ess': np.float64(14999.720126281067), 'max_wi': np.float64(1.02881302294175), 'min_wi': np.float64(0.9764436492042774)}
actual reward: [0.0864741]
{'gini': np.float64(0.0028177415593053793), 'ess': np.float64(9999.73929687947), 'max_wi': np.float64(1.0234287085095342), 'min_wi': np.float64(0.9771797893900491)}
Estimated reward: 0.079707
Cross-validated error: 0.000780
Final score CI (reward +- 2*error): [0.078148, 0.081267]
Standard error: 0.002917
Final t_dist CI (reward +- t_0.975*se_hat): [0.073990, 0.085424]
[I 2025-10-30 10:31:23,434] Trial 12 finished with value: 0.07814762759137904 and parameters: {'lr': 0.0006236967228574693, 'num_epochs': 8, 'batch_size': 128, 'num_neighbors': 11, 'lr_decay': 0.8875250612771154}. Best is trial 2 with value: 0.0791962370252024.

Trial 13 started


Best trial: 2. Best value: 0.0791962:  70%|███████   | 14/20 [11:07<05:02, 50.47s/it]

Train wi info: {'gini': np.float64(0.007152050404892443), 'ess': np.float64(14997.406351135403), 'max_wi': np.float64(1.081031132995209), 'min_wi': np.float64(0.907116158752719)}
actual reward: [0.0864664]
{'gini': np.float64(0.008905786109805174), 'ess': np.float64(9997.346116494507), 'max_wi': np.float64(1.0833632847324337), 'min_wi': np.float64(0.9002959915669818)}
Estimated reward: 0.078255
Cross-validated error: 0.000935
Final score CI (reward +- 2*error): [0.076385, 0.080124]
Standard error: 0.002894
Final t_dist CI (reward +- t_0.975*se_hat): [0.072581, 0.083928]
[I 2025-10-30 10:32:15,200] Trial 13 finished with value: 0.07638501791159794 and parameters: {'lr': 0.0018666649612382232, 'num_epochs': 8, 'batch_size': 128, 'num_neighbors': 11, 'lr_decay': 0.8844937970784739}. Best is trial 2 with value: 0.0791962370252024.

Trial 14 started


Best trial: 2. Best value: 0.0791962:  75%|███████▌  | 15/20 [11:58<04:12, 50.59s/it]

Train wi info: {'gini': np.float64(0.01337564159673776), 'ess': np.float64(14990.624328323376), 'max_wi': np.float64(1.1322013181929627), 'min_wi': np.float64(0.7318067956640048)}
actual reward: [0.08645358]
{'gini': np.float64(0.014931368240227526), 'ess': np.float64(9992.29450376444), 'max_wi': np.float64(1.1491775120233731), 'min_wi': np.float64(0.7523495015675973)}
Estimated reward: 0.079723
Cross-validated error: 0.000847
Final score CI (reward +- 2*error): [0.078030, 0.081416]
Standard error: 0.002901
Final t_dist CI (reward +- t_0.975*se_hat): [0.074036, 0.085410]
[I 2025-10-30 10:33:06,087] Trial 14 finished with value: 0.07803002062874195 and parameters: {'lr': 0.0022740222403332022, 'num_epochs': 7, 'batch_size': 128, 'num_neighbors': 12, 'lr_decay': 0.9284165880500647}. Best is trial 2 with value: 0.0791962370252024.

Trial 15 started


Best trial: 2. Best value: 0.0791962:  80%|████████  | 16/20 [12:49<03:22, 50.66s/it]

Train wi info: {'gini': np.float64(0.003521057958608615), 'ess': np.float64(14999.375400626308), 'max_wi': np.float64(1.0362453771384552), 'min_wi': np.float64(0.9617707814312108)}
actual reward: [0.08647055]
{'gini': np.float64(0.004282853451232387), 'ess': np.float64(9999.387928544058), 'max_wi': np.float64(1.0375644845003984), 'min_wi': np.float64(0.959516734199)}
Estimated reward: 0.078898
Cross-validated error: 0.000886
Final score CI (reward +- 2*error): [0.077126, 0.080671]
Standard error: 0.002874
Final t_dist CI (reward +- t_0.975*se_hat): [0.073265, 0.084532]
[I 2025-10-30 10:33:56,911] Trial 15 finished with value: 0.07712586798995012 and parameters: {'lr': 0.0009456893416895951, 'num_epochs': 9, 'batch_size': 128, 'num_neighbors': 14, 'lr_decay': 0.8942367310656646}. Best is trial 2 with value: 0.0791962370252024.

Trial 16 started


Best trial: 2. Best value: 0.0791962:  85%|████████▌ | 17/20 [13:39<02:31, 50.52s/it]

Train wi info: {'gini': np.float64(0.03559151148696517), 'ess': np.float64(14933.649579800407), 'max_wi': np.float64(1.3459709576698657), 'min_wi': np.float64(0.45095626362802954)}
actual reward: [0.08633096]
{'gini': np.float64(0.040447662407329346), 'ess': np.float64(9943.650461669577), 'max_wi': np.float64(1.4584631458361672), 'min_wi': np.float64(0.43363375748657973)}
Estimated reward: 0.080022
Cross-validated error: 0.000866
Final score CI (reward +- 2*error): [0.078289, 0.081755]
Standard error: 0.002924
Final t_dist CI (reward +- t_0.975*se_hat): [0.074291, 0.085753]
[I 2025-10-30 10:34:47,098] Trial 16 finished with value: 0.07828880742490717 and parameters: {'lr': 0.0065011272022635885, 'num_epochs': 4, 'batch_size': 128, 'num_neighbors': 11, 'lr_decay': 0.8425506528077927}. Best is trial 2 with value: 0.0791962370252024.

Trial 17 started


Best trial: 2. Best value: 0.0791962:  90%|█████████ | 18/20 [14:29<01:40, 50.28s/it]

Train wi info: {'gini': np.float64(0.04981488642762307), 'ess': np.float64(14871.144065732577), 'max_wi': np.float64(1.5342547343671171), 'min_wi': np.float64(0.30641712836519813)}
actual reward: [0.08629092]
{'gini': np.float64(0.05297195779274059), 'ess': np.float64(9903.293504235953), 'max_wi': np.float64(1.610829370610391), 'min_wi': np.float64(0.19044854054406984)}
Estimated reward: 0.079906
Cross-validated error: 0.000818
Final score CI (reward +- 2*error): [0.078271, 0.081542]
Standard error: 0.002906
Final t_dist CI (reward +- t_0.975*se_hat): [0.074209, 0.085603]
[I 2025-10-30 10:35:36,831] Trial 17 finished with value: 0.07827053042191709 and parameters: {'lr': 0.007615098439450547, 'num_epochs': 3, 'batch_size': 128, 'num_neighbors': 12, 'lr_decay': 0.838717536469215}. Best is trial 2 with value: 0.0791962370252024.

Trial 18 started


Best trial: 2. Best value: 0.0791962:  95%|█████████▌| 19/20 [15:14<00:48, 48.58s/it]

Train wi info: {'gini': np.float64(0.04482226270354083), 'ess': np.float64(14882.65724769548), 'max_wi': np.float64(1.7095539190138052), 'min_wi': np.float64(0.2002279705282591)}
actual reward: [0.08620516]
{'gini': np.float64(0.05325322448194675), 'ess': np.float64(9894.692264109874), 'max_wi': np.float64(1.5562116188919466), 'min_wi': np.float64(0.15915356651274173)}
Estimated reward: 0.079254
Cross-validated error: 0.000804
Final score CI (reward +- 2*error): [0.077646, 0.080862]
Standard error: 0.002939
Final t_dist CI (reward +- t_0.975*se_hat): [0.073493, 0.085015]
[I 2025-10-30 10:36:21,459] Trial 18 finished with value: 0.07764581574889343 and parameters: {'lr': 0.009204464019853708, 'num_epochs': 5, 'batch_size': 128, 'num_neighbors': 10, 'lr_decay': 0.8370472583835912}. Best is trial 2 with value: 0.0791962370252024.

Trial 19 started


Best trial: 2. Best value: 0.0791962: 100%|██████████| 20/20 [16:03<00:00, 48.17s/it]

Train wi info: {'gini': np.float64(0.01518064812998182), 'ess': np.float64(14988.575244758265), 'max_wi': np.float64(1.1354345201967349), 'min_wi': np.float64(0.803377001140173)}
actual reward: [0.08644032]
{'gini': np.float64(0.016122229364670868), 'ess': np.float64(9991.468862125128), 'max_wi': np.float64(1.14984807310634), 'min_wi': np.float64(0.7961880906819326)}
Estimated reward: 0.078175
Cross-validated error: 0.000860
Final score CI (reward +- 2*error): [0.076455, 0.079895]
Standard error: 0.002870
Final t_dist CI (reward +- t_0.975*se_hat): [0.072549, 0.083801]
[I 2025-10-30 10:37:10,745] Trial 19 finished with value: 0.07645546832418647 and parameters: {'lr': 0.0035825808692318537, 'num_epochs': 4, 'batch_size': 256, 'num_neighbors': 13, 'lr_decay': 0.8673327927964085}. Best is trial 2 with value: 0.0791962370252024.





Num samples is 10000
{'gini': np.float64(0.48036454227576), 'ess': np.float64(4271.377590547567), 'max_wi': np.float64(24.576691778849106), 'min_wi': np.float64(0.0124287350975967)}


Unnamed: 0,policy_rewards,ipw,reg_dm,conv_dm,conv_dr,conv_sndr,action_diff_to_real,action_delta,context_diff_to_real,context_delta
0,0.08647502,0.0875,0.08750948,0.09359878,0.08782022,0.08782022,0.80232812,0.0,0.84032376,0.0
15000,0.08646534,0.07892782,0.08790025,0.09306271,0.07930589,0.07930759,0.80160775,0.00632052,0.84015372,0.00612846


In [None]:
df4[['policy_rewards', 'ipw', 'reg_dm', 'conv_dm', 'conv_dr', 'conv_sndr', 'action_diff_to_real', 'action_delta', 'context_diff_to_real', 'context_delta']]

Unnamed: 0,policy_rewards,ipw,reg_dm,conv_dm,conv_dr,conv_sndr,action_diff_to_real,action_delta,context_diff_to_real,context_delta
0,0.08610747,0.0866,0.08653168,0.09008066,0.08644724,0.08644724,0.7569287,0.0,0.87627132,0.0
15000,0.08610802,0.07770118,0.08627892,0.08844592,0.07717319,0.07717316,0.75692672,0.00057714,0.87628001,0.00071563


### Poicy Via argmax(r_hat - error_hat) through cross validation

In [None]:
df4[['policy_rewards', 'ipw', 'reg_dm', 'conv_dm', 'conv_dr', 'conv_sndr', 'action_diff_to_real', 'action_delta', 'context_diff_to_real', 'context_delta']]

Unnamed: 0,policy_rewards,ipw,reg_dm,conv_dm,conv_dr,conv_sndr,action_diff_to_real,action_delta,context_diff_to_real,context_delta
0,0.08610747,0.0866,0.08653168,0.09008066,0.08644724,0.08644724,0.7569287,0.0,0.87627132,0.0
15000,0.08610802,0.07770118,0.08627892,0.08844592,0.07717319,0.07717316,0.75692672,0.00057714,0.87628001,0.00071563


### Policy Via using actual policy value

In [None]:
# Show the performance metrics
df4[['policy_rewards', 'ipw', 'reg_dm', 'conv_dm', 'conv_dr', 'conv_sndr', 'action_diff_to_real', 'action_delta', 'context_diff_to_real', 'context_delta']]


Unnamed: 0,policy_rewards,ipw,reg_dm,conv_dm,conv_dr,conv_sndr,action_diff_to_real,action_delta,context_diff_to_real,context_delta
0,0.08610747,0.0866,0.08653168,0.09008066,0.08644724,0.08644724,0.7569287,0.0,0.87627132,0.0
15000,0.08610802,0.07770118,0.08627892,0.08844592,0.07717319,0.07717316,0.75692672,0.00057714,0.87628001,0.00071563
