In [1]:
# Notebook setup: imports, device selection, display options and random seeds.
import warnings

# Suppress every warning up front so that import-time warnings are hidden as well.
# NOTE(review): this blanket filter also hides any warning raised later on.
warnings.filterwarnings("ignore")
from copy import deepcopy
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
import sys

# Make the project modules (estimators, simulation_utils, models, ...) importable.
# The membership check keeps sys.path free of duplicates when this cell is re-run.
CODE_DIR = "/code"
if CODE_DIR not in sys.path:
    sys.path.append(CODE_DIR)

from tqdm import tqdm
import torch
import time

# Prefer the GPU when one is visible; swap in torch.device("cpu") to force CPU runs.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim

# cuDNN autotuning and relaxed float32 matmul precision are only enabled on CUDA.
torch.backends.cudnn.benchmark = torch.cuda.is_available()
if torch.cuda.is_available():
    torch.set_float32_matmul_precision("high")  # TF32 = big speedup on Ada


from sklearn.utils import check_random_state

from sklearn.linear_model import LogisticRegression

import matplotlib.pyplot as plt

from scipy.special import softmax
import optuna


from estimators import (
    DirectMethod as DM
)

from simulation_utils import (
    eval_policy,
    generate_dataset,
    create_simulation_data_from_pi,
    get_train_data,
    get_opl_results_dict,
    CustomCFDataset,
    calc_reward,
    get_weights_info
)

from models import (
    LinearCFModel,
    NeighborhoodModel,
    BPRModel,
    RegressionModel
)

from training_utils import (
    train,
    validation_loop,
    cv_score_model
)

from trainer_trials import (
    regression_trainer_trial,
    neighberhoodmodel_trainer_trial
)

# Single seed for the notebook. `random_` is a dedicated RandomState built from it.
random_state = 12345
random_ = check_random_state(random_state)

# Also seed the global NumPy and PyTorch generators so that code relying on them
# is repeatable across "Restart & Run All".
# NOTE(review): generators created inside the project modules and the Optuna
# sampler are not controlled from here -- confirm they receive their own seeds.
np.random.seed(random_state)
torch.manual_seed(random_state)

# Show floats with 8 decimals; the metrics compared below differ in the 3rd-4th decimal.
pd.options.display.float_format = '{:,.8f}'.format

Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda


## Learning

We will run several simulations on a generated dataset. The dataset is generated as follows. We have users $U$ and actions $A$, whose embeddings are drawn as
$$ u_i \sim N(0, I_{EmbDim}), \quad a_j \sim N(0, I_{EmbDim}) $$
$$ p_{ij} = \frac{1}{5 + e^{-u_i^\top a_j}} $$
$$ r_{ij} \sim \mathrm{Bin}(p_{ij}) $$

We have a policy $\pi$, and its ground-truth reward is calculated by
$$R_{gt} = \sum_{i}{\sum_{j}{\pi_{ij} * p_{ij}}} $$

Our parameters for the dataset will be
$$EmbDim = 16$$
$$NumActions = 500$$
$$NumUsers = 500$$
$$NeighborhoodSize = 6$$

To learn a new policy from $\pi$, we will sample from:
$$\pi_{start} = (1-\epsilon) \cdot \pi + \epsilon \cdot \pi_{random}$$

In [2]:
# Settings handed to generate_dataset for the first synthetic dataset.
dataset_params = {
    "n_actions": 500,
    "n_users": 500,
    "emb_dim": 16,
    # "sigma": 0.1,
    "eps": 0.6,  # this is the epsilon for the noise in the ground truth policy representation
    "ctr": 0.1,
}

# No explicit seed is passed here, so generate_dataset uses whatever default it defines.
train_dataset = generate_dataset(dataset_params)

Random Item CTR: 0.07066414727263938
Optimal greedy CTR: 0.09999926940951757
Second Best greedy CTR: 0.0980913477695915
Optimal Stochastic CTR: 0.09995326955796031
second Best Stochastic CTR: 0.08595012935428775
Our Initial CTR: 0.08610747363354625


In [3]:
# Experiment-level settings that the trainer cells pass to regression_trainer_trial.
num_runs, batch_size, num_neighbors = 1, 200, 6
n_trials_for_optuna = 20

# Training-set sizes to sweep. Earlier sweeps used
# [500, 1000, 2000, 10000, 20000] and [500, 1000, 2000].
num_rounds_list = [15000]

# Hand-picked hyperparameters, passed to the trainer as `prev_best_params`.
# NOTE(review): "num_neighbors" is 8 here while the module-level num_neighbors is 6,
# and the logged parameters of Optuna trial 0 contain no num_neighbors entry --
# confirm which value the trainer actually uses.
best_params_to_use = dict(
    lr=0.096,          # learning rate
    num_epochs=5,      # number of training epochs
    batch_size=64,     # batch size for training
    num_neighbors=8,   # number of neighbors for the neighborhood model
    lr_decay=0.85,     # learning-rate decay factor
)


### 1

$$emb = 0.7 * gt + 0.3 * noise$$
$$lr = 0.005$$
$$n_{epochs} = 1$$
$$BatchSize=50$$

In [4]:
print("Value of num_rounds_list:", num_rounds_list)

# Hyperparameter search + training on the first dataset. The call returns a
# results frame and the best hyperparameters found for each training size.
df4, best_hyperparams_by_size = regression_trainer_trial(
    num_runs,
    num_neighbors,
    num_rounds_list,
    train_dataset,
    batch_size,
    val_size=10000,
    n_trials=n_trials_for_optuna,
    prev_best_params=best_params_to_use,
)

# To inspect the tuned values, iterate over best_hyperparams_by_size.items().
# An earlier debug printout read each value's 'params' dict -- structure unverified here.

# Performance metrics for the baseline row and each training size
# (last expression, so the notebook renders it as the cell output).
df4[[
    'policy_rewards',
    'ipw',
    'reg_dm',
    'conv_dm',
    'conv_dr',
    'conv_sndr',
    'action_diff_to_real',
    'action_delta',
    'context_diff_to_real',
    'context_delta',
]]

Value of num_rounds_list: [15000]
Simulation time for 10000 samples: 0.028542041778564453 seconds
Baseline regression model fit time: 0.08s
Num samples is 10000
{'gini': np.float64(0.47938478287856773), 'ess': np.float64(4412.980365422003), 'max_wi': np.float64(18.678159503840167), 'min_wi': np.float64(0.010114522662821101)}
Eval time: 0.5043179988861084 seconds


[I 2025-11-11 10:27:23,184] A new study created in memory with name: no-name-447d1e9c-5150-4447-ba86-d91f1180984b


Evaluation total results time: 0.8915278911590576 seconds

=== Training size 15000, run 0 ===
Simulation time for 25000 samples: 0.06262540817260742 seconds


  0%|          | 0/20 [00:00<?, ?it/s]


[Optuna Trial 0]


Best trial: 0. Best value: 0.0759826:   5%|▌         | 1/20 [00:04<01:28,  4.67s/it]

actual reward: [0.08458993]
Validation weights_info: {'gini': np.float64(0.9859627791219325), 'ess': np.float64(124.44787013314321), 'max_wi': np.float64(308.5969940140047), 'min_wi': np.float64(0.0)}
Estimated reward: 0.084674
Cross-validated error: 0.004346
Final score CI (reward +- 2*error): [0.075983, 0.093365]
Standard error: 0.013482
Final t_dist CI (reward +- t_0.975*se_hat): [0.058246, 0.111102]
[I 2025-11-11 10:27:27,857] Trial 0 finished with value: 0.07598257637768299 and parameters: {'lr': 0.096, 'num_epochs': 5, 'batch_size': 64, 'lr_decay': 0.85}. Best is trial 0 with value: 0.07598257637768299.

[Optuna Trial 1]


Best trial: 0. Best value: 0.0759826:  10%|█         | 2/20 [00:06<00:49,  2.76s/it]

actual reward: [0.08722355]
Validation weights_info: {'gini': np.float64(0.9944037292764729), 'ess': np.float64(17.878947747081554), 'max_wi': np.float64(1570.7826475132736), 'min_wi': np.float64(4.382809920562043e-08)}
[I 2025-11-11 10:27:29,273] Trial 1 finished with value: -inf and parameters: {'lr': 0.01698430830627312, 'num_epochs': 2, 'batch_size': 128, 'lr_decay': 0.8980590779889003}. Best is trial 0 with value: 0.07598257637768299.

[Optuna Trial 2]


Best trial: 2. Best value: 0.0828134:  15%|█▌        | 3/20 [00:09<00:52,  3.06s/it]

actual reward: [0.08642795]
Validation weights_info: {'gini': np.float64(0.6002421303269609), 'ess': np.float64(3074.82191189411), 'max_wi': np.float64(9.731467350443154), 'min_wi': np.float64(0.028144125321166957)}
Estimated reward: 0.085191
Cross-validated error: 0.001189
Final score CI (reward +- 2*error): [0.082813, 0.087569]
Standard error: 0.005246
Final t_dist CI (reward +- t_0.975*se_hat): [0.074907, 0.095475]
[I 2025-11-11 10:27:32,692] Trial 2 finished with value: 0.08281340598203045 and parameters: {'lr': 0.0006237426708863768, 'num_epochs': 5, 'batch_size': 64, 'lr_decay': 0.8870314718624671}. Best is trial 2 with value: 0.08281340598203045.

[Optuna Trial 3]


Best trial: 2. Best value: 0.0828134:  20%|██        | 4/20 [00:11<00:42,  2.68s/it]

actual reward: [0.08618316]
Validation weights_info: {'gini': np.float64(0.13428828214217745), 'ess': np.float64(9362.13723055454), 'max_wi': np.float64(1.8341045669290077), 'min_wi': np.float64(0.5737627555070604)}
Estimated reward: 0.081233
Cross-validated error: 0.000546
Final score CI (reward +- 2*error): [0.080141, 0.082326]
Standard error: 0.002829
Final t_dist CI (reward +- t_0.975*se_hat): [0.075688, 0.086778]
[I 2025-11-11 10:27:34,793] Trial 3 finished with value: 0.08014051310095206 and parameters: {'lr': 0.0002174304320759454, 'num_epochs': 8, 'batch_size': 256, 'lr_decay': 0.8784513934143364}. Best is trial 2 with value: 0.08281340598203045.

[Optuna Trial 4]


Best trial: 2. Best value: 0.0828134:  25%|██▌       | 5/20 [00:12<00:33,  2.20s/it]

actual reward: [0.08659803]
Validation weights_info: {'gini': np.float64(0.9932485259780186), 'ess': np.float64(25.75912611994745), 'max_wi': np.float64(976.0606520813478), 'min_wi': np.float64(6.996381850487356e-08)}
[I 2025-11-11 10:27:36,150] Trial 4 finished with value: -inf and parameters: {'lr': 0.014402735618069701, 'num_epochs': 4, 'batch_size': 256, 'lr_decay': 0.8397569122699707}. Best is trial 2 with value: 0.08281340598203045.

[Optuna Trial 5]


Best trial: 5. Best value: 0.137256:  30%|███       | 6/20 [00:14<00:28,  2.01s/it] 

actual reward: [0.08808549]
Validation weights_info: {'gini': np.float64(0.9591031808700695), 'ess': np.float64(106.4798569057156), 'max_wi': np.float64(435.18463446999857), 'min_wi': np.float64(1.979846964293483e-06)}
Estimated reward: 0.159137
Cross-validated error: 0.010940
Final score CI (reward +- 2*error): [0.137256, 0.181017]
Standard error: 0.050685
Final t_dist CI (reward +- t_0.975*se_hat): [0.059784, 0.258490]
[I 2025-11-11 10:27:37,795] Trial 5 finished with value: 0.13725643572441715 and parameters: {'lr': 0.006686590258371187, 'num_epochs': 3, 'batch_size': 128, 'lr_decay': 0.9040758369283982}. Best is trial 5 with value: 0.13725643572441715.

[Optuna Trial 6]


Best trial: 5. Best value: 0.137256:  35%|███▌      | 7/20 [00:15<00:23,  1.78s/it]

actual reward: [0.08647428]
Validation weights_info: {'gini': np.float64(0.5225030156172029), 'ess': np.float64(4059.2931942028213), 'max_wi': np.float64(7.800665612449255), 'min_wi': np.float64(0.07953786198317123)}
Estimated reward: 0.082870
Cross-validated error: 0.000914
Final score CI (reward +- 2*error): [0.081042, 0.084698]
Standard error: 0.004436
Final t_dist CI (reward +- t_0.975*se_hat): [0.074175, 0.091565]
[I 2025-11-11 10:27:39,096] Trial 6 finished with value: 0.08104151224886166 and parameters: {'lr': 0.003965207068032078, 'num_epochs': 2, 'batch_size': 512, 'lr_decay': 0.9093252493579274}. Best is trial 5 with value: 0.13725643572441715.

[Optuna Trial 7]


Best trial: 5. Best value: 0.137256:  40%|████      | 8/20 [00:18<00:24,  2.02s/it]

actual reward: [0.08589233]
Validation weights_info: {'gini': np.float64(0.9869480748320233), 'ess': np.float64(20.97040067336874), 'max_wi': np.float64(1629.9068586650649), 'min_wi': np.float64(2.984119396713926e-08)}
[I 2025-11-11 10:27:41,632] Trial 7 finished with value: -inf and parameters: {'lr': 0.006063453656336662, 'num_epochs': 5, 'batch_size': 128, 'lr_decay': 0.9733644565158797}. Best is trial 5 with value: 0.13725643572441715.

[Optuna Trial 8]


Best trial: 5. Best value: 0.137256:  45%|████▌     | 9/20 [00:20<00:23,  2.09s/it]

actual reward: [0.08656925]
Validation weights_info: {'gini': np.float64(0.6997721564231802), 'ess': np.float64(2116.9570956923326), 'max_wi': np.float64(14.278893700766275), 'min_wi': np.float64(0.012372571204856068)}
Estimated reward: 0.086436
Cross-validated error: 0.001280
Final score CI (reward +- 2*error): [0.083875, 0.088997]
Standard error: 0.006362
Final t_dist CI (reward +- t_0.975*se_hat): [0.073964, 0.098908]
[I 2025-11-11 10:27:43,890] Trial 8 finished with value: 0.08387509270249557 and parameters: {'lr': 0.0013775023366228625, 'num_epochs': 7, 'batch_size': 256, 'lr_decay': 0.9039080997333484}. Best is trial 5 with value: 0.13725643572441715.

[Optuna Trial 9]


Best trial: 5. Best value: 0.137256:  50%|█████     | 10/20 [00:23<00:22,  2.25s/it]

actual reward: [0.08597508]
Validation weights_info: {'gini': np.float64(0.9928786762905426), 'ess': np.float64(7.442088806043891), 'max_wi': np.float64(2525.9740155605305), 'min_wi': np.float64(4.614226971703069e-23)}
[I 2025-11-11 10:27:46,500] Trial 9 finished with value: -inf and parameters: {'lr': 0.042089078613685514, 'num_epochs': 3, 'batch_size': 64, 'lr_decay': 0.8682400630188623}. Best is trial 5 with value: 0.13725643572441715.

[Optuna Trial 10]


Best trial: 5. Best value: 0.137256:  55%|█████▌    | 11/20 [00:26<00:23,  2.67s/it]

actual reward: [0.08850869]
Validation weights_info: {'gini': np.float64(0.8815038419063125), 'ess': np.float64(896.0423932985949), 'max_wi': np.float64(54.27367293748563), 'min_wi': np.float64(0.0002708201550708237)}
Estimated reward: 0.090004
Cross-validated error: 0.002124
Final score CI (reward +- 2*error): [0.085755, 0.094252]
Standard error: 0.009715
Final t_dist CI (reward +- t_0.975*se_hat): [0.070959, 0.109048]
[I 2025-11-11 10:27:50,098] Trial 10 finished with value: 0.08575504440983264 and parameters: {'lr': 0.0012669373924016829, 'num_epochs': 10, 'batch_size': 128, 'lr_decay': 0.9589118842816615}. Best is trial 5 with value: 0.13725643572441715.

[Optuna Trial 11]


Best trial: 5. Best value: 0.137256:  60%|██████    | 12/20 [00:30<00:23,  2.94s/it]

actual reward: [0.08794378]
Validation weights_info: {'gini': np.float64(0.8705862389032295), 'ess': np.float64(989.3062428103322), 'max_wi': np.float64(38.36371724977584), 'min_wi': np.float64(0.000248007602025034)}
Estimated reward: 0.093409
Cross-validated error: 0.002125
Final score CI (reward +- 2*error): [0.089159, 0.097658]
Standard error: 0.009823
Final t_dist CI (reward +- t_0.975*se_hat): [0.074154, 0.112663]
[I 2025-11-11 10:27:53,671] Trial 11 finished with value: 0.08915919849920845 and parameters: {'lr': 0.001276765778173941, 'num_epochs': 9, 'batch_size': 128, 'lr_decay': 0.9521580644845486}. Best is trial 5 with value: 0.13725643572441715.

[Optuna Trial 12]


Best trial: 5. Best value: 0.137256:  65%|██████▌   | 13/20 [00:34<00:22,  3.23s/it]

actual reward: [0.08618864]
Validation weights_info: {'gini': np.float64(0.15036034249168675), 'ess': np.float64(9205.22478373253), 'max_wi': np.float64(2.0242360761079357), 'min_wi': np.float64(0.5157643339688064)}
Estimated reward: 0.081306
Cross-validated error: 0.000468
Final score CI (reward +- 2*error): [0.080369, 0.082242]
Standard error: 0.002856
Final t_dist CI (reward +- t_0.975*se_hat): [0.075707, 0.086904]
[I 2025-11-11 10:27:57,562] Trial 12 finished with value: 0.08036924468855469 and parameters: {'lr': 0.00011002321159576372, 'num_epochs': 10, 'batch_size': 128, 'lr_decay': 0.9357340516817942}. Best is trial 5 with value: 0.13725643572441715.

[Optuna Trial 13]


Best trial: 5. Best value: 0.137256:  70%|███████   | 14/20 [00:37<00:19,  3.17s/it]

actual reward: [0.08742623]
Validation weights_info: {'gini': np.float64(0.8489396652539292), 'ess': np.float64(1129.7645204770058), 'max_wi': np.float64(27.07837882504314), 'min_wi': np.float64(0.0008564097849744537)}
Estimated reward: 0.092335
Cross-validated error: 0.001822
Final score CI (reward +- 2*error): [0.088692, 0.095979]
Standard error: 0.008985
Final t_dist CI (reward +- t_0.975*se_hat): [0.074722, 0.109948]
[I 2025-11-11 10:28:00,585] Trial 13 finished with value: 0.08869163792576382 and parameters: {'lr': 0.0020558859646558265, 'num_epochs': 7, 'batch_size': 128, 'lr_decay': 0.8070298241497896}. Best is trial 5 with value: 0.13725643572441715.

[Optuna Trial 14]


Best trial: 5. Best value: 0.137256:  75%|███████▌  | 15/20 [00:38<00:13,  2.63s/it]

actual reward: [0.08612766]
Validation weights_info: {'gini': np.float64(0.037016166601142424), 'ess': np.float64(9951.334490488664), 'max_wi': np.float64(1.1967329322872728), 'min_wi': np.float64(0.8580287607028605)}
Estimated reward: 0.081127
Cross-validated error: 0.000457
Final score CI (reward +- 2*error): [0.080213, 0.082041]
Standard error: 0.002740
Final t_dist CI (reward +- t_0.975*se_hat): [0.075757, 0.086498]
[I 2025-11-11 10:28:01,965] Trial 14 finished with value: 0.08021308422828564 and parameters: {'lr': 0.0005394137483570624, 'num_epochs': 1, 'batch_size': 512, 'lr_decay': 0.9988398992611773}. Best is trial 5 with value: 0.13725643572441715.

[Optuna Trial 15]


Best trial: 5. Best value: 0.137256:  80%|████████  | 16/20 [00:42<00:11,  2.82s/it]

actual reward: [0.08834421]
Validation weights_info: {'gini': np.float64(0.9924112356582095), 'ess': np.float64(44.550520657393186), 'max_wi': np.float64(495.7324448182748), 'min_wi': np.float64(7.891780957615338e-10)}
[I 2025-11-11 10:28:05,239] Trial 15 finished with value: -inf and parameters: {'lr': 0.007216884033806365, 'num_epochs': 8, 'batch_size': 128, 'lr_decay': 0.9352492988931008}. Best is trial 5 with value: 0.13725643572441715.

[Optuna Trial 16]


Best trial: 5. Best value: 0.137256:  85%|████████▌ | 17/20 [00:43<00:07,  2.55s/it]

actual reward: [0.08628897]
Validation weights_info: {'gini': np.float64(0.3012652811082066), 'ess': np.float64(7188.171162005575), 'max_wi': np.float64(3.5123758165295538), 'min_wi': np.float64(0.23814021371319496)}
Estimated reward: 0.081882
Cross-validated error: 0.000644
Final score CI (reward +- 2*error): [0.080595, 0.083169]
Standard error: 0.003277
Final t_dist CI (reward +- t_0.975*se_hat): [0.075460, 0.088305]
[I 2025-11-11 10:28:07,156] Trial 16 finished with value: 0.08059511285539601 and parameters: {'lr': 0.0005865338768933781, 'num_epochs': 3, 'batch_size': 128, 'lr_decay': 0.9320183439982285}. Best is trial 5 with value: 0.13725643572441715.

[Optuna Trial 17]


Best trial: 5. Best value: 0.137256:  90%|█████████ | 18/20 [00:47<00:05,  2.86s/it]

actual reward: [0.09145844]
Validation weights_info: {'gini': np.float64(0.9926637292375193), 'ess': np.float64(23.699055987709574), 'max_wi': np.float64(1057.183054048928), 'min_wi': np.float64(8.931970511888566e-13)}
[I 2025-11-11 10:28:10,728] Trial 17 finished with value: -inf and parameters: {'lr': 0.014388582953449419, 'num_epochs': 9, 'batch_size': 128, 'lr_decay': 0.9640713391769928}. Best is trial 5 with value: 0.13725643572441715.

[Optuna Trial 18]


Best trial: 5. Best value: 0.137256:  95%|█████████▌| 19/20 [00:49<00:02,  2.53s/it]

actual reward: [0.08718457]
Validation weights_info: {'gini': np.float64(0.8290466477862605), 'ess': np.float64(1185.5333132249186), 'max_wi': np.float64(24.52752493440239), 'min_wi': np.float64(0.0027939988338544267)}
Estimated reward: 0.090637
Cross-validated error: 0.001636
Final score CI (reward +- 2*error): [0.087365, 0.093909]
Standard error: 0.008624
Final t_dist CI (reward +- t_0.975*se_hat): [0.073732, 0.107542]
[I 2025-11-11 10:28:12,504] Trial 18 finished with value: 0.08736538588751407 and parameters: {'lr': 0.0030740049421174613, 'num_epochs': 6, 'batch_size': 512, 'lr_decay': 0.9917228252797541}. Best is trial 5 with value: 0.13725643572441715.

[Optuna Trial 19]


Best trial: 5. Best value: 0.137256: 100%|██████████| 20/20 [00:51<00:00,  2.58s/it]

actual reward: [0.08621265]
Validation weights_info: {'gini': np.float64(0.9954353124049323), 'ess': np.float64(14.315313007486917), 'max_wi': np.float64(764.5603361410659), 'min_wi': np.float64(1.7701327391037317e-15)}
[I 2025-11-11 10:28:14,709] Trial 19 finished with value: -inf and parameters: {'lr': 0.030729093147369196, 'num_epochs': 4, 'batch_size': 128, 'lr_decay': 0.9239069742318192}. Best is trial 5 with value: 0.13725643572441715.





Num samples is 10000
{'gini': np.float64(0.9475989944073474), 'ess': np.float64(268.2197693309632), 'max_wi': np.float64(168.03307877306435), 'min_wi': np.float64(6.980416773862794e-06)}
Eval time: 0.4715700149536133 seconds
Evaluation total results time: 0.8322534561157227 seconds


Unnamed: 0,policy_rewards,ipw,reg_dm,conv_dm,conv_dr,conv_sndr,action_diff_to_real,action_delta,context_diff_to_real,context_delta
0,0.08610747,0.0863,0.08627646,0.08627646,0.08629749,0.08629749,0.7569287,0.0,0.87627132,0.0
15000,0.08934609,0.10289728,0.08830008,0.08830008,0.10207247,0.10220177,0.83985308,0.23912189,0.92899388,0.15493227


### Policy with delta function

In [5]:
# Same dataset settings as the first experiment; only the generator seed differs.
dataset_params = {
    "n_actions": 500,
    "n_users": 500,
    "emb_dim": 16,
    # "sigma": 0.1,
    "eps": 0.6,  # this is the epsilon for the noise in the ground truth policy representation
    "ctr": 0.1,
}

# Rebinds train_dataset: every later cell now works on the seed=10000 dataset.
train_dataset = generate_dataset(dataset_params, seed=10000)

Random Item CTR: 0.07083863592474163
Optimal greedy CTR: 0.09999916436977967
Second Best greedy CTR: 0.08797326118616329
Optimal Stochastic CTR: 0.0999493542444427
second Best Stochastic CTR: 0.0854530317781557
Our Initial CTR: 0.08557719469284641


In [6]:
# Repeat the hyperparameter search + training on the current train_dataset.
# Note that best_hyperparams_by_size is rebound by this call.
df5, best_hyperparams_by_size = regression_trainer_trial(
    num_runs,
    num_neighbors,
    num_rounds_list,
    train_dataset,
    batch_size,
    val_size=10000,
    n_trials=n_trials_for_optuna,
    prev_best_params=best_params_to_use,
)

# Performance metrics for the baseline row and each training size
# (last expression, so the notebook renders it as the cell output).
df5[[
    'policy_rewards',
    'ipw',
    'reg_dm',
    'conv_dm',
    'conv_dr',
    'conv_sndr',
    'action_diff_to_real',
    'action_delta',
    'context_diff_to_real',
    'context_delta',
]]

Simulation time for 10000 samples: 0.029740333557128906 seconds
Baseline regression model fit time: 0.09s
Num samples is 10000
{'gini': np.float64(0.4604299354253158), 'ess': np.float64(4493.6910531878375), 'max_wi': np.float64(24.937589069287892), 'min_wi': np.float64(0.009548594335357768)}
Eval time: 0.5004150867462158 seconds


[I 2025-11-11 10:28:18,176] A new study created in memory with name: no-name-a7c72271-b06c-4e38-8d21-09c16ff437d6


Evaluation total results time: 0.8784444332122803 seconds

=== Training size 15000, run 0 ===
Simulation time for 25000 samples: 0.06392955780029297 seconds


  0%|          | 0/20 [00:00<?, ?it/s]


[Optuna Trial 0]


Best trial: 0. Best value: -inf:   5%|▌         | 1/20 [00:03<01:00,  3.18s/it]

actual reward: [0.07926111]
Validation weights_info: {'gini': np.float64(0.9948994935855698), 'ess': np.float64(3.111598354656386), 'max_wi': np.float64(7742.407701808362), 'min_wi': np.float64(0.0)}
[I 2025-11-11 10:28:21,350] Trial 0 finished with value: -inf and parameters: {'lr': 0.096, 'num_epochs': 5, 'batch_size': 64, 'lr_decay': 0.85}. Best is trial 0 with value: -inf.

[Optuna Trial 1]


Best trial: 1. Best value: 0.0823315:  10%|█         | 2/20 [00:05<00:49,  2.74s/it]

actual reward: [0.08071638]
Validation weights_info: {'gini': np.float64(0.9815602267975135), 'ess': np.float64(193.53400683801166), 'max_wi': np.float64(57.011024934313944), 'min_wi': np.float64(6.143231431839543e-29)}
Estimated reward: 0.095724
Cross-validated error: 0.006696
Final score CI (reward +- 2*error): [0.082331, 0.109116]
Standard error: 0.007452
Final t_dist CI (reward +- t_0.975*se_hat): [0.081116, 0.110331]
[I 2025-11-11 10:28:23,792] Trial 1 finished with value: 0.08233145803319308 and parameters: {'lr': 0.04325060875498211, 'num_epochs': 3, 'batch_size': 64, 'lr_decay': 0.9941051480966848}. Best is trial 1 with value: 0.08233145803319308.

[Optuna Trial 2]


Best trial: 1. Best value: 0.0823315:  15%|█▌        | 3/20 [00:06<00:35,  2.10s/it]

actual reward: [0.08550164]
Validation weights_info: {'gini': np.float64(0.055848754592746026), 'ess': np.float64(9879.944237835954), 'max_wi': np.float64(1.3503342517625292), 'min_wi': np.float64(0.7867379851081521)}
Estimated reward: 0.077419
Cross-validated error: 0.000512
Final score CI (reward +- 2*error): [0.076396, 0.078442]
Standard error: 0.002691
Final t_dist CI (reward +- t_0.975*se_hat): [0.072144, 0.082694]
[I 2025-11-11 10:28:25,118] Trial 2 finished with value: 0.07639592154962857 and parameters: {'lr': 0.00013584570570962097, 'num_epochs': 3, 'batch_size': 256, 'lr_decay': 0.9685543051034167}. Best is trial 1 with value: 0.08233145803319308.

[Optuna Trial 3]


Best trial: 1. Best value: 0.0823315:  20%|██        | 4/20 [00:08<00:28,  1.75s/it]

actual reward: [0.08301486]
Validation weights_info: {'gini': np.float64(0.9983370086774102), 'ess': np.float64(5.836827866388359), 'max_wi': np.float64(2122.1629376534606), 'min_wi': np.float64(2.153822536017958e-11)}
[I 2025-11-11 10:28:26,343] Trial 3 finished with value: -inf and parameters: {'lr': 0.03188758259009122, 'num_epochs': 2, 'batch_size': 256, 'lr_decay': 0.829173653582937}. Best is trial 1 with value: 0.08233145803319308.

[Optuna Trial 4]


Best trial: 1. Best value: 0.0823315:  25%|██▌       | 5/20 [00:09<00:25,  1.67s/it]

actual reward: [0.08544358]
Validation weights_info: {'gini': np.float64(0.09351289573836195), 'ess': np.float64(9659.534377098083), 'max_wi': np.float64(1.6400148045177072), 'min_wi': np.float64(0.6607249310792992)}
Estimated reward: 0.077392
Cross-validated error: 0.000515
Final score CI (reward +- 2*error): [0.076363, 0.078422]
Standard error: 0.002721
Final t_dist CI (reward +- t_0.975*se_hat): [0.072059, 0.082726]
[I 2025-11-11 10:28:27,860] Trial 4 finished with value: 0.07636278161592831 and parameters: {'lr': 0.00016602827504398602, 'num_epochs': 4, 'batch_size': 256, 'lr_decay': 0.9706720716641333}. Best is trial 1 with value: 0.08233145803319308.

[Optuna Trial 5]


Best trial: 1. Best value: 0.0823315:  30%|███       | 6/20 [00:11<00:23,  1.71s/it]

actual reward: [0.08765528]
Validation weights_info: {'gini': np.float64(0.993312791118524), 'ess': np.float64(11.666677075039782), 'max_wi': np.float64(1106.3663520859016), 'min_wi': np.float64(1.8672657260620447e-16)}
[I 2025-11-11 10:28:29,653] Trial 5 finished with value: -inf and parameters: {'lr': 0.0401163783306309, 'num_epochs': 7, 'batch_size': 256, 'lr_decay': 0.9246616889154258}. Best is trial 1 with value: 0.08233145803319308.

[Optuna Trial 6]


Best trial: 1. Best value: 0.0823315:  35%|███▌      | 7/20 [00:13<00:22,  1.69s/it]

actual reward: [0.08627803]
Validation weights_info: {'gini': np.float64(0.9974101283659192), 'ess': np.float64(12.827569448164983), 'max_wi': np.float64(920.7877512487391), 'min_wi': np.float64(8.864078998180866e-12)}
[I 2025-11-11 10:28:31,316] Trial 6 finished with value: -inf and parameters: {'lr': 0.027966014247410194, 'num_epochs': 4, 'batch_size': 256, 'lr_decay': 0.8809172153118924}. Best is trial 1 with value: 0.08233145803319308.

[Optuna Trial 7]


Best trial: 1. Best value: 0.0823315:  40%|████      | 8/20 [00:19<00:36,  3.05s/it]

actual reward: [0.08392448]
Validation weights_info: {'gini': np.float64(0.889882209731742), 'ess': np.float64(749.3713401224691), 'max_wi': np.float64(87.72823655880858), 'min_wi': np.float64(0.0003227516901492897)}
Estimated reward: 0.075501
Cross-validated error: 0.001625
Final score CI (reward +- 2*error): [0.072251, 0.078752]
Standard error: 0.008209
Final t_dist CI (reward +- t_0.975*se_hat): [0.059410, 0.091593]
[I 2025-11-11 10:28:37,278] Trial 7 finished with value: 0.07225064451647249 and parameters: {'lr': 0.0006253710386305763, 'num_epochs': 10, 'batch_size': 64, 'lr_decay': 0.9357959251591885}. Best is trial 1 with value: 0.08233145803319308.

[Optuna Trial 8]


Best trial: 1. Best value: 0.0823315:  45%|████▌     | 9/20 [00:20<00:28,  2.57s/it]

actual reward: [0.08275656]
Validation weights_info: {'gini': np.float64(0.8707721515214227), 'ess': np.float64(1029.2568977865608), 'max_wi': np.float64(35.51905537213541), 'min_wi': np.float64(0.0006817362010623573)}
Estimated reward: 0.072825
Cross-validated error: 0.001725
Final score CI (reward +- 2*error): [0.069375, 0.076275]
Standard error: 0.007593
Final t_dist CI (reward +- t_0.975*se_hat): [0.057941, 0.087709]
[I 2025-11-11 10:28:38,800] Trial 8 finished with value: 0.06937530509923881 and parameters: {'lr': 0.003552164058126792, 'num_epochs': 4, 'batch_size': 256, 'lr_decay': 0.8011779933483303}. Best is trial 1 with value: 0.08233145803319308.

[Optuna Trial 9]


Best trial: 1. Best value: 0.0823315:  50%|█████     | 10/20 [00:23<00:27,  2.73s/it]

actual reward: [0.08486882]
Validation weights_info: {'gini': np.float64(0.3409204118782075), 'ess': np.float64(6281.341322587773), 'max_wi': np.float64(4.872961447873544), 'min_wi': np.float64(0.1732908686019681)}
Estimated reward: 0.077625
Cross-validated error: 0.000727
Final score CI (reward +- 2*error): [0.076170, 0.079080]
Standard error: 0.003357
Final t_dist CI (reward +- t_0.975*se_hat): [0.071046, 0.084205]
[I 2025-11-11 10:28:41,882] Trial 9 finished with value: 0.07617048397599238 and parameters: {'lr': 0.0001825371019062099, 'num_epochs': 8, 'batch_size': 128, 'lr_decay': 0.9669611711083667}. Best is trial 1 with value: 0.08233145803319308.

[Optuna Trial 10]


Best trial: 1. Best value: 0.0823315:  55%|█████▌    | 11/20 [00:24<00:19,  2.18s/it]

actual reward: [0.08477773]
Validation weights_info: {'gini': np.float64(0.37925219368652446), 'ess': np.float64(5798.738268857747), 'max_wi': np.float64(4.897276621301122), 'min_wi': np.float64(0.16607586667056815)}
Estimated reward: 0.078133
Cross-validated error: 0.000725
Final score CI (reward +- 2*error): [0.076683, 0.079582]
Standard error: 0.003516
Final t_dist CI (reward +- t_0.975*se_hat): [0.071241, 0.085024]
[I 2025-11-11 10:28:42,808] Trial 10 finished with value: 0.07668343230806178 and parameters: {'lr': 0.0041733173015305524, 'num_epochs': 1, 'batch_size': 512, 'lr_decay': 0.9986357357519391}. Best is trial 1 with value: 0.08233145803319308.

[Optuna Trial 11]


Best trial: 1. Best value: 0.0823315:  60%|██████    | 12/20 [00:25<00:14,  1.84s/it]

actual reward: [0.08453366]
Validation weights_info: {'gini': np.float64(0.43725304020957845), 'ess': np.float64(4980.793006030885), 'max_wi': np.float64(5.635447468115189), 'min_wi': np.float64(0.11321610109693875)}
Estimated reward: 0.077985
Cross-validated error: 0.000607
Final score CI (reward +- 2*error): [0.076771, 0.079199]
Standard error: 0.003768
Final t_dist CI (reward +- t_0.975*se_hat): [0.070599, 0.085371]
[I 2025-11-11 10:28:43,888] Trial 11 finished with value: 0.07677061594067587 and parameters: {'lr': 0.004730571136031322, 'num_epochs': 1, 'batch_size': 512, 'lr_decay': 0.9989657717209731}. Best is trial 1 with value: 0.08233145803319308.

[Optuna Trial 12]


Best trial: 1. Best value: 0.0823315:  65%|██████▌   | 13/20 [00:26<00:11,  1.58s/it]

actual reward: [0.08289439]
Validation weights_info: {'gini': np.float64(0.6689527210237688), 'ess': np.float64(2390.929185406415), 'max_wi': np.float64(13.721406188078403), 'min_wi': np.float64(0.021589635316437355)}
Estimated reward: 0.076321
Cross-validated error: 0.000879
Final score CI (reward +- 2*error): [0.074563, 0.078078]
Standard error: 0.005231
Final t_dist CI (reward +- t_0.975*se_hat): [0.066067, 0.086575]
[I 2025-11-11 10:28:44,874] Trial 12 finished with value: 0.07456347484260568 and parameters: {'lr': 0.00814643676159549, 'num_epochs': 1, 'batch_size': 512, 'lr_decay': 0.9988146036027228}. Best is trial 1 with value: 0.08233145803319308.

[Optuna Trial 13]


Best trial: 1. Best value: 0.0823315:  70%|███████   | 14/20 [00:27<00:08,  1.42s/it]

actual reward: [0.08528935]
Validation weights_info: {'gini': np.float64(0.18084691467179637), 'ess': np.float64(8753.078383535742), 'max_wi': np.float64(2.42054564278245), 'min_wi': np.float64(0.45079444007075314)}
Estimated reward: 0.077756
Cross-validated error: 0.000560
Final score CI (reward +- 2*error): [0.076635, 0.078876]
Standard error: 0.002876
Final t_dist CI (reward +- t_0.975*se_hat): [0.072119, 0.083393]
[I 2025-11-11 10:28:45,916] Trial 13 finished with value: 0.07663511381407387 and parameters: {'lr': 0.0010780788751966056, 'num_epochs': 2, 'batch_size': 512, 'lr_decay': 0.9318855792684234}. Best is trial 1 with value: 0.08233145803319308.

[Optuna Trial 14]


Best trial: 1. Best value: 0.0823315:  75%|███████▌  | 15/20 [00:29<00:07,  1.54s/it]

actual reward: [0.08483119]
Validation weights_info: {'gini': np.float64(0.995195987812627), 'ess': np.float64(3.1457652284113533), 'max_wi': np.float64(3733.5959299344795), 'min_wi': np.float64(4.247920723727444e-09)}
[I 2025-11-11 10:28:47,739] Trial 14 finished with value: -inf and parameters: {'lr': 0.010903854640575647, 'num_epochs': 2, 'batch_size': 64, 'lr_decay': 0.8988883526365501}. Best is trial 1 with value: 0.08233145803319308.

[Optuna Trial 15]


Best trial: 1. Best value: 0.0823315:  80%|████████  | 16/20 [00:32<00:07,  1.88s/it]

actual reward: [0.08211782]
Validation weights_info: {'gini': np.float64(0.8537611941570338), 'ess': np.float64(1081.2262620483052), 'max_wi': np.float64(33.36058623501828), 'min_wi': np.float64(0.0009245299395343392)}
Estimated reward: 0.072332
Cross-validated error: 0.001440
Final score CI (reward +- 2*error): [0.069452, 0.075211]
Standard error: 0.007376
Final t_dist CI (reward +- t_0.975*se_hat): [0.057874, 0.086790]
[I 2025-11-11 10:28:50,393] Trial 15 finished with value: 0.06945235311618576 and parameters: {'lr': 0.0009890130697283574, 'num_epochs': 6, 'batch_size': 128, 'lr_decay': 0.9567185556209279}. Best is trial 1 with value: 0.08233145803319308.

[Optuna Trial 16]


Best trial: 1. Best value: 0.0823315:  85%|████████▌ | 17/20 [00:33<00:05,  1.72s/it]

actual reward: [0.08757711]
Validation weights_info: {'gini': np.float64(0.940684744397744), 'ess': np.float64(172.54748355508238), 'max_wi': np.float64(257.9244783729492), 'min_wi': np.float64(1.7196248940923803e-05)}
Estimated reward: 0.091479
Cross-validated error: 0.004723
Final score CI (reward +- 2*error): [0.082034, 0.100924]
Standard error: 0.020358
Final t_dist CI (reward +- t_0.975*se_hat): [0.051573, 0.131385]
[I 2025-11-11 10:28:51,733] Trial 16 finished with value: 0.08203400698691095 and parameters: {'lr': 0.012365738209943406, 'num_epochs': 3, 'batch_size': 512, 'lr_decay': 0.9989417074574427}. Best is trial 1 with value: 0.08233145803319308.

[Optuna Trial 17]


Best trial: 1. Best value: 0.0823315:  90%|█████████ | 18/20 [00:35<00:03,  1.88s/it]

actual reward: [0.08088112]
Validation weights_info: {'gini': np.float64(0.9832377537919622), 'ess': np.float64(13.383088846075658), 'max_wi': np.float64(1965.520819227356), 'min_wi': np.float64(0.0)}
[I 2025-11-11 10:28:53,985] Trial 17 finished with value: -inf and parameters: {'lr': 0.0712873013754599, 'num_epochs': 3, 'batch_size': 64, 'lr_decay': 0.9486004650671929}. Best is trial 1 with value: 0.08233145803319308.

[Optuna Trial 18]


Best trial: 1. Best value: 0.0823315:  95%|█████████▌| 19/20 [00:37<00:01,  1.70s/it]

actual reward: [0.08488684]
Validation weights_info: {'gini': np.float64(0.9974211407501455), 'ess': np.float64(10.073528698429106), 'max_wi': np.float64(1476.201068876673), 'min_wi': np.float64(8.452724142914632e-09)}
[I 2025-11-11 10:28:55,269] Trial 18 finished with value: -inf and parameters: {'lr': 0.01554697205276518, 'num_epochs': 5, 'batch_size': 512, 'lr_decay': 0.9012749488355477}. Best is trial 1 with value: 0.08233145803319308.

[Optuna Trial 19]


Best trial: 1. Best value: 0.0823315: 100%|██████████| 20/20 [00:39<00:00,  1.97s/it]

actual reward: [0.08828582]
Validation weights_info: {'gini': np.float64(0.978827475601503), 'ess': np.float64(79.38455698734379), 'max_wi': np.float64(301.28295407544095), 'min_wi': np.float64(5.754433408704174e-15)}
[I 2025-11-11 10:28:57,494] Trial 19 finished with value: -inf and parameters: {'lr': 0.01960965217969486, 'num_epochs': 3, 'batch_size': 64, 'lr_decay': 0.9814068848767098}. Best is trial 1 with value: 0.08233145803319308.





Num samples is 10000
{'gini': np.float64(0.9987226202435971), 'ess': np.float64(5.388429362231723), 'max_wi': np.float64(2142.8669808181057), 'min_wi': np.float64(1.1766681604947649e-15)}
Eval time: 0.440662145614624 seconds
Evaluation total results time: 0.7724523544311523 seconds


Unnamed: 0,policy_rewards,ipw,reg_dm,conv_dm,conv_dr,conv_sndr,action_diff_to_real,action_delta,context_diff_to_real,context_delta
0,0.08557719,0.0851,0.08526555,0.08526555,0.08522026,0.08522026,0.82618217,0.0,0.99950468,0.0
15000,0.08601447,0.06632353,0.10105168,0.10105168,0.04117202,0.07005408,1.01849706,0.53131939,1.16716929,0.40423684


In [7]:
# Configuration for the synthetic dataset handed to generate_dataset.
dataset_params = {
    "n_actions": 500,
    "n_users": 500,
    "emb_dim": 16,
    # "sigma": 0.1,  # disabled option, kept for reference
    "eps": 0.6,  # epsilon of the noise in the ground-truth policy representation
    "ctr": 0.1,
}

# Build the training dataset for this experiment with a fixed seed.
train_dataset = generate_dataset(dataset_params, seed=20000)

Random Item CTR: 0.07042251854546815
Optimal greedy CTR: 0.09999934264692525
Second Best greedy CTR: 0.09938443255799592
Optimal Stochastic CTR: 0.09996075464321043
second Best Stochastic CTR: 0.08632684639469405
Our Initial CTR: 0.08647580588501355


In [8]:
# Run the optimization
df6, best_hyperparams_by_size = regression_trainer_trial(num_runs, num_neighbors, num_rounds_list, train_dataset, batch_size, val_size=10000, n_trials=n_trials_for_optuna, prev_best_params=best_params_to_use)

# Show the performance metrics
df6[['policy_rewards', 'ipw', 'reg_dm', 'conv_dm', 'conv_dr', 'conv_sndr', 'action_diff_to_real', 'action_delta', 'context_diff_to_real', 'context_delta']]

Simulation time for 10000 samples: 0.031612396240234375 seconds
Baseline regression model fit time: 0.08s
Num samples is 10000
{'gini': np.float64(0.47832133724352016), 'ess': np.float64(4390.178515887508), 'max_wi': np.float64(26.085211529324706), 'min_wi': np.float64(0.013765942512278155)}
Eval time: 0.5186388492584229 seconds


[I 2025-11-11 10:29:00,637] A new study created in memory with name: no-name-b687f0e0-c293-4380-ac36-248c98666e07


Evaluation total results time: 0.9000699520111084 seconds

=== Training size 15000, run 0 ===
Simulation time for 25000 samples: 0.06534886360168457 seconds


  0%|          | 0/20 [00:00<?, ?it/s]


[Optuna Trial 0]


Best trial: 0. Best value: -inf:   5%|▌         | 1/20 [00:03<01:06,  3.50s/it]

actual reward: [0.08642292]
Validation weights_info: {'gini': np.float64(0.9943562018804105), 'ess': np.float64(29.91480605617993), 'max_wi': np.float64(1561.0550219110826), 'min_wi': np.float64(0.0)}
[I 2025-11-11 10:29:04,136] Trial 0 finished with value: -inf and parameters: {'lr': 0.096, 'num_epochs': 5, 'batch_size': 64, 'lr_decay': 0.85}. Best is trial 0 with value: -inf.

[Optuna Trial 1]


Best trial: 1. Best value: 0.0800241:  10%|█         | 2/20 [00:05<00:44,  2.46s/it]

actual reward: [0.08685457]
Validation weights_info: {'gini': np.float64(0.16029663139070166), 'ess': np.float64(9177.000304255038), 'max_wi': np.float64(2.239245693715279), 'min_wi': np.float64(0.48065801720946416)}
Estimated reward: 0.081283
Cross-validated error: 0.000630
Final score CI (reward +- 2*error): [0.080024, 0.082543]
Standard error: 0.002898
Final t_dist CI (reward +- t_0.975*se_hat): [0.075604, 0.086963]
[I 2025-11-11 10:29:05,865] Trial 1 finished with value: 0.08002410523045916 and parameters: {'lr': 0.0007592134066367055, 'num_epochs': 4, 'batch_size': 256, 'lr_decay': 0.8944722990066727}. Best is trial 1 with value: 0.08002410523045916.

[Optuna Trial 2]


Best trial: 1. Best value: 0.0800241:  15%|█▌        | 3/20 [00:08<00:50,  2.96s/it]

actual reward: [0.08678128]
Validation weights_info: {'gini': np.float64(0.13221422190990678), 'ess': np.float64(9433.120803271993), 'max_wi': np.float64(1.9963050605378483), 'min_wi': np.float64(0.5513977597602704)}
Estimated reward: 0.080958
Cross-validated error: 0.000572
Final score CI (reward +- 2*error): [0.079814, 0.082102]
Standard error: 0.002843
Final t_dist CI (reward +- t_0.975*se_hat): [0.075385, 0.086531]
[I 2025-11-11 10:29:09,419] Trial 2 finished with value: 0.07981417351506907 and parameters: {'lr': 0.0002294173267685256, 'num_epochs': 9, 'batch_size': 128, 'lr_decay': 0.9057922981021628}. Best is trial 1 with value: 0.08002410523045916.

[Optuna Trial 3]


Best trial: 3. Best value: 0.0861605:  20%|██        | 4/20 [00:10<00:39,  2.48s/it]

actual reward: [0.08875151]
Validation weights_info: {'gini': np.float64(0.8299958680047458), 'ess': np.float64(466.83599298439424), 'max_wi': np.float64(356.28193704002524), 'min_wi': np.float64(0.001921294911711908)}
Estimated reward: 0.089461
Cross-validated error: 0.001650
Final score CI (reward +- 2*error): [0.086161, 0.092761]
Standard error: 0.009075
Final t_dist CI (reward +- t_0.975*se_hat): [0.071673, 0.107249]
[I 2025-11-11 10:29:11,167] Trial 3 finished with value: 0.08616051936609728 and parameters: {'lr': 0.00925607449090513, 'num_epochs': 1, 'batch_size': 64, 'lr_decay': 0.9902542074990384}. Best is trial 3 with value: 0.08616051936609728.

[Optuna Trial 4]


Best trial: 3. Best value: 0.0861605:  25%|██▌       | 5/20 [00:15<00:51,  3.42s/it]

actual reward: [0.08832543]
Validation weights_info: {'gini': np.float64(0.9876629750154217), 'ess': np.float64(34.856420182445206), 'max_wi': np.float64(1290.945992778856), 'min_wi': np.float64(6.789449756499229e-09)}
[I 2025-11-11 10:29:16,239] Trial 4 finished with value: -inf and parameters: {'lr': 0.0059096657118626215, 'num_epochs': 8, 'batch_size': 64, 'lr_decay': 0.9051672436818909}. Best is trial 3 with value: 0.08616051936609728.

[Optuna Trial 5]


Best trial: 5. Best value: 0.0899304:  30%|███       | 6/20 [00:16<00:36,  2.63s/it]

actual reward: [0.08867395]
Validation weights_info: {'gini': np.float64(0.7752191426014662), 'ess': np.float64(1211.1704156304913), 'max_wi': np.float64(39.91373682041884), 'min_wi': np.float64(0.0041062011121194624)}
Estimated reward: 0.093258
Cross-validated error: 0.001664
Final score CI (reward +- 2*error): [0.089930, 0.096585]
Standard error: 0.008586
Final t_dist CI (reward +- t_0.975*se_hat): [0.076427, 0.110088]
[I 2025-11-11 10:29:17,357] Trial 5 finished with value: 0.08993042580543288 and parameters: {'lr': 0.010360865602846955, 'num_epochs': 3, 'batch_size': 512, 'lr_decay': 0.8307245830501109}. Best is trial 5 with value: 0.08993042580543288.

[Optuna Trial 6]


Best trial: 5. Best value: 0.0899304:  35%|███▌      | 7/20 [00:17<00:27,  2.14s/it]

actual reward: [0.08660265]
Validation weights_info: {'gini': np.float64(0.055031574858890625), 'ess': np.float64(9901.315512483721), 'max_wi': np.float64(1.330621221836836), 'min_wi': np.float64(0.7920922864888891)}
Estimated reward: 0.080217
Cross-validated error: 0.000548
Final score CI (reward +- 2*error): [0.079122, 0.081312]
Standard error: 0.002742
Final t_dist CI (reward +- t_0.975*se_hat): [0.074842, 0.085593]
[I 2025-11-11 10:29:18,480] Trial 6 finished with value: 0.07912226405655502 and parameters: {'lr': 0.0013999434719654787, 'num_epochs': 1, 'batch_size': 512, 'lr_decay': 0.8322956580990247}. Best is trial 5 with value: 0.08993042580543288.

[Optuna Trial 7]


Best trial: 5. Best value: 0.0899304:  40%|████      | 8/20 [00:20<00:29,  2.42s/it]

actual reward: [0.08903702]
Validation weights_info: {'gini': np.float64(0.9509053514065124), 'ess': np.float64(161.62372305131584), 'max_wi': np.float64(455.01241134331696), 'min_wi': np.float64(6.233483068906169e-06)}
Estimated reward: 0.070100
Cross-validated error: 0.002483
Final score CI (reward +- 2*error): [0.065134, 0.075067]
Standard error: 0.010476
Final t_dist CI (reward +- t_0.975*se_hat): [0.049565, 0.090636]
[I 2025-11-11 10:29:21,495] Trial 7 finished with value: 0.06513418460143784 and parameters: {'lr': 0.005674668226825837, 'num_epochs': 4, 'batch_size': 64, 'lr_decay': 0.8605564568115247}. Best is trial 5 with value: 0.08993042580543288.

[Optuna Trial 8]


Best trial: 5. Best value: 0.0899304:  45%|████▌     | 9/20 [00:22<00:23,  2.16s/it]

actual reward: [0.08771961]
Validation weights_info: {'gini': np.float64(0.9945711194830357), 'ess': np.float64(6.7926312939928), 'max_wi': np.float64(2901.087381940656), 'min_wi': np.float64(1.5234548114813348e-11)}
[I 2025-11-11 10:29:23,093] Trial 8 finished with value: -inf and parameters: {'lr': 0.038116458424273594, 'num_epochs': 2, 'batch_size': 128, 'lr_decay': 0.8053235374109577}. Best is trial 5 with value: 0.08993042580543288.

[Optuna Trial 9]


Best trial: 5. Best value: 0.0899304:  50%|█████     | 10/20 [00:26<00:27,  2.72s/it]

actual reward: [0.08807513]
Validation weights_info: {'gini': np.float64(0.5813796872944964), 'ess': np.float64(3295.565927999407), 'max_wi': np.float64(15.825752045589297), 'min_wi': np.float64(0.031316027728025726)}
Estimated reward: 0.087641
Cross-validated error: 0.001036
Final score CI (reward +- 2*error): [0.085568, 0.089714]
Standard error: 0.005134
Final t_dist CI (reward +- t_0.975*se_hat): [0.077578, 0.097704]
[I 2025-11-11 10:29:27,065] Trial 9 finished with value: 0.08556813471315235 and parameters: {'lr': 0.001019796043341646, 'num_epochs': 10, 'batch_size': 128, 'lr_decay': 0.9164765951228201}. Best is trial 5 with value: 0.08993042580543288.

[Optuna Trial 10]


Best trial: 5. Best value: 0.0899304:  55%|█████▌    | 11/20 [00:28<00:21,  2.43s/it]

actual reward: [0.09396204]
Validation weights_info: {'gini': np.float64(0.9805916734475715), 'ess': np.float64(187.59826731719457), 'max_wi': np.float64(122.59684111636778), 'min_wi': np.float64(1.7773982022280237e-08)}
Estimated reward: 0.097111
Cross-validated error: 0.004397
Final score CI (reward +- 2*error): [0.088318, 0.105904]
Standard error: 0.021451
Final t_dist CI (reward +- t_0.975*se_hat): [0.055063, 0.139159]
[I 2025-11-11 10:29:28,844] Trial 10 finished with value: 0.08831752372001578 and parameters: {'lr': 0.020015711041983023, 'num_epochs': 7, 'batch_size': 512, 'lr_decay': 0.9663012955921852}. Best is trial 5 with value: 0.08993042580543288.

[Optuna Trial 11]


Best trial: 5. Best value: 0.0899304:  60%|██████    | 12/20 [00:30<00:18,  2.28s/it]

actual reward: [0.09239506]
Validation weights_info: {'gini': np.float64(0.9802677223225895), 'ess': np.float64(191.92985027070276), 'max_wi': np.float64(193.00770109895103), 'min_wi': np.float64(2.5452212078618634e-08)}
Estimated reward: 0.082811
Cross-validated error: 0.004009
Final score CI (reward +- 2*error): [0.074793, 0.090829]
Standard error: 0.018173
Final t_dist CI (reward +- t_0.975*se_hat): [0.047188, 0.118434]
[I 2025-11-11 10:29:30,767] Trial 11 finished with value: 0.07479295352202267 and parameters: {'lr': 0.019151320879724546, 'num_epochs': 7, 'batch_size': 512, 'lr_decay': 0.9930438941888624}. Best is trial 5 with value: 0.08993042580543288.

[Optuna Trial 12]


Best trial: 12. Best value: 0.09656:  65%|██████▌   | 13/20 [00:31<00:15,  2.14s/it] 

actual reward: [0.09084536]
Validation weights_info: {'gini': np.float64(0.9832396080086743), 'ess': np.float64(170.3321168183514), 'max_wi': np.float64(162.3869008668016), 'min_wi': np.float64(4.038601188236052e-09)}
Estimated reward: 0.105866
Cross-validated error: 0.004653
Final score CI (reward +- 2*error): [0.096560, 0.115172]
Standard error: 0.024636
Final t_dist CI (reward +- t_0.975*se_hat): [0.057575, 0.154158]
[I 2025-11-11 10:29:32,602] Trial 12 finished with value: 0.09655997882887875 and parameters: {'lr': 0.025704203702542575, 'num_epochs': 6, 'batch_size': 512, 'lr_decay': 0.9458046041518479}. Best is trial 12 with value: 0.09655997882887875.

[Optuna Trial 13]


Best trial: 12. Best value: 0.09656:  70%|███████   | 14/20 [00:33<00:11,  1.91s/it]

actual reward: [0.08859671]
Validation weights_info: {'gini': np.float64(0.9904979767073587), 'ess': np.float64(78.79204394350113), 'max_wi': np.float64(626.2413639461519), 'min_wi': np.float64(6.432803300895057e-13)}
[I 2025-11-11 10:29:33,962] Trial 13 finished with value: -inf and parameters: {'lr': 0.08962868822833822, 'num_epochs': 3, 'batch_size': 512, 'lr_decay': 0.9389076102568089}. Best is trial 12 with value: 0.09655997882887875.

[Optuna Trial 14]


Best trial: 12. Best value: 0.09656:  75%|███████▌  | 15/20 [00:35<00:09,  1.85s/it]

actual reward: [0.09322606]
Validation weights_info: {'gini': np.float64(0.9812754624516987), 'ess': np.float64(174.64993135618576), 'max_wi': np.float64(163.30625058254526), 'min_wi': np.float64(6.40177315921051e-09)}
Estimated reward: 0.100162
Cross-validated error: 0.005777
Final score CI (reward +- 2*error): [0.088609, 0.111715]
Standard error: 0.025438
Final t_dist CI (reward +- t_0.975*se_hat): [0.050298, 0.150026]
[I 2025-11-11 10:29:35,674] Trial 14 finished with value: 0.08860852803225724 and parameters: {'lr': 0.024269166319333808, 'num_epochs': 6, 'batch_size': 512, 'lr_decay': 0.9459397276893118}. Best is trial 12 with value: 0.09655997882887875.

[Optuna Trial 15]


Best trial: 12. Best value: 0.09656:  80%|████████  | 16/20 [00:36<00:07,  1.85s/it]

actual reward: [0.08813617]
Validation weights_info: {'gini': np.float64(0.610910026409276), 'ess': np.float64(2853.9637318143514), 'max_wi': np.float64(21.481383940686598), 'min_wi': np.float64(0.020031643257421954)}
Estimated reward: 0.089172
Cross-validated error: 0.001200
Final score CI (reward +- 2*error): [0.086771, 0.091572]
Standard error: 0.005576
Final t_dist CI (reward +- t_0.975*se_hat): [0.078242, 0.100101]
[I 2025-11-11 10:29:37,526] Trial 15 finished with value: 0.08677145826267307 and parameters: {'lr': 0.002874037569771727, 'num_epochs': 5, 'batch_size': 256, 'lr_decay': 0.8768096462480616}. Best is trial 12 with value: 0.09655997882887875.

[Optuna Trial 16]


Best trial: 12. Best value: 0.09656:  85%|████████▌ | 17/20 [00:38<00:05,  1.71s/it]

actual reward: [0.08890858]
Validation weights_info: {'gini': np.float64(0.7627666067388036), 'ess': np.float64(1324.787154265603), 'max_wi': np.float64(46.031105518168005), 'min_wi': np.float64(0.0043132179228083855)}
Estimated reward: 0.091385
Cross-validated error: 0.001791
Final score CI (reward +- 2*error): [0.087803, 0.094968]
Standard error: 0.008042
Final t_dist CI (reward +- t_0.975*se_hat): [0.075621, 0.107149]
[I 2025-11-11 10:29:38,925] Trial 16 finished with value: 0.08780264993471995 and parameters: {'lr': 0.010340123625646838, 'num_epochs': 3, 'batch_size': 512, 'lr_decay': 0.8004717253058359}. Best is trial 12 with value: 0.09655997882887875.

[Optuna Trial 17]


Best trial: 12. Best value: 0.09656:  90%|█████████ | 18/20 [00:40<00:03,  1.81s/it]

actual reward: [0.09180273]
Validation weights_info: {'gini': np.float64(0.9855949807172962), 'ess': np.float64(152.10724588292464), 'max_wi': np.float64(223.94015976187234), 'min_wi': np.float64(1.9993265447925353e-11)}
Estimated reward: 0.087844
Cross-validated error: 0.004104
Final score CI (reward +- 2*error): [0.079637, 0.096052]
Standard error: 0.020904
Final t_dist CI (reward +- t_0.975*se_hat): [0.046869, 0.128820]
[I 2025-11-11 10:29:40,949] Trial 17 finished with value: 0.07963690734507894 and parameters: {'lr': 0.04824559386893675, 'num_epochs': 6, 'batch_size': 512, 'lr_decay': 0.9357452739750584}. Best is trial 12 with value: 0.09655997882887875.

[Optuna Trial 18]


Best trial: 12. Best value: 0.09656:  95%|█████████▌| 19/20 [00:41<00:01,  1.66s/it]

actual reward: [0.08729439]
Validation weights_info: {'gini': np.float64(0.327290554868338), 'ess': np.float64(6947.072369108858), 'max_wi': np.float64(4.718751770378085), 'min_wi': np.float64(0.23041763971400625)}
Estimated reward: 0.083431
Cross-validated error: 0.000711
Final score CI (reward +- 2*error): [0.082010, 0.084852]
Standard error: 0.003420
Final t_dist CI (reward +- t_0.975*se_hat): [0.076727, 0.090135]
[I 2025-11-11 10:29:42,265] Trial 18 finished with value: 0.08200956781642585 and parameters: {'lr': 0.0032869921712933817, 'num_epochs': 3, 'batch_size': 512, 'lr_decay': 0.8257104252548082}. Best is trial 12 with value: 0.09655997882887875.

[Optuna Trial 19]


Best trial: 12. Best value: 0.09656: 100%|██████████| 20/20 [00:43<00:00,  2.16s/it]

actual reward: [0.08654729]
Validation weights_info: {'gini': np.float64(0.031228390419143087), 'ess': np.float64(9968.459643879678), 'max_wi': np.float64(1.180269033449073), 'min_wi': np.float64(0.8801044749779101)}
Estimated reward: 0.080031
Cross-validated error: 0.000582
Final score CI (reward +- 2*error): [0.078867, 0.081195]
Standard error: 0.002725
Final t_dist CI (reward +- t_0.975*se_hat): [0.074689, 0.085372]
[I 2025-11-11 10:29:43,834] Trial 19 finished with value: 0.0788669096625849 and parameters: {'lr': 0.00015679113128144585, 'num_epochs': 4, 'batch_size': 256, 'lr_decay': 0.8791429287464113}. Best is trial 12 with value: 0.09655997882887875.





Num samples is 10000
{'gini': np.float64(0.9979605942920191), 'ess': np.float64(9.636999826630497), 'max_wi': np.float64(2184.915497471503), 'min_wi': np.float64(7.70155293902118e-13)}
Eval time: 0.4479806423187256 seconds
Evaluation total results time: 0.7891767024993896 seconds


Unnamed: 0,policy_rewards,ipw,reg_dm,conv_dm,conv_dr,conv_sndr,action_diff_to_real,action_delta,context_diff_to_real,context_delta
0,0.08647581,0.0868,0.08666819,0.08666819,0.08670696,0.08670696,0.88083979,0.0,0.74725465,0.0
15000,0.08243663,0.01799097,0.09043504,0.09043504,-0.08585435,0.01346057,1.11791414,0.471502,0.98206524,0.51830735


In [9]:
# Configuration for the synthetic dataset handed to generate_dataset.
dataset_params = {
    "n_actions": 500,
    "n_users": 500,
    "emb_dim": 16,
    # "sigma": 0.1,  # disabled option, kept for reference
    "eps": 0.6,  # epsilon of the noise in the ground-truth policy representation
    "ctr": 0.1,
}

# Build the training dataset for this experiment with a fixed seed.
train_dataset = generate_dataset(dataset_params, seed=30000)

Random Item CTR: 0.07069350185865088
Optimal greedy CTR: 0.09999918303816259
Second Best greedy CTR: 0.09988806364453348
Optimal Stochastic CTR: 0.0999509448932121
second Best Stochastic CTR: 0.0863879153702632
Our Initial CTR: 0.08653966603258505


In [10]:
# Run the optimization
df7, best_hyperparams_by_size = regression_trainer_trial(num_runs, num_neighbors, num_rounds_list, train_dataset, batch_size, val_size=10000, n_trials=n_trials_for_optuna, prev_best_params=best_params_to_use)

# Show the performance metrics
df7[['policy_rewards', 'ipw', 'reg_dm', 'conv_dm', 'conv_dr', 'conv_sndr', 'action_diff_to_real', 'action_delta', 'context_diff_to_real', 'context_delta']]

Simulation time for 10000 samples: 0.028983592987060547 seconds
Baseline regression model fit time: 0.08s
Num samples is 10000
{'gini': np.float64(0.45323370736369234), 'ess': np.float64(4680.315417494375), 'max_wi': np.float64(24.219661587126723), 'min_wi': np.float64(0.013248576676190449)}
Eval time: 0.48833703994750977 seconds


[I 2025-11-11 10:29:48,135] A new study created in memory with name: no-name-d093eab5-2b7d-4836-b240-4cb40cdb070f


Evaluation total results time: 0.9029138088226318 seconds

=== Training size 15000, run 0 ===
Simulation time for 25000 samples: 0.06693840026855469 seconds


  0%|          | 0/20 [00:00<?, ?it/s]


[Optuna Trial 0]


Best trial: 0. Best value: -inf:   5%|▌         | 1/20 [00:03<00:58,  3.11s/it]

actual reward: [0.08419051]
Validation weights_info: {'gini': np.float64(0.9925718025830521), 'ess': np.float64(51.63218215236139), 'max_wi': np.float64(1045.7690423109068), 'min_wi': np.float64(3.8001386308020455e-43)}
[I 2025-11-11 10:29:51,236] Trial 0 finished with value: -inf and parameters: {'lr': 0.096, 'num_epochs': 5, 'batch_size': 64, 'lr_decay': 0.85}. Best is trial 0 with value: -inf.

[Optuna Trial 1]


Best trial: 1. Best value: 0.0772531:  10%|█         | 2/20 [00:05<00:44,  2.45s/it]

actual reward: [0.09058048]
Validation weights_info: {'gini': np.float64(0.9216005075061392), 'ess': np.float64(308.5179245553122), 'max_wi': np.float64(149.9862884622826), 'min_wi': np.float64(1.98756476585513e-06)}
Estimated reward: 0.082931
Cross-validated error: 0.002839
Final score CI (reward +- 2*error): [0.077253, 0.088610]
Standard error: 0.015875
Final t_dist CI (reward +- t_0.975*se_hat): [0.051814, 0.114049]
[I 2025-11-11 10:29:53,228] Trial 1 finished with value: 0.07725305203645788 and parameters: {'lr': 0.004558602334572408, 'num_epochs': 8, 'batch_size': 256, 'lr_decay': 0.9160201642946809}. Best is trial 1 with value: 0.07725305203645788.

[Optuna Trial 2]


Best trial: 2. Best value: 0.0774555:  15%|█▌        | 3/20 [00:06<00:31,  1.87s/it]

actual reward: [0.08669256]
Validation weights_info: {'gini': np.float64(0.0560125151410304), 'ess': np.float64(9890.934781120002), 'max_wi': np.float64(1.5363119886440135), 'min_wi': np.float64(0.7905773124233992)}
Estimated reward: 0.078463
Cross-validated error: 0.000504
Final score CI (reward +- 2*error): [0.077455, 0.079470]
Standard error: 0.002717
Final t_dist CI (reward +- t_0.975*se_hat): [0.073138, 0.083788]
[I 2025-11-11 10:29:54,417] Trial 2 finished with value: 0.0774554511537253 and parameters: {'lr': 0.0006211482115928619, 'num_epochs': 3, 'batch_size': 512, 'lr_decay': 0.8194885715057094}. Best is trial 2 with value: 0.0774554511537253.

[Optuna Trial 3]


Best trial: 2. Best value: 0.0774555:  20%|██        | 4/20 [00:08<00:32,  2.00s/it]

actual reward: [0.08997396]
Validation weights_info: {'gini': np.float64(0.8913838047970307), 'ess': np.float64(356.22812121883925), 'max_wi': np.float64(156.8819922465768), 'min_wi': np.float64(3.606224709849791e-06)}
Estimated reward: 0.080597
Cross-validated error: 0.002823
Final score CI (reward +- 2*error): [0.074951, 0.086244]
Standard error: 0.014476
Final t_dist CI (reward +- t_0.975*se_hat): [0.052221, 0.108974]
[I 2025-11-11 10:29:56,613] Trial 3 finished with value: 0.07495084214896904 and parameters: {'lr': 0.006613184711977483, 'num_epochs': 10, 'batch_size': 512, 'lr_decay': 0.8463478036799741}. Best is trial 2 with value: 0.0774554511537253.

[Optuna Trial 4]


Best trial: 4. Best value: 0.0776177:  25%|██▌       | 5/20 [00:10<00:31,  2.07s/it]

actual reward: [0.08676921]
Validation weights_info: {'gini': np.float64(0.08707920717438179), 'ess': np.float64(9730.141711340693), 'max_wi': np.float64(2.0331880836975444), 'min_wi': np.float64(0.6905658610678926)}
Estimated reward: 0.078718
Cross-validated error: 0.000550
Final score CI (reward +- 2*error): [0.077618, 0.079818]
Standard error: 0.002749
Final t_dist CI (reward +- t_0.975*se_hat): [0.073330, 0.084105]
[I 2025-11-11 10:29:58,811] Trial 4 finished with value: 0.07761766302333961 and parameters: {'lr': 0.0001681276334394482, 'num_epochs': 9, 'batch_size': 256, 'lr_decay': 0.9942397673572932}. Best is trial 4 with value: 0.07761766302333961.

[Optuna Trial 5]


Best trial: 4. Best value: 0.0776177:  30%|███       | 6/20 [00:12<00:27,  1.95s/it]

actual reward: [0.0890299]
Validation weights_info: {'gini': np.float64(0.9940771324050878), 'ess': np.float64(31.975429476911305), 'max_wi': np.float64(990.447837922644), 'min_wi': np.float64(3.753791611734111e-18)}
[I 2025-11-11 10:30:00,537] Trial 5 finished with value: -inf and parameters: {'lr': 0.04172991533101517, 'num_epochs': 2, 'batch_size': 64, 'lr_decay': 0.9807270333452571}. Best is trial 4 with value: 0.07761766302333961.

[Optuna Trial 6]


Best trial: 4. Best value: 0.0776177:  35%|███▌      | 7/20 [00:13<00:23,  1.83s/it]

actual reward: [0.08676277]
Validation weights_info: {'gini': np.float64(0.0811357580331118), 'ess': np.float64(9769.614024065275), 'max_wi': np.float64(1.8558365749717187), 'min_wi': np.float64(0.7087106459224878)}
Estimated reward: 0.078578
Cross-validated error: 0.000515
Final score CI (reward +- 2*error): [0.077548, 0.079607]
Standard error: 0.002739
Final t_dist CI (reward +- t_0.975*se_hat): [0.073208, 0.083947]
[I 2025-11-11 10:30:02,105] Trial 6 finished with value: 0.07754812814510587 and parameters: {'lr': 0.000459725766711462, 'num_epochs': 6, 'batch_size': 512, 'lr_decay': 0.9124637764747457}. Best is trial 4 with value: 0.07761766302333961.

[Optuna Trial 7]


Best trial: 4. Best value: 0.0776177:  40%|████      | 8/20 [00:14<00:18,  1.51s/it]

actual reward: [0.08847708]
Validation weights_info: {'gini': np.float64(0.9277838875037718), 'ess': np.float64(331.66688516860904), 'max_wi': np.float64(107.13570803466165), 'min_wi': np.float64(1.4219658773893283e-07)}
Estimated reward: 0.081046
Cross-validated error: 0.003308
Final score CI (reward +- 2*error): [0.074430, 0.087662]
Standard error: 0.015630
Final t_dist CI (reward +- t_0.975*se_hat): [0.050407, 0.111685]
[I 2025-11-11 10:30:02,949] Trial 7 finished with value: 0.0744295685637572 and parameters: {'lr': 0.041599793874829004, 'num_epochs': 1, 'batch_size': 512, 'lr_decay': 0.9116058811331835}. Best is trial 4 with value: 0.07761766302333961.

[Optuna Trial 8]


Best trial: 8. Best value: 0.0833282:  45%|████▌     | 9/20 [00:17<00:19,  1.78s/it]

actual reward: [0.08797059]
Validation weights_info: {'gini': np.float64(0.5420604176565589), 'ess': np.float64(1863.2657996437936), 'max_wi': np.float64(45.61214343111103), 'min_wi': np.float64(0.050423275465960454)}
Estimated reward: 0.085439
Cross-validated error: 0.001056
Final score CI (reward +- 2*error): [0.083328, 0.087551]
Standard error: 0.004553
Final t_dist CI (reward +- t_0.975*se_hat): [0.076515, 0.094364]
[I 2025-11-11 10:30:05,328] Trial 8 finished with value: 0.0833282147324192 and parameters: {'lr': 0.0016917053475285992, 'num_epochs': 10, 'batch_size': 256, 'lr_decay': 0.8254498287540155}. Best is trial 8 with value: 0.0833282147324192.

[Optuna Trial 9]


Best trial: 8. Best value: 0.0833282:  50%|█████     | 10/20 [00:21<00:24,  2.43s/it]

actual reward: [0.08708295]
Validation weights_info: {'gini': np.float64(0.2161940965515375), 'ess': np.float64(8171.353542604778), 'max_wi': np.float64(6.342518080696763), 'min_wi': np.float64(0.37428699753598194)}
Estimated reward: 0.080399
Cross-validated error: 0.000585
Final score CI (reward +- 2*error): [0.079229, 0.081569]
Standard error: 0.003009
Final t_dist CI (reward +- t_0.975*se_hat): [0.074501, 0.086296]
[I 2025-11-11 10:30:09,197] Trial 9 finished with value: 0.07922904589723839 and parameters: {'lr': 0.0004103478302231146, 'num_epochs': 6, 'batch_size': 64, 'lr_decay': 0.8040018407591087}. Best is trial 8 with value: 0.0833282147324192.

[Optuna Trial 10]


Best trial: 10. Best value: 0.0838652:  55%|█████▌    | 11/20 [00:23<00:23,  2.57s/it]

actual reward: [0.0886586]
Validation weights_info: {'gini': np.float64(0.7207906506763863), 'ess': np.float64(1004.6016739883852), 'max_wi': np.float64(61.00870682623249), 'min_wi': np.float64(0.0042250408157143695)}
Estimated reward: 0.086790
Cross-validated error: 0.001463
Final score CI (reward +- 2*error): [0.083865, 0.089716]
Standard error: 0.006459
Final t_dist CI (reward +- t_0.975*se_hat): [0.074129, 0.099452]
[I 2025-11-11 10:30:12,087] Trial 10 finished with value: 0.08386520541987312 and parameters: {'lr': 0.0016784632204015662, 'num_epochs': 8, 'batch_size': 128, 'lr_decay': 0.8734436043413767}. Best is trial 10 with value: 0.08386520541987312.

[Optuna Trial 11]


Best trial: 11. Best value: 0.0852397:  60%|██████    | 12/20 [00:26<00:21,  2.69s/it]

actual reward: [0.08877316]
Validation weights_info: {'gini': np.float64(0.7486090819209666), 'ess': np.float64(765.3327018399219), 'max_wi': np.float64(73.37688482572355), 'min_wi': np.float64(0.0017381927682903074)}
Estimated reward: 0.088252
Cross-validated error: 0.001506
Final score CI (reward +- 2*error): [0.085240, 0.091265]
Standard error: 0.007252
Final t_dist CI (reward +- t_0.975*se_hat): [0.074037, 0.102468]
[I 2025-11-11 10:30:15,044] Trial 11 finished with value: 0.08523965000207892 and parameters: {'lr': 0.0018172166821971006, 'num_epochs': 8, 'batch_size': 128, 'lr_decay': 0.867743813910578}. Best is trial 11 with value: 0.08523965000207892.

[Optuna Trial 12]


Best trial: 11. Best value: 0.0852397:  65%|██████▌   | 13/20 [00:29<00:19,  2.81s/it]

actual reward: [0.088666]
Validation weights_info: {'gini': np.float64(0.7198634553748529), 'ess': np.float64(959.554355751789), 'max_wi': np.float64(63.66978546508539), 'min_wi': np.float64(0.00440401455739342)}
Estimated reward: 0.087172
Cross-validated error: 0.001564
Final score CI (reward +- 2*error): [0.084043, 0.090300]
Standard error: 0.006504
Final t_dist CI (reward +- t_0.975*se_hat): [0.074422, 0.099921]
[I 2025-11-11 10:30:18,130] Trial 12 finished with value: 0.084043451105818 and parameters: {'lr': 0.0016809968259139975, 'num_epochs': 8, 'batch_size': 128, 'lr_decay': 0.8750099282914389}. Best is trial 11 with value: 0.08523965000207892.

[Optuna Trial 13]


Best trial: 11. Best value: 0.0852397:  70%|███████   | 14/20 [00:32<00:16,  2.78s/it]

actual reward: [0.08631844]
Validation weights_info: {'gini': np.float64(0.9929412455232056), 'ess': np.float64(32.68013344110574), 'max_wi': np.float64(876.3324914103448), 'min_wi': np.float64(6.754144050275188e-11)}
[I 2025-11-11 10:30:20,844] Trial 13 finished with value: -inf and parameters: {'lr': 0.014072318908964985, 'num_epochs': 7, 'batch_size': 128, 'lr_decay': 0.882183721246566}. Best is trial 11 with value: 0.08523965000207892.

[Optuna Trial 14]


Best trial: 11. Best value: 0.0852397:  75%|███████▌  | 15/20 [00:34<00:12,  2.47s/it]

actual reward: [0.08810774]
Validation weights_info: {'gini': np.float64(0.596844795449346), 'ess': np.float64(1382.106805441786), 'max_wi': np.float64(55.55696727272314), 'min_wi': np.float64(0.019570942333832872)}
Estimated reward: 0.086418
Cross-validated error: 0.001129
Final score CI (reward +- 2*error): [0.084161, 0.088676]
Standard error: 0.004980
Final t_dist CI (reward +- t_0.975*se_hat): [0.076657, 0.096180]
[I 2025-11-11 10:30:22,600] Trial 14 finished with value: 0.08416073200304451 and parameters: {'lr': 0.0016583032988997288, 'num_epochs': 4, 'batch_size': 128, 'lr_decay': 0.9516116304912587}. Best is trial 11 with value: 0.08523965000207892.

[Optuna Trial 15]


Best trial: 11. Best value: 0.0852397:  80%|████████  | 16/20 [00:36<00:09,  2.28s/it]

actual reward: [0.08662688]
Validation weights_info: {'gini': np.float64(0.033422509611355844), 'ess': np.float64(9961.466774748027), 'max_wi': np.float64(1.326260070474725), 'min_wi': np.float64(0.8624642732084074)}
Estimated reward: 0.078178
Cross-validated error: 0.000520
Final score CI (reward +- 2*error): [0.077138, 0.079219]
Standard error: 0.002698
Final t_dist CI (reward +- t_0.975*se_hat): [0.072890, 0.083467]
[I 2025-11-11 10:30:24,448] Trial 15 finished with value: 0.07713803812698189 and parameters: {'lr': 0.00010460846803767811, 'num_epochs': 4, 'batch_size': 128, 'lr_decay': 0.9481077522073987}. Best is trial 11 with value: 0.08523965000207892.

[Optuna Trial 16]


Best trial: 11. Best value: 0.0852397:  85%|████████▌ | 17/20 [00:38<00:06,  2.15s/it]

actual reward: [0.09068919]
Validation weights_info: {'gini': np.float64(0.9772620966571766), 'ess': np.float64(102.12080429775047), 'max_wi': np.float64(538.5662928583943), 'min_wi': np.float64(1.2848421287985693e-07)}
Estimated reward: 0.084236
Cross-validated error: 0.004105
Final score CI (reward +- 2*error): [0.076026, 0.092446]
Standard error: 0.019134
Final t_dist CI (reward +- t_0.975*se_hat): [0.046730, 0.121742]
[I 2025-11-11 10:30:26,272] Trial 16 finished with value: 0.07602590964806169 and parameters: {'lr': 0.010207681027649327, 'num_epochs': 4, 'batch_size': 128, 'lr_decay': 0.9520071519613802}. Best is trial 11 with value: 0.08523965000207892.

[Optuna Trial 17]


Best trial: 11. Best value: 0.0852397:  90%|█████████ | 18/20 [00:40<00:04,  2.06s/it]

actual reward: [0.08752169]
Validation weights_info: {'gini': np.float64(0.39948222833549824), 'ess': np.float64(4159.1147958758365), 'max_wi': np.float64(22.835631378470445), 'min_wi': np.float64(0.13687686804151966)}
Estimated reward: 0.083229
Cross-validated error: 0.000751
Final score CI (reward +- 2*error): [0.081727, 0.084732]
Standard error: 0.003702
Final t_dist CI (reward +- t_0.975*se_hat): [0.075973, 0.090485]
[I 2025-11-11 10:30:28,135] Trial 17 finished with value: 0.08172651678665399 and parameters: {'lr': 0.0010392894450570063, 'num_epochs': 4, 'batch_size': 128, 'lr_decay': 0.9456627124357433}. Best is trial 11 with value: 0.08523965000207892.

[Optuna Trial 18]


Best trial: 11. Best value: 0.0852397:  95%|█████████▌| 19/20 [00:42<00:02,  2.25s/it]

actual reward: [0.08953177]
Validation weights_info: {'gini': np.float64(0.8650752852230232), 'ess': np.float64(481.31452610611194), 'max_wi': np.float64(104.26248517739607), 'min_wi': np.float64(9.257010937310435e-05)}
Estimated reward: 0.086864
Cross-validated error: 0.002770
Final score CI (reward +- 2*error): [0.081324, 0.092403]
Standard error: 0.013170
Final t_dist CI (reward +- t_0.975*se_hat): [0.061048, 0.112680]
[I 2025-11-11 10:30:30,811] Trial 18 finished with value: 0.0813241607405256 and parameters: {'lr': 0.002515149925647625, 'num_epochs': 7, 'batch_size': 128, 'lr_decay': 0.9289357292576451}. Best is trial 11 with value: 0.08523965000207892.

[Optuna Trial 19]


Best trial: 11. Best value: 0.0852397: 100%|██████████| 20/20 [00:44<00:00,  2.24s/it]

actual reward: [0.08685888]
Validation weights_info: {'gini': np.float64(0.12435049253445833), 'ess': np.float64(9423.6678870296), 'max_wi': np.float64(2.9472480151247105), 'min_wi': np.float64(0.5673125561419694)}
Estimated reward: 0.079177
Cross-validated error: 0.000520
Final score CI (reward +- 2*error): [0.078137, 0.080217]
Standard error: 0.002804
Final t_dist CI (reward +- t_0.975*se_hat): [0.073682, 0.084673]
[I 2025-11-11 10:30:32,974] Trial 19 finished with value: 0.07813730968012439 and parameters: {'lr': 0.00028424508058310516, 'num_epochs': 5, 'batch_size': 128, 'lr_decay': 0.9710851119565814}. Best is trial 11 with value: 0.08523965000207892.





Num samples is 10000
{'gini': np.float64(0.723724950605085), 'ess': np.float64(726.7124432023414), 'max_wi': np.float64(117.02163226947198), 'min_wi': np.float64(0.0009309712030512397)}
Eval time: 0.4665825366973877 seconds
Evaluation total results time: 0.8202934265136719 seconds


Unnamed: 0,policy_rewards,ipw,reg_dm,conv_dm,conv_dr,conv_sndr,action_diff_to_real,action_delta,context_diff_to_real,context_delta
0,0.08653967,0.0848,0.08462449,0.08462449,0.08463264,0.08463264,0.82469903,0.0,0.72168239,0.0
15000,0.08836419,0.08424836,0.08817228,0.08817228,0.08447442,0.08455173,0.85587533,0.14924689,0.72966759,0.0801863


In [11]:
# Configuration for the synthetic dataset used by the next training run.
dataset_params = {
    "n_actions": 500,
    "n_users": 500,
    "emb_dim": 16,
    # "sigma": 0.1,  # left disabled, as in the original configuration
    "eps": 0.6,  # epsilon for the noise in the ground truth policy representation
    "ctr": 0.1,
}

# Build the dataset with a fixed seed so this experiment can be regenerated.
train_dataset = generate_dataset(dataset_params, seed=40000)

Random Item CTR: 0.07053370144999074
Optimal greedy CTR: 0.09999936716169436
Second Best greedy CTR: 0.09676800930842865
Optimal Stochastic CTR: 0.09995563088920843
second Best Stochastic CTR: 0.08606322612964991
Our Initial CTR: 0.08622184481781218


In [12]:
# Run the optimization on the current `train_dataset`.
# The positional arguments and the Optuna settings are notebook-level
# variables that are expected to be defined in earlier cells.
df8, best_hyperparams_by_size = regression_trainer_trial(
    num_runs,
    num_neighbors,
    num_rounds_list,
    train_dataset,
    batch_size,
    val_size=10000,
    n_trials=n_trials_for_optuna,
    prev_best_params=best_params_to_use,
)

# Show the performance metrics (kept as the last expression so the
# notebook renders the resulting table).
df8[
    [
        'policy_rewards',
        'ipw',
        'reg_dm',
        'conv_dm',
        'conv_dr',
        'conv_sndr',
        'action_diff_to_real',
        'action_delta',
        'context_diff_to_real',
        'context_delta',
    ]
]

Simulation time for 10000 samples: 0.02944326400756836 seconds
Baseline regression model fit time: 0.08s
Num samples is 10000
{'gini': np.float64(0.47119836127671866), 'ess': np.float64(4165.36683113202), 'max_wi': np.float64(40.85082174315029), 'min_wi': np.float64(0.019546510541469567)}
Eval time: 0.4900679588317871 seconds


[I 2025-11-11 10:30:37,174] A new study created in memory with name: no-name-7eb2b8a7-0069-4e3a-a0d7-fee902c5cef4


Evaluation total results time: 0.9823775291442871 seconds

=== Training size 15000, run 0 ===
Simulation time for 25000 samples: 0.07058906555175781 seconds


  0%|          | 0/20 [00:00<?, ?it/s]


[Optuna Trial 0]


Best trial: 0. Best value: -inf:   5%|▌         | 1/20 [00:03<01:08,  3.62s/it]

actual reward: [0.07944448]
Validation weights_info: {'gini': np.float64(0.9934530237740461), 'ess': np.float64(35.55729769063597), 'max_wi': np.float64(1168.984027163312), 'min_wi': np.float64(0.0)}
[I 2025-11-11 10:30:40,797] Trial 0 finished with value: -inf and parameters: {'lr': 0.096, 'num_epochs': 5, 'batch_size': 64, 'lr_decay': 0.85}. Best is trial 0 with value: -inf.

[Optuna Trial 1]


Best trial: 0. Best value: -inf:  10%|█         | 2/20 [00:06<00:56,  3.12s/it]

actual reward: [0.08819419]
Validation weights_info: {'gini': np.float64(0.978227980001397), 'ess': np.float64(42.91509331035168), 'max_wi': np.float64(851.2561366386415), 'min_wi': np.float64(2.455761053326662e-06)}
[I 2025-11-11 10:30:43,557] Trial 1 finished with value: -inf and parameters: {'lr': 0.0062464436952037765, 'num_epochs': 10, 'batch_size': 256, 'lr_decay': 0.8817036537408403}. Best is trial 0 with value: -inf.

[Optuna Trial 2]


Best trial: 0. Best value: -inf:  15%|█▌        | 3/20 [00:10<01:02,  3.67s/it]

actual reward: [0.08977655]
Validation weights_info: {'gini': np.float64(0.9938520095638116), 'ess': np.float64(17.55834323910588), 'max_wi': np.float64(1248.6948333808846), 'min_wi': np.float64(8.051661485352563e-13)}
[I 2025-11-11 10:30:47,886] Trial 2 finished with value: -inf and parameters: {'lr': 0.01712228969107053, 'num_epochs': 6, 'batch_size': 64, 'lr_decay': 0.8361354716760131}. Best is trial 0 with value: -inf.

[Optuna Trial 3]


Best trial: 0. Best value: -inf:  20%|██        | 4/20 [00:16<01:14,  4.67s/it]

actual reward: [0.09135246]
Validation weights_info: {'gini': np.float64(0.9922315541183605), 'ess': np.float64(14.687052499925871), 'max_wi': np.float64(1571.731482025802), 'min_wi': np.float64(4.112161790637052e-14)}
[I 2025-11-11 10:30:54,086] Trial 3 finished with value: -inf and parameters: {'lr': 0.019989676230393875, 'num_epochs': 10, 'batch_size': 64, 'lr_decay': 0.8034567841807015}. Best is trial 0 with value: -inf.

[Optuna Trial 4]


Best trial: 0. Best value: -inf:  25%|██▌       | 5/20 [00:18<00:54,  3.65s/it]

actual reward: [0.08518189]
Validation weights_info: {'gini': np.float64(0.9962972708621654), 'ess': np.float64(15.39736368606405), 'max_wi': np.float64(997.2429002372841), 'min_wi': np.float64(1.0410315047458626e-10)}
[I 2025-11-11 10:30:55,942] Trial 4 finished with value: -inf and parameters: {'lr': 0.02531870600973349, 'num_epochs': 4, 'batch_size': 256, 'lr_decay': 0.87886554314764}. Best is trial 0 with value: -inf.

[Optuna Trial 5]


Best trial: 5. Best value: 0.0943208:  30%|███       | 6/20 [00:22<00:52,  3.74s/it]

actual reward: [0.08858154]
Validation weights_info: {'gini': np.float64(0.949619554964592), 'ess': np.float64(247.58341725737745), 'max_wi': np.float64(236.41843572660324), 'min_wi': np.float64(1.4501332724462716e-05)}
Estimated reward: 0.104147
Cross-validated error: 0.004913
Final score CI (reward +- 2*error): [0.094321, 0.113972]
Standard error: 0.025562
Final t_dist CI (reward +- t_0.975*se_hat): [0.054041, 0.154252]
[I 2025-11-11 10:30:59,855] Trial 5 finished with value: 0.09432077479434733 and parameters: {'lr': 0.0035534883051237345, 'num_epochs': 9, 'batch_size': 128, 'lr_decay': 0.8601005309058071}. Best is trial 5 with value: 0.09432077479434733.

[Optuna Trial 6]


Best trial: 5. Best value: 0.0943208:  35%|███▌      | 7/20 [00:24<00:38,  2.97s/it]

actual reward: [0.08703048]
Validation weights_info: {'gini': np.float64(0.6236678799574213), 'ess': np.float64(2878.2856511793034), 'max_wi': np.float64(13.901796031310148), 'min_wi': np.float64(0.021651173326996252)}
Estimated reward: 0.080785
Cross-validated error: 0.000945
Final score CI (reward +- 2*error): [0.078894, 0.082676]
Standard error: 0.005229
Final t_dist CI (reward +- t_0.975*se_hat): [0.070535, 0.091034]
[I 2025-11-11 10:31:01,239] Trial 6 finished with value: 0.07889380628534583 and parameters: {'lr': 0.011789817594174261, 'num_epochs': 1, 'batch_size': 512, 'lr_decay': 0.9204332770210968}. Best is trial 5 with value: 0.09432077479434733.

[Optuna Trial 7]


Best trial: 5. Best value: 0.0943208:  40%|████      | 8/20 [00:25<00:29,  2.42s/it]

actual reward: [0.08643603]
Validation weights_info: {'gini': np.float64(0.8496037550016757), 'ess': np.float64(978.6219434992175), 'max_wi': np.float64(37.86896687949601), 'min_wi': np.float64(0.0009296080295231808)}
Estimated reward: 0.084832
Cross-validated error: 0.002223
Final score CI (reward +- 2*error): [0.080385, 0.089279]
Standard error: 0.009300
Final t_dist CI (reward +- t_0.975*se_hat): [0.066603, 0.103061]
[I 2025-11-11 10:31:02,483] Trial 7 finished with value: 0.0803847689105423 and parameters: {'lr': 0.021174493389147828, 'num_epochs': 1, 'batch_size': 512, 'lr_decay': 0.9161912578229731}. Best is trial 5 with value: 0.09432077479434733.

[Optuna Trial 8]


Best trial: 5. Best value: 0.0943208:  45%|████▌     | 9/20 [00:27<00:26,  2.43s/it]

actual reward: [0.08946025]
Validation weights_info: {'gini': np.float64(0.993800764015656), 'ess': np.float64(13.897791868174762), 'max_wi': np.float64(1732.1391328991208), 'min_wi': np.float64(1.4607772128151566e-14)}
[I 2025-11-11 10:31:04,938] Trial 8 finished with value: -inf and parameters: {'lr': 0.029478186245442652, 'num_epochs': 5, 'batch_size': 128, 'lr_decay': 0.8677956328225468}. Best is trial 5 with value: 0.09432077479434733.

[Optuna Trial 9]


Best trial: 5. Best value: 0.0943208:  50%|█████     | 10/20 [00:29<00:22,  2.26s/it]

actual reward: [0.08633557]
Validation weights_info: {'gini': np.float64(0.09026867093486801), 'ess': np.float64(9723.872323919251), 'max_wi': np.float64(1.5795715557535284), 'min_wi': np.float64(0.6672059360438624)}
Estimated reward: 0.077201
Cross-validated error: 0.000537
Final score CI (reward +- 2*error): [0.076128, 0.078275]
Standard error: 0.002707
Final t_dist CI (reward +- t_0.975*se_hat): [0.071895, 0.082508]
[I 2025-11-11 10:31:06,822] Trial 9 finished with value: 0.07612820650472346 and parameters: {'lr': 0.000316411072896943, 'num_epochs': 4, 'batch_size': 256, 'lr_decay': 0.8810867089940808}. Best is trial 5 with value: 0.09432077479434733.

[Optuna Trial 10]


Best trial: 5. Best value: 0.0943208:  55%|█████▌    | 11/20 [00:33<00:23,  2.64s/it]

actual reward: [0.0870702]
Validation weights_info: {'gini': np.float64(0.8178046638072115), 'ess': np.float64(1145.7292877101067), 'max_wi': np.float64(33.48240950185583), 'min_wi': np.float64(0.0008441867391173484)}
Estimated reward: 0.082378
Cross-validated error: 0.001486
Final score CI (reward +- 2*error): [0.079405, 0.085350]
Standard error: 0.008190
Final t_dist CI (reward +- t_0.975*se_hat): [0.066324, 0.098431]
[I 2025-11-11 10:31:10,316] Trial 10 finished with value: 0.07940513028696293 and parameters: {'lr': 0.0011157775461761632, 'num_epochs': 8, 'batch_size': 128, 'lr_decay': 0.9827802961480635}. Best is trial 5 with value: 0.09432077479434733.

[Optuna Trial 11]


Best trial: 5. Best value: 0.0943208:  60%|██████    | 12/20 [00:34<00:17,  2.20s/it]

actual reward: [0.0865313]
Validation weights_info: {'gini': np.float64(0.23203358248870765), 'ess': np.float64(8258.831160274147), 'max_wi': np.float64(3.174889516021036), 'min_wi': np.float64(0.32124494516458985)}
Estimated reward: 0.077611
Cross-validated error: 0.000557
Final score CI (reward +- 2*error): [0.076498, 0.078724]
Standard error: 0.002948
Final t_dist CI (reward +- t_0.975*se_hat): [0.071831, 0.083391]
[I 2025-11-11 10:31:11,526] Trial 11 finished with value: 0.076497525895251 and parameters: {'lr': 0.002159291672720129, 'num_epochs': 2, 'batch_size': 512, 'lr_decay': 0.9329070461718538}. Best is trial 5 with value: 0.09432077479434733.

[Optuna Trial 12]


Best trial: 5. Best value: 0.0943208:  65%|██████▌   | 13/20 [00:36<00:15,  2.15s/it]

actual reward: [0.08662817]
Validation weights_info: {'gini': np.float64(0.2790362047354602), 'ess': np.float64(7555.245691227479), 'max_wi': np.float64(3.959906894898514), 'min_wi': np.float64(0.25294699562661255)}
Estimated reward: 0.077654
Cross-validated error: 0.000635
Final score CI (reward +- 2*error): [0.076384, 0.078925]
Standard error: 0.003087
Final t_dist CI (reward +- t_0.975*se_hat): [0.071603, 0.083706]
[I 2025-11-11 10:31:13,561] Trial 12 finished with value: 0.07638404993913367 and parameters: {'lr': 0.0008273822813649414, 'num_epochs': 7, 'batch_size': 512, 'lr_decay': 0.9548029035984433}. Best is trial 5 with value: 0.09432077479434733.

[Optuna Trial 13]


Best trial: 13. Best value: 0.110518:  70%|███████   | 14/20 [00:39<00:15,  2.53s/it]

actual reward: [0.08870486]
Validation weights_info: {'gini': np.float64(0.9704181511957849), 'ess': np.float64(100.70012641368922), 'max_wi': np.float64(389.90406286012103), 'min_wi': np.float64(1.5785158595182832e-06)}
Estimated reward: 0.129133
Cross-validated error: 0.009308
Final score CI (reward +- 2*error): [0.110518, 0.147749]
Standard error: 0.044771
Final t_dist CI (reward +- t_0.975*se_hat): [0.041374, 0.216893]
[I 2025-11-11 10:31:16,954] Trial 13 finished with value: 0.11051809170620389 and parameters: {'lr': 0.004078504342863419, 'num_epochs': 8, 'batch_size': 128, 'lr_decay': 0.9138963577294428}. Best is trial 13 with value: 0.11051809170620389.

[Optuna Trial 14]


Best trial: 13. Best value: 0.110518:  75%|███████▌  | 15/20 [00:43<00:14,  2.83s/it]

actual reward: [0.08810752]
Validation weights_info: {'gini': np.float64(0.9392085728310939), 'ess': np.float64(407.6713714453453), 'max_wi': np.float64(154.3850611397843), 'min_wi': np.float64(2.4893646316873874e-05)}
Estimated reward: 0.097621
Cross-validated error: 0.003345
Final score CI (reward +- 2*error): [0.090931, 0.104310]
Standard error: 0.018140
Final t_dist CI (reward +- t_0.975*se_hat): [0.062064, 0.133178]
[I 2025-11-11 10:31:20,491] Trial 14 finished with value: 0.09093103988123494 and parameters: {'lr': 0.0038744311884660932, 'num_epochs': 8, 'batch_size': 128, 'lr_decay': 0.8221651618946493}. Best is trial 13 with value: 0.11051809170620389.

[Optuna Trial 15]


Best trial: 13. Best value: 0.110518:  80%|████████  | 16/20 [00:47<00:12,  3.10s/it]

actual reward: [0.08643751]
Validation weights_info: {'gini': np.float64(0.17160613419083873), 'ess': np.float64(9004.336510308054), 'max_wi': np.float64(2.4552663744466794), 'min_wi': np.float64(0.4274627724330676)}
Estimated reward: 0.077384
Cross-validated error: 0.000574
Final score CI (reward +- 2*error): [0.076236, 0.078531]
Standard error: 0.002820
Final t_dist CI (reward +- t_0.975*se_hat): [0.071856, 0.082911]
[I 2025-11-11 10:31:24,207] Trial 15 finished with value: 0.07623560673399475 and parameters: {'lr': 0.00017184786036352672, 'num_epochs': 9, 'batch_size': 128, 'lr_decay': 0.9513280268652021}. Best is trial 13 with value: 0.11051809170620389.

[Optuna Trial 16]


Best trial: 13. Best value: 0.110518:  85%|████████▌ | 17/20 [00:50<00:09,  3.18s/it]

actual reward: [0.0874785]
Validation weights_info: {'gini': np.float64(0.8611873603329923), 'ess': np.float64(931.097335900953), 'max_wi': np.float64(45.45472970044795), 'min_wi': np.float64(0.00033906556107712166)}
Estimated reward: 0.083077
Cross-validated error: 0.001773
Final score CI (reward +- 2*error): [0.079532, 0.086622]
Standard error: 0.009180
Final t_dist CI (reward +- t_0.975*se_hat): [0.065082, 0.101072]
[I 2025-11-11 10:31:27,572] Trial 16 finished with value: 0.07953215266535746 and parameters: {'lr': 0.0013205626434491388, 'num_epochs': 8, 'batch_size': 128, 'lr_decay': 0.9938869571656733}. Best is trial 13 with value: 0.11051809170620389.

[Optuna Trial 17]


Best trial: 13. Best value: 0.110518:  90%|█████████ | 18/20 [00:53<00:06,  3.09s/it]

actual reward: [0.08915388]
Validation weights_info: {'gini': np.float64(0.9858648990309767), 'ess': np.float64(30.214953421468486), 'max_wi': np.float64(1095.7046413355317), 'min_wi': np.float64(6.592694354002156e-08)}
[I 2025-11-11 10:31:30,458] Trial 17 finished with value: -inf and parameters: {'lr': 0.00586265847954879, 'num_epochs': 7, 'batch_size': 128, 'lr_decay': 0.9003123013999557}. Best is trial 13 with value: 0.11051809170620389.

[Optuna Trial 18]


Best trial: 13. Best value: 0.110518:  95%|█████████▌| 19/20 [00:57<00:03,  3.31s/it]

actual reward: [0.08666601]
Validation weights_info: {'gini': np.float64(0.33252873702086577), 'ess': np.float64(6673.739846113044), 'max_wi': np.float64(5.5653106228421185), 'min_wi': np.float64(0.15083686539793106)}
Estimated reward: 0.077790
Cross-validated error: 0.000659
Final score CI (reward +- 2*error): [0.076472, 0.079107]
Standard error: 0.003295
Final t_dist CI (reward +- t_0.975*se_hat): [0.071330, 0.084249]
[I 2025-11-11 10:31:34,273] Trial 18 finished with value: 0.07647205824449332 and parameters: {'lr': 0.0004813900519260887, 'num_epochs': 9, 'batch_size': 128, 'lr_decay': 0.8484119835905123}. Best is trial 13 with value: 0.11051809170620389.

[Optuna Trial 19]


Best trial: 13. Best value: 0.110518: 100%|██████████| 20/20 [01:00<00:00,  3.04s/it]

actual reward: [0.08839967]
Validation weights_info: {'gini': np.float64(0.949712818480312), 'ess': np.float64(251.6621075141464), 'max_wi': np.float64(206.88639835856182), 'min_wi': np.float64(1.1869990381547652e-05)}
Estimated reward: 0.082390
Cross-validated error: 0.003146
Final score CI (reward +- 2*error): [0.076098, 0.088682]
Standard error: 0.016117
Final t_dist CI (reward +- t_0.975*se_hat): [0.050797, 0.113984]
[I 2025-11-11 10:31:38,021] Trial 19 finished with value: 0.07609839356887955 and parameters: {'lr': 0.002977743140855752, 'num_epochs': 9, 'batch_size': 128, 'lr_decay': 0.9014678168811737}. Best is trial 13 with value: 0.11051809170620389.





Num samples is 10000
{'gini': np.float64(0.9641694935396395), 'ess': np.float64(226.62809556869334), 'max_wi': np.float64(144.47212655943005), 'min_wi': np.float64(2.122378996258739e-06)}
Eval time: 0.46201038360595703 seconds
Evaluation total results time: 0.8237674236297607 seconds


Unnamed: 0,policy_rewards,ipw,reg_dm,conv_dm,conv_dr,conv_sndr,action_diff_to_real,action_delta,context_diff_to_real,context_delta
0,0.08622184,0.0858,0.08617562,0.08617562,0.08613966,0.08613966,0.92210476,0.0,0.83772226,0.0
15000,0.08811177,0.10101644,0.09033655,0.09033655,0.10202656,0.10177311,1.02910662,0.30136516,0.92250311,0.20792459


In [13]:
# Configuration for the synthetic dataset used by the next training run.
dataset_params = {
    "n_actions": 500,
    "n_users": 500,
    "emb_dim": 16,
    # "sigma": 0.1,  # left disabled, as in the original configuration
    "eps": 0.6,  # epsilon for the noise in the ground truth policy representation
    "ctr": 0.1,
}

# Build the dataset with a fixed seed so this experiment can be regenerated.
train_dataset = generate_dataset(dataset_params, seed=50000)

Random Item CTR: 0.0705882181025533
Optimal greedy CTR: 0.09999934164533562
Second Best greedy CTR: 0.09924496289352924
Optimal Stochastic CTR: 0.09995498601895662
second Best Stochastic CTR: 0.08629000824986369
Our Initial CTR: 0.08647501952799874


In [14]:
# Run the optimization on the current `train_dataset`.
# The positional arguments and the Optuna settings are notebook-level
# variables that are expected to be defined in earlier cells.
df9, best_hyperparams_by_size = regression_trainer_trial(
    num_runs,
    num_neighbors,
    num_rounds_list,
    train_dataset,
    batch_size,
    val_size=10000,
    n_trials=n_trials_for_optuna,
    prev_best_params=best_params_to_use,
)

# Show the performance metrics (kept as the last expression so the
# notebook renders the resulting table).
df9[
    [
        'policy_rewards',
        'ipw',
        'reg_dm',
        'conv_dm',
        'conv_dr',
        'conv_sndr',
        'action_diff_to_real',
        'action_delta',
        'context_diff_to_real',
        'context_delta',
    ]
]

Simulation time for 10000 samples: 0.0310211181640625 seconds


Baseline regression model fit time: 0.11s
Num samples is 10000
{'gini': np.float64(0.4694748457652906), 'ess': np.float64(4553.195321326933), 'max_wi': np.float64(18.43949488698513), 'min_wi': np.float64(0.016656720105420606)}
Eval time: 0.4834873676300049 seconds


[I 2025-11-11 10:31:42,471] A new study created in memory with name: no-name-eec545fb-27a0-4b41-b64b-59f73db903bf


Evaluation total results time: 0.8646128177642822 seconds

=== Training size 15000, run 0 ===
Simulation time for 25000 samples: 0.06872344017028809 seconds


  0%|          | 0/20 [00:00<?, ?it/s]


[Optuna Trial 0]


Best trial: 0. Best value: 0.1139:   5%|▌         | 1/20 [00:03<01:05,  3.44s/it]

actual reward: [0.08320693]
Validation weights_info: {'gini': np.float64(0.984233963195753), 'ess': np.float64(170.92328995797394), 'max_wi': np.float64(173.88425295146473), 'min_wi': np.float64(0.0)}
Estimated reward: 0.122634
Cross-validated error: 0.004367
Final score CI (reward +- 2*error): [0.113900, 0.131368]
Standard error: 0.022614
Final t_dist CI (reward +- t_0.975*se_hat): [0.078305, 0.166963]
[I 2025-11-11 10:31:45,906] Trial 0 finished with value: 0.11389986729064969 and parameters: {'lr': 0.096, 'num_epochs': 5, 'batch_size': 64, 'lr_decay': 0.85}. Best is trial 0 with value: 0.11389986729064969.

[Optuna Trial 1]


Best trial: 0. Best value: 0.1139:  10%|█         | 2/20 [00:07<01:04,  3.61s/it]

actual reward: [0.09159597]
Validation weights_info: {'gini': np.float64(0.9018287728908086), 'ess': np.float64(626.1247175943029), 'max_wi': np.float64(76.9153175259472), 'min_wi': np.float64(3.629934094828682e-05)}
Estimated reward: 0.088278
Cross-validated error: 0.002010
Final score CI (reward +- 2*error): [0.084258, 0.092298]
Standard error: 0.010408
Final t_dist CI (reward +- t_0.975*se_hat): [0.067876, 0.108681]
[I 2025-11-11 10:31:49,636] Trial 1 finished with value: 0.08425840455398348 and parameters: {'lr': 0.0030129847004406463, 'num_epochs': 9, 'batch_size': 128, 'lr_decay': 0.8194888265543605}. Best is trial 0 with value: 0.11389986729064969.

[Optuna Trial 2]


Best trial: 0. Best value: 0.1139:  15%|█▌        | 3/20 [00:12<01:14,  4.35s/it]

actual reward: [0.09113609]
Validation weights_info: {'gini': np.float64(0.9862101904193598), 'ess': np.float64(8.21987367692491), 'max_wi': np.float64(4600.360157946019), 'min_wi': np.float64(2.926074135337841e-09)}
[I 2025-11-11 10:31:54,873] Trial 2 finished with value: -inf and parameters: {'lr': 0.004738191216020653, 'num_epochs': 8, 'batch_size': 64, 'lr_decay': 0.8669830803030545}. Best is trial 0 with value: 0.11389986729064969.

[Optuna Trial 3]


Best trial: 0. Best value: 0.1139:  20%|██        | 4/20 [00:15<00:59,  3.72s/it]

actual reward: [0.08698921]
Validation weights_info: {'gini': np.float64(0.2943797495581262), 'ess': np.float64(7283.7579321648045), 'max_wi': np.float64(4.776901796463893), 'min_wi': np.float64(0.28358471996029083)}
Estimated reward: 0.076894
Cross-validated error: 0.000617
Final score CI (reward +- 2*error): [0.075661, 0.078127]
Standard error: 0.003114
Final t_dist CI (reward +- t_0.975*se_hat): [0.070789, 0.082999]
[I 2025-11-11 10:31:57,613] Trial 3 finished with value: 0.07566079914600943 and parameters: {'lr': 0.00046856914413396047, 'num_epochs': 6, 'batch_size': 128, 'lr_decay': 0.8871104288390106}. Best is trial 0 with value: 0.11389986729064969.

[Optuna Trial 4]


Best trial: 0. Best value: 0.1139:  25%|██▌       | 5/20 [00:17<00:51,  3.40s/it]

actual reward: [0.08788234]
Validation weights_info: {'gini': np.float64(0.9956687057529318), 'ess': np.float64(12.806688234344414), 'max_wi': np.float64(1287.2455870093484), 'min_wi': np.float64(1.0448487282697969e-10)}
[I 2025-11-11 10:32:00,463] Trial 4 finished with value: -inf and parameters: {'lr': 0.013635626369095063, 'num_epochs': 6, 'batch_size': 128, 'lr_decay': 0.8042135747173473}. Best is trial 0 with value: 0.11389986729064969.

[Optuna Trial 5]


Best trial: 0. Best value: 0.1139:  30%|███       | 6/20 [00:21<00:49,  3.51s/it]

actual reward: [0.09045669]
Validation weights_info: {'gini': np.float64(0.9886596233307163), 'ess': np.float64(52.22188559794867), 'max_wi': np.float64(434.61085269146866), 'min_wi': np.float64(1.2626117960696396e-10)}
[I 2025-11-11 10:32:04,167] Trial 5 finished with value: -inf and parameters: {'lr': 0.010336544135441638, 'num_epochs': 5, 'batch_size': 64, 'lr_decay': 0.8461828328063161}. Best is trial 0 with value: 0.11389986729064969.

[Optuna Trial 6]


Best trial: 0. Best value: 0.1139:  35%|███▌      | 7/20 [00:23<00:37,  2.86s/it]

actual reward: [0.08665372]
Validation weights_info: {'gini': np.float64(0.12176080308508591), 'ess': np.float64(9483.912029126102), 'max_wi': np.float64(1.9342124365644675), 'min_wi': np.float64(0.6322334470430885)}
Estimated reward: 0.076722
Cross-validated error: 0.000597
Final score CI (reward +- 2*error): [0.075529, 0.077915]
Standard error: 0.002727
Final t_dist CI (reward +- t_0.975*se_hat): [0.071377, 0.082067]
[I 2025-11-11 10:32:05,694] Trial 6 finished with value: 0.07552879453348715 and parameters: {'lr': 0.0008822630636108165, 'num_epochs': 3, 'batch_size': 512, 'lr_decay': 0.817972360468425}. Best is trial 0 with value: 0.11389986729064969.

[Optuna Trial 7]


Best trial: 0. Best value: 0.1139:  40%|████      | 8/20 [00:25<00:31,  2.59s/it]

actual reward: [0.08204418]
Validation weights_info: {'gini': np.float64(0.9920833996310024), 'ess': np.float64(24.086388168737265), 'max_wi': np.float64(1015.8935024710037), 'min_wi': np.float64(2.602961667824698e-22)}
[I 2025-11-11 10:32:07,703] Trial 7 finished with value: -inf and parameters: {'lr': 0.04572312532505407, 'num_epochs': 2, 'batch_size': 64, 'lr_decay': 0.8321498905472368}. Best is trial 0 with value: 0.11389986729064969.

[Optuna Trial 8]


Best trial: 0. Best value: 0.1139:  45%|████▌     | 9/20 [00:28<00:31,  2.90s/it]

actual reward: [0.08874497]
Validation weights_info: {'gini': np.float64(0.9957310191358911), 'ess': np.float64(8.555619292213713), 'max_wi': np.float64(2376.5322405308216), 'min_wi': np.float64(2.1340082183082628e-17)}
[I 2025-11-11 10:32:11,283] Trial 8 finished with value: -inf and parameters: {'lr': 0.03584369105581143, 'num_epochs': 9, 'batch_size': 128, 'lr_decay': 0.8909102033373472}. Best is trial 0 with value: 0.11389986729064969.

[Optuna Trial 9]


Best trial: 0. Best value: 0.1139:  50%|█████     | 10/20 [00:35<00:40,  4.00s/it]

actual reward: [0.08717362]
Validation weights_info: {'gini': np.float64(0.36607200200150936), 'ess': np.float64(6179.355823897665), 'max_wi': np.float64(6.4512030090444386), 'min_wi': np.float64(0.19054608133980205)}
Estimated reward: 0.076945
Cross-validated error: 0.000671
Final score CI (reward +- 2*error): [0.075602, 0.078287]
Standard error: 0.003372
Final t_dist CI (reward +- t_0.975*se_hat): [0.070334, 0.083555]
[I 2025-11-11 10:32:17,753] Trial 9 finished with value: 0.07560174601891226 and parameters: {'lr': 0.00031818610548362106, 'num_epochs': 10, 'batch_size': 64, 'lr_decay': 0.8739269298679643}. Best is trial 0 with value: 0.11389986729064969.

[Optuna Trial 10]


Best trial: 0. Best value: 0.1139:  55%|█████▌    | 11/20 [00:37<00:30,  3.37s/it]

actual reward: [0.08295161]
Validation weights_info: {'gini': np.float64(0.9992883783483862), 'ess': np.float64(5.459541884335338), 'max_wi': np.float64(4298.296195448362), 'min_wi': np.float64(6.294432255738569e-23)}
[I 2025-11-11 10:32:19,692] Trial 10 finished with value: -inf and parameters: {'lr': 0.08997748945971007, 'num_epochs': 4, 'batch_size': 256, 'lr_decay': 0.9492954365830494}. Best is trial 0 with value: 0.11389986729064969.

[Optuna Trial 11]


Best trial: 0. Best value: 0.1139:  60%|██████    | 12/20 [00:39<00:25,  3.13s/it]

actual reward: [0.08787376]
Validation weights_info: {'gini': np.float64(0.5607643847159905), 'ess': np.float64(3206.618278670093), 'max_wi': np.float64(16.85189416834788), 'min_wi': np.float64(0.043667227165566015)}
Estimated reward: 0.077666
Cross-validated error: 0.000878
Final score CI (reward +- 2*error): [0.075911, 0.079422]
Standard error: 0.004557
Final t_dist CI (reward +- t_0.975*se_hat): [0.068734, 0.086599]
[I 2025-11-11 10:32:22,291] Trial 11 finished with value: 0.07591077666933994 and parameters: {'lr': 0.0011497427425447098, 'num_epochs': 7, 'batch_size': 256, 'lr_decay': 0.9389050857174588}. Best is trial 0 with value: 0.11389986729064969.

[Optuna Trial 12]


Best trial: 0. Best value: 0.1139:  65%|██████▌   | 13/20 [00:41<00:17,  2.56s/it]

actual reward: [0.08669478]
Validation weights_info: {'gini': np.float64(0.14519376773069972), 'ess': np.float64(9278.59965628441), 'max_wi': np.float64(2.1754631887797817), 'min_wi': np.float64(0.5595633249149923)}
Estimated reward: 0.076745
Cross-validated error: 0.000527
Final score CI (reward +- 2*error): [0.075691, 0.077798]
Standard error: 0.002756
Final t_dist CI (reward +- t_0.975*se_hat): [0.071342, 0.082147]
[I 2025-11-11 10:32:23,514] Trial 12 finished with value: 0.07569123380277794 and parameters: {'lr': 0.0025845363639249437, 'num_epochs': 1, 'batch_size': 512, 'lr_decay': 0.9865494971813749}. Best is trial 0 with value: 0.11389986729064969.

[Optuna Trial 13]


Best trial: 0. Best value: 0.1139:  70%|███████   | 14/20 [00:44<00:17,  2.86s/it]

actual reward: [0.09189348]
Validation weights_info: {'gini': np.float64(0.9321147142364058), 'ess': np.float64(337.1635422062923), 'max_wi': np.float64(187.08908419805854), 'min_wi': np.float64(7.710135247152732e-06)}
Estimated reward: 0.088665
Cross-validated error: 0.002500
Final score CI (reward +- 2*error): [0.083665, 0.093664]
Standard error: 0.012972
Final t_dist CI (reward +- t_0.975*se_hat): [0.063237, 0.114093]
[I 2025-11-11 10:32:27,076] Trial 13 finished with value: 0.08366518939953344 and parameters: {'lr': 0.0037331320559683737, 'num_epochs': 8, 'batch_size': 128, 'lr_decay': 0.8413786344295155}. Best is trial 0 with value: 0.11389986729064969.

[Optuna Trial 14]


Best trial: 0. Best value: 0.1139:  75%|███████▌  | 15/20 [00:47<00:14,  3.00s/it]

actual reward: [0.08658954]
Validation weights_info: {'gini': np.float64(0.08388686678805675), 'ess': np.float64(9756.93613267183), 'max_wi': np.float64(1.5817676543006398), 'min_wi': np.float64(0.7060975352298039)}
Estimated reward: 0.076847
Cross-validated error: 0.000554
Final score CI (reward +- 2*error): [0.075739, 0.077954]
Standard error: 0.002693
Final t_dist CI (reward +- t_0.975*se_hat): [0.071569, 0.082125]
[I 2025-11-11 10:32:30,389] Trial 14 finished with value: 0.07573920049251744 and parameters: {'lr': 0.0001198403988529806, 'num_epochs': 4, 'batch_size': 64, 'lr_decay': 0.9226878082677995}. Best is trial 0 with value: 0.11389986729064969.

[Optuna Trial 15]


Best trial: 0. Best value: 0.1139:  80%|████████  | 16/20 [00:52<00:13,  3.34s/it]

actual reward: [0.09080374]
Validation weights_info: {'gini': np.float64(0.9930016529051676), 'ess': np.float64(22.36244021367819), 'max_wi': np.float64(1186.1443891258298), 'min_wi': np.float64(4.1611235822333075e-12)}
[I 2025-11-11 10:32:34,542] Trial 15 finished with value: -inf and parameters: {'lr': 0.012712536791722488, 'num_epochs': 10, 'batch_size': 128, 'lr_decay': 0.8594380182225028}. Best is trial 0 with value: 0.11389986729064969.

[Optuna Trial 16]


Best trial: 0. Best value: 0.1139:  85%|████████▌ | 17/20 [00:54<00:08,  2.99s/it]

actual reward: [0.0875561]
Validation weights_info: {'gini': np.float64(0.4744817157357833), 'ess': np.float64(4462.247995748727), 'max_wi': np.float64(11.889720335496246), 'min_wi': np.float64(0.10000829512755732)}
Estimated reward: 0.077263
Cross-validated error: 0.000785
Final score CI (reward +- 2*error): [0.075692, 0.078833]
Standard error: 0.003920
Final t_dist CI (reward +- t_0.975*se_hat): [0.069578, 0.084947]
[I 2025-11-11 10:32:36,701] Trial 16 finished with value: 0.07569215584243662 and parameters: {'lr': 0.002205701049967606, 'num_epochs': 7, 'batch_size': 512, 'lr_decay': 0.8016306836630378}. Best is trial 0 with value: 0.11389986729064969.

[Optuna Trial 17]


Best trial: 0. Best value: 0.1139:  90%|█████████ | 18/20 [00:56<00:05,  2.65s/it]

actual reward: [0.09161103]
Validation weights_info: {'gini': np.float64(0.9348961268883613), 'ess': np.float64(417.9607754306153), 'max_wi': np.float64(98.6259823088999), 'min_wi': np.float64(8.114028150573688e-06)}
Estimated reward: 0.097915
Cross-validated error: 0.002759
Final score CI (reward +- 2*error): [0.092397, 0.103432]
Standard error: 0.013633
Final t_dist CI (reward +- t_0.975*se_hat): [0.071191, 0.124638]
[I 2025-11-11 10:32:38,549] Trial 17 finished with value: 0.09239717341944872 and parameters: {'lr': 0.007679041348479698, 'num_epochs': 5, 'batch_size': 256, 'lr_decay': 0.8260756203896225}. Best is trial 0 with value: 0.11389986729064969.

[Optuna Trial 18]


Best trial: 0. Best value: 0.1139:  95%|█████████▌| 19/20 [00:57<00:02,  2.41s/it]

actual reward: [0.08898348]
Validation weights_info: {'gini': np.float64(0.996314722539082), 'ess': np.float64(10.648777103211053), 'max_wi': np.float64(1787.7501499561372), 'min_wi': np.float64(3.3759918939095824e-13)}
[I 2025-11-11 10:32:40,414] Trial 18 finished with value: -inf and parameters: {'lr': 0.032936655330993256, 'num_epochs': 5, 'batch_size': 256, 'lr_decay': 0.9156163633508768}. Best is trial 0 with value: 0.11389986729064969.

[Optuna Trial 19]


Best trial: 0. Best value: 0.1139: 100%|██████████| 20/20 [00:59<00:00,  2.99s/it]

actual reward: [0.08486818]
Validation weights_info: {'gini': np.float64(0.9982128836412166), 'ess': np.float64(2.5692013327200156), 'max_wi': np.float64(4688.311039546935), 'min_wi': np.float64(6.586575415780199e-22)}
[I 2025-11-11 10:32:42,256] Trial 19 finished with value: -inf and parameters: {'lr': 0.07437648796204573, 'num_epochs': 4, 'batch_size': 256, 'lr_decay': 0.8535204741975259}. Best is trial 0 with value: 0.11389986729064969.





Num samples is 10000
{'gini': np.float64(0.9960238793664443), 'ess': np.float64(23.54367070562055), 'max_wi': np.float64(1013.0067530253302), 'min_wi': np.float64(1.9573657504358787e-24)}
Eval time: 0.4697418212890625 seconds
Evaluation total results time: 0.8318989276885986 seconds


Unnamed: 0,policy_rewards,ipw,reg_dm,conv_dm,conv_dr,conv_sndr,action_diff_to_real,action_delta,context_diff_to_real,context_delta
0,0.08647502,0.0879,0.08790104,0.08790104,0.08791853,0.08791853,0.80232812,0.0,0.84032376,0.0
15000,0.08506501,0.03644018,0.08638608,0.08638608,0.04285527,0.03456955,1.105614,0.71816543,1.16561829,0.67443586


In [15]:
# Display only the reward / estimator / distance columns of df4.
df4.loc[
    :,
    [
        'policy_rewards',
        'ipw',
        'reg_dm',
        'conv_dm',
        'conv_dr',
        'conv_sndr',
        'action_diff_to_real',
        'action_delta',
        'context_diff_to_real',
        'context_delta',
    ],
]

Unnamed: 0,policy_rewards,ipw,reg_dm,conv_dm,conv_dr,conv_sndr,action_diff_to_real,action_delta,context_diff_to_real,context_delta
0,0.08610747,0.0863,0.08627646,0.08627646,0.08629749,0.08629749,0.7569287,0.0,0.87627132,0.0
15000,0.08934609,0.10289728,0.08830008,0.08830008,0.10207247,0.10220177,0.83985308,0.23912189,0.92899388,0.15493227


### Policy Via argmax(r_hat - error_hat) through cross validation

In [16]:
# Display only the reward / estimator / distance columns of df4.
# NOTE(review): the table rendered by this cell is identical to the one
# rendered by the preceding df4 cell -- confirm df4 was recomputed for this
# section before the cell was executed.
df4.loc[
    :,
    [
        'policy_rewards',
        'ipw',
        'reg_dm',
        'conv_dm',
        'conv_dr',
        'conv_sndr',
        'action_diff_to_real',
        'action_delta',
        'context_diff_to_real',
        'context_delta',
    ],
]

Unnamed: 0,policy_rewards,ipw,reg_dm,conv_dm,conv_dr,conv_sndr,action_diff_to_real,action_delta,context_diff_to_real,context_delta
0,0.08610747,0.0863,0.08627646,0.08627646,0.08629749,0.08629749,0.7569287,0.0,0.87627132,0.0
15000,0.08934609,0.10289728,0.08830008,0.08830008,0.10207247,0.10220177,0.83985308,0.23912189,0.92899388,0.15493227


### Policy Via actual policy value

In [17]:
# Performance metrics: display only the reward / estimator / distance
# columns of df4.
# NOTE(review): the table rendered by this cell is identical to the ones
# rendered by the earlier df4 cells -- confirm df4 was recomputed for this
# section before the cell was executed.
df4.loc[
    :,
    [
        'policy_rewards',
        'ipw',
        'reg_dm',
        'conv_dm',
        'conv_dr',
        'conv_sndr',
        'action_diff_to_real',
        'action_delta',
        'context_diff_to_real',
        'context_delta',
    ],
]


Unnamed: 0,policy_rewards,ipw,reg_dm,conv_dm,conv_dr,conv_sndr,action_diff_to_real,action_delta,context_diff_to_real,context_delta
0,0.08610747,0.0863,0.08627646,0.08627646,0.08629749,0.08629749,0.7569287,0.0,0.87627132,0.0
15000,0.08934609,0.10289728,0.08830008,0.08830008,0.10207247,0.10220177,0.83985308,0.23912189,0.92899388,0.15493227
