In [1]:
import warnings
warnings.filterwarnings("ignore")
from copy import deepcopy
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
import sys

sys.path.append("/code")

from tqdm import tqdm
import torch
# device = torch.device('cpu')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
# import gym
# import recogym

import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim

torch.backends.cudnn.benchmark = torch.cuda.is_available()
if torch.cuda.is_available():
    torch.set_float32_matmul_precision("high")  # TF32 = big speedup on Ada


from sklearn.utils import check_random_state

# implementing OPE of the IPWLearner using synthetic bandit data
from sklearn.linear_model import LogisticRegression

import matplotlib.pyplot as plt

from scipy.special import softmax
import optuna
# from memory_profiler import profile


from estimators import (
    DirectMethod as DM
)

from simulation_utils import (
    eval_policy,
    generate_dataset,
    create_simulation_data_from_pi,
    get_train_data,
    get_opl_results_dict,
    CustomCFDataset,
    calc_reward,
    get_weights_info
)

from models import (    
    LinearCFModel,
    NeighborhoodModel,
    BPRModel, 
    RegressionModel
)

from training_utils import (
    train,
    validation_loop, 
    cv_score_model
 )

from custom_losses import (
    SNDRPolicyLoss
    )

random_state=12345
random_ = check_random_state(random_state)

pd.options.display.float_format = '{:,.8f}'.format

Using device: cpu
Using device: cpu
Using device: cpu


In [2]:
def get_trial_results(
    our_x, 
    our_a, 
    emb_x, 
    emb_a, 
    original_x, 
    original_a, 
    dataset, 
    val_data, 
    original_policy_prob, 
    neighberhoodmodel, 
    regression_model, 
    dm
):
    policy = np.expand_dims(softmax(our_x @ our_a.T, axis=1), -1)
    policy_reward = calc_reward(dataset, policy)
    eval_metrics = eval_policy(neighberhoodmodel, val_data, original_policy_prob, policy)
    action_diff_to_real = np.sqrt(np.mean((emb_a - our_a) ** 2))
    action_delta = np.sqrt(np.mean((original_a - our_a) ** 2))
    context_diff_to_real = np.sqrt(np.mean((emb_x - our_x) ** 2))
    context_delta = np.sqrt(np.mean((original_x - our_x) ** 2))

    row = np.concatenate([
        np.atleast_1d(policy_reward),
        np.atleast_1d(eval_metrics),
        np.atleast_1d(action_diff_to_real),
        np.atleast_1d(action_delta),
        np.atleast_1d(context_diff_to_real),
        np.atleast_1d(context_delta)
    ])
    reg_dm = dm.estimate_policy_value(policy[val_data['x_idx']], regression_model.predict(val_data['x']))
    reg_results = np.array([reg_dm])
    conv_results = np.array([row])
    return get_opl_results_dict(reg_results, conv_results)

## `trainer_trial` Function

This function runs policy learning experiments using offline bandit data and evaluates various estimators.

### Parameters
- **num_runs** (int): Number of experimental runs per training size
- **num_neighbors** (int): Number of neighbors to consider in the neighborhood model
- **num_rounds_list** (list): List of training set sizes to evaluate
- **dataset** (dict): Contains dataset information including embeddings, action probabilities, and reward probabilities
- **batch_size** (int): Batch size for training the policy model
- **num_epochs** (int): Number of training epochs for each experiment
- **lr** (float, default=0.001): Learning rate for the optimizer

### Process Flow
1. Initializes result structures and retrieval models
2. For each training size in `num_rounds_list`:
   - Creates a uniform logging policy and simulates data
   - Generates training data for offline learning
   - Fits regression and neighborhood models for reward estimation
   - Initializes and trains a counterfactual policy model
   - Evaluates policy performance using various estimators
   - Collects metrics on policy reward and embedding quality

### Returns
- **DataFrame**: Results table with rows indexed by training size and columns for various metrics:
  - `policy_rewards`: True expected reward of the learned policy
  - Various estimator errors (`ipw`, `reg_dm`, `conv_dm`, `conv_dr`, `conv_sndr`)
  - Variance metrics for each estimator
  - Embedding quality metrics comparing learned representations to ground truth

### Implementation Notes
- Uses uniform random logging policy for collecting offline data
- Employs Self-Normalized Doubly Robust (SNDR) policy learning
- Measures embedding quality via RMSE to original/ground truth embeddings

In [3]:
def trainer_trial(
    num_runs,
    num_neighbors,
    train_sizes,
    dataset,
    batch_size,
    val_size=2000,
    n_trials=10,    
    prev_best_params=None
):

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    torch.backends.cudnn.benchmark = torch.cuda.is_available()
    if torch.cuda.is_available():
        torch.set_float32_matmul_precision("high")

    dm = DM()
    results = {}

    our_x, our_a = dataset["our_x"], dataset["our_a"]
    emb_x, emb_a = dataset["emb_x"], dataset["emb_a"]

    original_x, original_a = dataset["original_x"], dataset["original_a"]
    n_users, n_actions, emb_dim = dataset["n_users"], dataset["n_actions"], dataset["emb_dim"]

    all_user_indices = np.arange(n_users, dtype=np.int64)

    def T(x):
        return torch.as_tensor(x, device=device, dtype=torch.float32)

    def _mean_dict(dicts):
        """
        Robust mean over a list of dicts with numeric/scalar/1D-array values.
        Returns a single dict with elementwise means.
        """
        if not dicts:
            return {}
        keys = dicts[0].keys()
        out = {}
        for k in keys:
            vals = [d[k] for d in dicts if k in d]
            # try to convert each to np.array and average
            arrs = [np.asarray(v) for v in vals]
            # broadcast to same shape if scalars/1D
            stacked = np.stack(arrs, axis=0)
            out[k] = np.mean(stacked, axis=0)
        return out

    # ===== unpack dataset (keep originals safe) =====
    our_x_orig, our_a_orig = our_x, our_a
    emb_x, emb_a = emb_x, emb_a
    original_x, original_a = original_x, original_a
    n_users, n_actions, emb_dim = n_users, n_actions, emb_dim
    all_user_indices = np.arange(n_users, dtype=np.int64)

    dm = DM()
    results = {}
    best_hyperparams_by_size = {}
    last_best_params = prev_best_params if prev_best_params is not None else None

    # ===== baseline (sample size = 0) using get_trial_results =====
    pi_0 = softmax(our_x_orig @ our_a_orig.T, axis=1)
    original_policy_prob = np.expand_dims(pi_0, -1)

    simulation_data = create_simulation_data_from_pi(
        dataset, pi_0, val_size, random_state=0
    )

    # use same data for train/val just to generate the baseline row
    train_data = get_train_data(n_actions, val_size, simulation_data, np.arange(val_size), our_x_orig)
    val_data   = get_train_data(n_actions, val_size, simulation_data, np.arange(val_size), our_x_orig)

    regression_model = RegressionModel(
        n_actions=n_actions, action_context=our_x_orig,
        base_model=LogisticRegression(random_state=12345)
    )

    regression_model.fit(train_data['x'], train_data['a'], train_data['r'])

    neighberhoodmodel = NeighborhoodModel(
        train_data['x_idx'], train_data['a'],
        our_a_orig, our_x_orig, train_data['r'],
        num_neighbors=num_neighbors
    )

    # baseline row produced via get_trial_results
    results[0] = get_trial_results(
        our_x_orig, our_a_orig, emb_x, emb_a, original_x, original_a,
        dataset, val_data, original_policy_prob,
        neighberhoodmodel, regression_model, dm
    )

    # ===== main loop over training sizes =====
    for train_size in train_sizes:

        # we’ll collect per-run trial dicts generated by get_trial_results
        trial_dicts_this_size = []
        best_hyperparams_by_size[train_size] = {}

        # --- prepare a resampling for Optuna’s objective (shared loaders built per-run inside objective) ---
        # We’ll do Optuna per-run (fresh resample + search), then final fit with best params, then get_trial_results.

        for run in range(num_runs):

            # --- resample for this run ---
            pi_0 = softmax(our_x_orig @ our_a_orig.T, axis=1)
            original_policy_prob = np.expand_dims(pi_0, -1)

            simulation_data = create_simulation_data_from_pi(
                dataset, pi_0, train_size + val_size,
                random_state=(run + 1) * (train_size + 17)
            )

            idx_train = np.arange(train_size)
            train_data = get_train_data(n_actions, train_size, simulation_data, idx_train, our_x_orig)
            val_idx   = np.arange(val_size) + train_size
            val_data  = get_train_data(n_actions, val_size, simulation_data, val_idx, our_x_orig)

            num_workers = 4 if torch.cuda.is_available() else 0

            cf_dataset = CustomCFDataset(
                train_data['x_idx'], train_data['a'], train_data['r'], original_policy_prob
            )

            val_dataset = CustomCFDataset(
                val_data['x_idx'], val_data['a'], val_data['r'], original_policy_prob
            )

            # val_loader = DataLoader(
            #     val_dataset, batch_size=val_size, shuffle=False,
            #     pin_memory=torch.cuda.is_available(),
            #     num_workers=num_workers, persistent_workers=bool(num_workers)
            # )


            # --- Optuna objective bound to this run's data ---
            def objective(trial):
                lr = trial.suggest_float("lr", 1e-4, 1e-2, log=True)
                epochs = trial.suggest_int("num_epochs", 1, 10)
                trial_batch_size = trial.suggest_categorical("batch_size", [64, 128, 256, 512])
                trial_num_neighbors = trial.suggest_int("num_neighbors", 3, 15)
                lr_decay = trial.suggest_float("lr_decay", 0.8, 1.0)

                trial_neigh_model = NeighborhoodModel(
                    train_data['x_idx'], train_data['a'],
                    our_a_orig, our_x_orig, train_data['r'],
                    num_neighbors=trial_num_neighbors
                )

                trial_scores_all = torch.as_tensor(
                    trial_neigh_model.predict(all_user_indices),
                    device=device, dtype=torch.float32
                )

                trial_model = LinearCFModel(
                    n_users, n_actions, emb_dim,
                    initial_user_embeddings=T(our_x_orig),
                    initial_actions_embeddings=T(our_a_orig)
                ).to(device)

                assert (not torch.cuda.is_available()) or next(trial_model.parameters()).is_cuda

                final_train_loader = DataLoader(
                    cf_dataset, batch_size=trial_batch_size, shuffle=True,
                    pin_memory=torch.cuda.is_available(),
                    num_workers=num_workers, persistent_workers=bool(num_workers)
                )

                current_lr = lr
                for epoch in range(epochs):
                    if epoch > 0:
                        current_lr *= lr_decay
                        
                    train(
                        trial_model, final_train_loader, trial_scores_all,
                        criterion=SNDRPolicyLoss(), num_epochs=1, lr=current_lr, device=str(device)
                    )

                trial_x, trial_a = trial_model.get_params()
                trial_x = trial_x.detach().cpu().numpy()
                trial_a = trial_a.detach().cpu().numpy()

                pi_i = softmax(trial_x @ trial_a.T, axis=1)

                # print(get_weights_info(pi_i, original_policy_prob))
                # validation reward for selection
                return cv_score_model(val_data, trial_scores_all, pi_i)


            # --- run Optuna for this run ---
            study = optuna.create_study(direction="maximize")
            
            if last_best_params is not None:
                study.enqueue_trial(last_best_params)

            study.optimize(objective, n_trials=n_trials, show_progress_bar=True)

            best_params = study.best_params
            last_best_params = best_params  # optional warm-start to next run
            best_hyperparams_by_size[train_size][run] = {
                "params": best_params,
                "reward": study.best_value
            }


            # --- final training with best params on this run’s data ---
            regression_model = RegressionModel(
                n_actions=n_actions, action_context=our_x_orig,
                base_model=LogisticRegression(random_state=12345)
            )
            regression_model.fit(
                train_data['x'], train_data['a'], train_data['r'],
                original_policy_prob[train_data['x_idx'], train_data['a']].squeeze()
            )

            neighberhoodmodel = NeighborhoodModel(
                train_data['x_idx'], train_data['a'],
                our_a_orig, our_x_orig, train_data['r'],
                num_neighbors=best_params['num_neighbors']
            )
            scores_all = torch.as_tensor(
                neighberhoodmodel.predict(all_user_indices),
                device=device, dtype=torch.float32
            )

            model = LinearCFModel(
                n_users, n_actions, emb_dim,
                initial_user_embeddings=T(our_x_orig),
                initial_actions_embeddings=T(our_a_orig)
            ).to(device)
            assert (not torch.cuda.is_available()) or next(model.parameters()).is_cuda

            train_loader = DataLoader(
                cf_dataset, batch_size=batch_size, shuffle=True,
                pin_memory=torch.cuda.is_available(),
                num_workers=num_workers, persistent_workers=bool(num_workers)
            )

            current_lr = best_params['lr']
            for epoch in range(best_params['num_epochs']):
                if epoch > 0:
                    current_lr *= best_params['lr_decay']
                train(
                    model, train_loader, scores_all,
                    criterion=SNDRPolicyLoss(), num_epochs=1, lr=current_lr, device=str(device)
                )

            # learned embeddings (do NOT overwrite originals)
            learned_x_t, learned_a_t = model.get_params()
            learned_x = learned_x_t.detach().cpu().numpy()
            learned_a = learned_a_t.detach().cpu().numpy()

            # --- produce the per-run result via get_trial_results ---
            trial_res = get_trial_results(
                learned_x, learned_a,          # learned (policy) embeddings
                emb_x, emb_a,                  # ground-truth embedding refs
                original_x, original_a,        # original clean refs
                dataset,
                val_data,                      # use this run's val split
                original_policy_prob,
                neighberhoodmodel,
                regression_model,
                dm
            )

            trial_dicts_this_size.append(trial_res)

            # memory hygiene
            torch.cuda.empty_cache()

        # === aggregate per-run results (mean) and store under this train_size ===
        results[train_size] = _mean_dict(trial_dicts_this_size)

    return pd.DataFrame.from_dict(results, orient='index'), best_hyperparams_by_size

## Learning

We will run several simulations on a generated dataset, the dataset is generated like this:
$$ \text{We have users U and actions A } u_i \sim N(0, I_{emb_dim}) \ a_i \sim N(0, I_{emb_dim})$$
$$ p_{ij} = 1 / (5 + e^{-(u_i.T a_j)}) $$
$$r_{ij} \sim Bin(p_{ij})$$

We have a policy $\pi$
and it's ground truth reward is calculated by
$$R_{gt} = \sum_{i}{\sum_{j}{\pi_{ij} * p_{ij}}} $$

Our parameters for the dataset will be
$$EmbDim = 5$$
$$NumActions= 150$$
$$NumUsers = 150$$
$$NeighborhoodSize = 6$$

to learn a new policy from $\pi$ we will sample from:
$$\pi_{start} = (1-\epsilon)*\pi + \epsilon * \pi_{random}$$

In [4]:
dataset_params = dict(
                    n_actions= 500,
                    n_users = 500,
                    emb_dim = 16,
                    # sigma = 0.1,
                    eps = 0.6, # this is the epsilon for the noise in the ground truth policy representation
                    ctr = 0.1
                    )

train_dataset = generate_dataset(dataset_params)

Random Item CTR: 0.07066414727263938
Optimal greedy CTR: 0.09999926940951757
Optimal Stochastic CTR: 0.09995326955796031
Our Initial CTR: 0.08610747363354625


In [5]:
num_runs = 1
batch_size = 200
num_neighbors = 6
n_trials_for_optuna = 10
num_rounds_list = [500, 1000, 2000, 10000, 20000]
# num_rounds_list = [20000]


# Manually define your best parameters
best_params_to_use = {
    "lr": 0.0095,  # Learning rate
    "num_epochs": 5,  # Number of training epochs
    "batch_size": 64,  # Batch size for training
    "num_neighbors": 8,  # Number of neighbors for neighborhood model
    "lr_decay": 0.85  # Learning rate decay factor
}

### 1

$$emb = 0.7 * gt + 0.3 * noise$$
$$lr = 0.005$$
$$n_{epochs} = 1$$
$$BatchSize=50$$

In [6]:
print("Value of num_rounds_list:", num_rounds_list)

# Run the optimization
df4, best_hyperparams_by_size = trainer_trial(num_runs, num_neighbors, num_rounds_list, train_dataset, batch_size, val_size=10000, n_trials=n_trials_for_optuna, prev_best_params=best_params_to_use)

# Print best hyperparameters for each training size
print("\n=== BEST HYPERPARAMETERS BY TRAINING SIZE ===")
for train_size, params in best_hyperparams_by_size.items():
    print(f"\nTraining Size: {train_size}")
    # print(f"Best Reward: {params['reward']:.6f}")
    print("Parameters:")
    for param_name, value in params['params'].items():
        print(f"  {param_name}: {value}")
print("===========================\n")

# Show the performance metrics
df4[['policy_rewards', 'ipw', 'reg_dm', 'conv_dm', 'conv_dr', 'conv_sndr', 'action_diff_to_real', 'action_delta', 'context_diff_to_real', 'context_delta']]

Value of num_rounds_list: [500, 1000, 2000, 10000, 20000]


[I 2025-10-11 18:54:30,352] A new study created in memory with name: no-name-bd3d0adb-2e99-40fd-988e-459f5f2f5a66
Best trial: 0. Best value: 0.0856866:  10%|█         | 1/10 [00:02<00:23,  2.63s/it]

{'gini': np.float64(0.5243697746036257), 'ess': np.float64(4979.857516611228), 'max_wi': np.float64(5.42511752938691), 'min_wi': np.float64(0.01962761801119335)}
Cross-validated error: 0.008975978922104751
[I 2025-10-11 18:54:32,983] Trial 0 finished with value: 0.0856866120785302 and parameters: {'lr': 0.0095, 'num_epochs': 5, 'batch_size': 64, 'num_neighbors': 8, 'lr_decay': 0.85}. Best is trial 0 with value: 0.0856866120785302.


Best trial: 1. Best value: 0.0904157:  20%|██        | 2/10 [00:04<00:16,  2.03s/it]

{'gini': np.float64(0.005872964376626676), 'ess': np.float64(9998.956446285789), 'max_wi': np.float64(1.028189840753053), 'min_wi': np.float64(0.9671353831446523)}
Cross-validated error: 0.010125484558585925
[I 2025-10-11 18:54:34,599] Trial 1 finished with value: 0.0904157242591587 and parameters: {'lr': 0.00018709859912402802, 'num_epochs': 7, 'batch_size': 256, 'num_neighbors': 13, 'lr_decay': 0.9372089833344932}. Best is trial 1 with value: 0.0904157242591587.


Best trial: 2. Best value: 0.0918008:  30%|███       | 3/10 [00:05<00:12,  1.85s/it]

{'gini': np.float64(0.05332732258328814), 'ess': np.float64(9914.243972146633), 'max_wi': np.float64(1.2786191387730608), 'min_wi': np.float64(0.770492536029325)}
Cross-validated error: 0.010468006645507729
[I 2025-10-11 18:54:36,226] Trial 2 finished with value: 0.09180080882331923 and parameters: {'lr': 0.0009832292778037953, 'num_epochs': 8, 'batch_size': 128, 'num_neighbors': 6, 'lr_decay': 0.9166093307143252}. Best is trial 2 with value: 0.09180080882331923.


Best trial: 2. Best value: 0.0918008:  40%|████      | 4/10 [00:07<00:10,  1.74s/it]

{'gini': np.float64(0.0014367616341164368), 'ess': np.float64(9999.937162340399), 'max_wi': np.float64(1.0062946593360196), 'min_wi': np.float64(0.9930561406377443)}
Cross-validated error: 0.010194265484445663
[I 2025-10-11 18:54:37,791] Trial 3 finished with value: 0.09076465301861127 and parameters: {'lr': 0.0001434749010738803, 'num_epochs': 3, 'batch_size': 512, 'num_neighbors': 10, 'lr_decay': 0.9228945262208109}. Best is trial 2 with value: 0.09180080882331923.


Best trial: 4. Best value: 0.0920474:  50%|█████     | 5/10 [00:09<00:08,  1.67s/it]

{'gini': np.float64(0.0031308780847601), 'ess': np.float64(9999.701540969014), 'max_wi': np.float64(1.0138645997112785), 'min_wi': np.float64(0.9850072177442966)}
Cross-validated error: 0.010521598883516017
[I 2025-10-11 18:54:39,353] Trial 4 finished with value: 0.09204742781275198 and parameters: {'lr': 0.0001443272615268543, 'num_epochs': 6, 'batch_size': 512, 'num_neighbors': 7, 'lr_decay': 0.9989489873440737}. Best is trial 4 with value: 0.09204742781275198.


Best trial: 4. Best value: 0.0920474:  60%|██████    | 6/10 [00:10<00:06,  1.66s/it]

{'gini': np.float64(0.011870625323873603), 'ess': np.float64(9995.749171692998), 'max_wi': np.float64(1.0567141247476532), 'min_wi': np.float64(0.9399745357143868)}
Cross-validated error: 0.01048897519594782
[I 2025-10-11 18:54:40,977] Trial 5 finished with value: 0.09184863332652407 and parameters: {'lr': 0.0005774132980352496, 'num_epochs': 5, 'batch_size': 256, 'num_neighbors': 3, 'lr_decay': 0.8496933057061515}. Best is trial 4 with value: 0.09204742781275198.


Best trial: 4. Best value: 0.0920474:  70%|███████   | 7/10 [00:12<00:04,  1.65s/it]

{'gini': np.float64(0.0012095743184314946), 'ess': np.float64(9999.955764572174), 'max_wi': np.float64(1.0056424239255024), 'min_wi': np.float64(0.9932442325438832)}
Cross-validated error: 0.010112819850751903
[I 2025-10-11 18:54:42,624] Trial 6 finished with value: 0.09045400505517123 and parameters: {'lr': 0.00020584108797785374, 'num_epochs': 1, 'batch_size': 256, 'num_neighbors': 12, 'lr_decay': 0.8558585654691033}. Best is trial 4 with value: 0.09204742781275198.


Best trial: 4. Best value: 0.0920474:  80%|████████  | 8/10 [00:13<00:03,  1.66s/it]

{'gini': np.float64(0.024693670307783248), 'ess': np.float64(9981.37317479054), 'max_wi': np.float64(1.1295923616190813), 'min_wi': np.float64(0.8887776129375236)}
Cross-validated error: 0.010135584874367581
[I 2025-10-11 18:54:44,303] Trial 7 finished with value: 0.09052758633798341 and parameters: {'lr': 0.0005295398710204488, 'num_epochs': 4, 'batch_size': 64, 'num_neighbors': 10, 'lr_decay': 0.8101628418554027}. Best is trial 4 with value: 0.09204742781275198.


Best trial: 4. Best value: 0.0920474:  90%|█████████ | 9/10 [00:15<00:01,  1.64s/it]

{'gini': np.float64(0.002256598073497488), 'ess': np.float64(9999.844780708445), 'max_wi': np.float64(1.010696556399499), 'min_wi': np.float64(0.9865985929055855)}
Cross-validated error: 0.01048839528657389
[I 2025-10-11 18:54:45,883] Trial 8 finished with value: 0.09180112565771675 and parameters: {'lr': 0.00013900262708385646, 'num_epochs': 2, 'batch_size': 128, 'num_neighbors': 3, 'lr_decay': 0.8174947625080651}. Best is trial 4 with value: 0.09204742781275198.


Best trial: 4. Best value: 0.0920474: 100%|██████████| 10/10 [00:17<00:00,  1.72s/it]

{'gini': np.float64(0.349324864002174), 'ess': np.float64(7104.1847892717415), 'max_wi': np.float64(3.8727956792412686), 'min_wi': np.float64(0.10712411157975206)}
Cross-validated error: 0.009427994483654492
[I 2025-10-11 18:54:47,550] Trial 9 finished with value: 0.08770407568959147 and parameters: {'lr': 0.005215192549880918, 'num_epochs': 5, 'batch_size': 64, 'num_neighbors': 10, 'lr_decay': 0.9043090530110316}. Best is trial 4 with value: 0.09204742781275198.



[I 2025-10-11 18:54:50,038] A new study created in memory with name: no-name-49070b65-3129-43e9-ad7d-b9c17ac17374
Best trial: 0. Best value: 0.0844128:  10%|█         | 1/10 [00:02<00:22,  2.55s/it]

{'gini': np.float64(0.004676878981958937), 'ess': np.float64(9999.339925905444), 'max_wi': np.float64(1.0195179426605883), 'min_wi': np.float64(0.9783698953331155)}
Cross-validated error: 0.00867939228362645
[I 2025-10-11 18:54:52,584] Trial 0 finished with value: 0.08441276020108382 and parameters: {'lr': 0.0001443272615268543, 'num_epochs': 6, 'batch_size': 512, 'num_neighbors': 7, 'lr_decay': 0.9989489873440737}. Best is trial 0 with value: 0.08441276020108382.


Best trial: 1. Best value: 0.085097:  20%|██        | 2/10 [00:05<00:21,  2.68s/it] 

{'gini': np.float64(0.0071551603716698735), 'ess': np.float64(9998.400668263364), 'max_wi': np.float64(1.0381303079844417), 'min_wi': np.float64(0.9708183454881437)}
Cross-validated error: 0.008832689529478823
[I 2025-10-11 18:54:55,350] Trial 1 finished with value: 0.08509701951782245 and parameters: {'lr': 0.00015436758037102436, 'num_epochs': 4, 'batch_size': 128, 'num_neighbors': 11, 'lr_decay': 0.8636353165769655}. Best is trial 1 with value: 0.08509701951782245.


Best trial: 2. Best value: 0.0892348:  30%|███       | 3/10 [00:08<00:18,  2.69s/it]

{'gini': np.float64(0.43214185903619523), 'ess': np.float64(5911.670677036117), 'max_wi': np.float64(4.876918918311484), 'min_wi': np.float64(0.060521387910777644)}
Cross-validated error: 0.009806552140314173
[I 2025-10-11 18:54:58,064] Trial 2 finished with value: 0.08923475626623324 and parameters: {'lr': 0.00962507130082993, 'num_epochs': 3, 'batch_size': 128, 'num_neighbors': 3, 'lr_decay': 0.9147941106089159}. Best is trial 2 with value: 0.08923475626623324.


Best trial: 2. Best value: 0.0892348:  40%|████      | 4/10 [00:10<00:16,  2.72s/it]

{'gini': np.float64(0.0022327022846622296), 'ess': np.float64(9999.849551025778), 'max_wi': np.float64(1.0095927526605486), 'min_wi': np.float64(0.9900409189294476)}
Cross-validated error: 0.008905382832946373
[I 2025-10-11 18:55:00,826] Trial 3 finished with value: 0.08545989888097782 and parameters: {'lr': 0.0001156701055532078, 'num_epochs': 4, 'batch_size': 512, 'num_neighbors': 13, 'lr_decay': 0.9328331754045914}. Best is trial 2 with value: 0.08923475626623324.


Best trial: 2. Best value: 0.0892348:  50%|█████     | 5/10 [00:13<00:13,  2.72s/it]

{'gini': np.float64(0.09161580021902524), 'ess': np.float64(9753.350151256796), 'max_wi': np.float64(1.3977536904425931), 'min_wi': np.float64(0.6045518275920806)}
Cross-validated error: 0.00893230038555786
[I 2025-10-11 18:55:03,536] Trial 4 finished with value: 0.08557672936672825 and parameters: {'lr': 0.008631267424009164, 'num_epochs': 2, 'batch_size': 512, 'num_neighbors': 13, 'lr_decay': 0.8714811742043391}. Best is trial 2 with value: 0.08923475626623324.


Best trial: 2. Best value: 0.0892348:  60%|██████    | 6/10 [00:16<00:11,  2.81s/it]

{'gini': np.float64(0.260578892311878), 'ess': np.float64(8225.043873766894), 'max_wi': np.float64(2.6565327985495317), 'min_wi': np.float64(0.25472151116614433)}
Cross-validated error: 0.009465633371307864
[I 2025-10-11 18:55:06,539] Trial 5 finished with value: 0.0877977913192468 and parameters: {'lr': 0.004965652129458273, 'num_epochs': 6, 'batch_size': 256, 'num_neighbors': 13, 'lr_decay': 0.9720007454543466}. Best is trial 2 with value: 0.08923475626623324.


Best trial: 2. Best value: 0.0892348:  60%|██████    | 6/10 [00:17<00:11,  2.96s/it]


[W 2025-10-11 18:55:07,784] Trial 6 failed with parameters: {'lr': 0.005695439892982343, 'num_epochs': 2, 'batch_size': 512, 'num_neighbors': 3, 'lr_decay': 0.8320351523709059} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/home/roee/Documents/code/OPC/.venv/lib/python3.12/site-packages/optuna/study/_optimize.py", line 201, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/tmp/ipykernel_621058/4006468808.py", line 144, in objective
    trial_neigh_model = NeighborhoodModel(
                        ^^^^^^^^^^^^^^^^^^
  File "/home/roee/Documents/code/OPC/models.py", line 37, in __init__
    self.fit(action_emb, context_emb, actions, context, rewards)
  File "/home/roee/Documents/code/OPC/models.py", line 44, in fit
    self.calculate_scores()
  File "/home/roee/Documents/code/OPC/models.py", line 63, in calculate_scores
    self.scores = self.context_convolve(context)
                  ^^^^^^^^^^^

KeyboardInterrupt: 

In [None]:
df4[['policy_rewards', 'ipw', 'reg_dm', 'conv_dm', 'conv_dr', 'conv_sndr', 'action_diff_to_real', 'action_delta', 'context_diff_to_real', 'context_delta']]

Unnamed: 0,policy_rewards,ipw,reg_dm,conv_dm,conv_dr,conv_sndr,action_diff_to_real,action_delta,context_diff_to_real,context_delta
0,0.08610747,0.08611765,0.08751241,0.09178279,0.09119538,0.08897455,0.7569287,0.0,0.87627132,0.0
500,0.08610803,0.0851855,0.10581378,0.12150808,0.11026441,0.06799217,0.75692486,0.00025455,0.87627243,0.00012422
1000,0.08654208,0.0851555,0.08791228,0.0901107,0.09101994,0.09363095,0.76531032,0.12496686,0.87972439,0.05553527
2000,0.08734578,0.10292059,0.08114754,0.08605725,0.09373308,0.1089554,0.78380759,0.21309293,0.89112363,0.09057986
10000,0.08616457,0.08526466,0.08415045,0.08838887,0.08738445,0.08521387,0.75946979,0.05993718,0.87631002,0.02535974
20000,0.08658768,0.10000923,0.08665215,0.09042937,0.09040613,0.09037048,0.76479957,0.09941219,0.88121865,0.04027442


In [None]:
# Show the performance metrics
df4[['policy_rewards', 'ipw', 'reg_dm', 'conv_dm', 'conv_dr', 'conv_sndr', 'action_diff_to_real', 'action_delta', 'context_diff_to_real', 'context_delta']]


Unnamed: 0,policy_rewards,ipw,reg_dm,conv_dm,conv_dr,conv_sndr,action_diff_to_real,action_delta,context_diff_to_real,context_delta
0,0.08610747,0.1069709,0.09051612,0.09112201,0.09452505,0.10672373,0.7569287,0.0,0.87627132,0.0
500,0.08705604,0.09221834,,0.08299331,0.08198609,0.07599146,0.79170973,0.24615559,0.88427728,0.08661758
1000,0.08939145,0.11301958,,0.08679147,0.09052395,0.10668506,1.01853061,0.76340735,0.91464524,0.19321758
2000,0.09251861,0.10603409,,0.09028676,0.15397776,0.10628439,1.73862067,1.70789298,0.99652312,0.34170287
10000,0.09268524,0.09704712,,0.09829317,0.09539229,0.09257621,2.18938809,2.22344507,1.03555944,0.40151858
20000,0.09264639,0.09493701,,0.09216787,0.09134166,0.09056984,2.21242505,2.24879912,1.03520993,0.40065441
