In [16]:
import warnings
warnings.filterwarnings("ignore")
from copy import deepcopy
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
import sys

sys.path.append("/code")

from tqdm import tqdm
import torch
# device = torch.device('cpu')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
# import gym
# import recogym

import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim

from sklearn.utils import check_random_state

# implementing OPE of the IPWLearner using synthetic bandit data
from sklearn.linear_model import LogisticRegression

import matplotlib.pyplot as plt

from scipy.special import softmax
import optuna
# from memory_profiler import profile


from estimators import (
    DirectMethod as DM
)

from simulation_utils import (
    eval_policy,
    generate_dataset,
    create_simulation_data_from_pi,
    get_train_data,
    get_opl_results_dict,
    CustomCFDataset,
    calc_reward
)

from models import (    
    CFModel,
    NeighborhoodModel,
    BPRModel, 
    RegressionModel
)

from training_utils import (
    fit_bpr,
    train,
    validation_loop
 )

from custom_losses import (
    SNDRPolicyLoss,
    BPRLoss
    )

# One seed for every random number generator the notebook relies on.
random_state = 12345
# Dedicated NumPy RandomState instance for code that takes an explicit RNG.
random_ = check_random_state(random_state)
# Also seed the global NumPy and torch generators: DataLoader shuffling and
# model training draw from torch's global RNG, and any helper that calls
# np.random.* directly draws from NumPy's global RNG.
np.random.seed(random_state)
torch.manual_seed(random_state)

Using device: cuda


In [17]:
# Render floats in displayed DataFrames with thousands separators and 8 decimals.
pd.set_option("display.float_format", "{:,.8f}".format)

## `trainer_trial` Function

This function runs policy learning experiments using offline bandit data and evaluates various estimators.

### Parameters
- **num_runs** (int): Number of experimental runs per training size
- **num_neighbors** (int): Number of neighbors to consider in the neighborhood model
- **num_rounds_list** (list): List of training set sizes to evaluate
- **dataset** (dict): Contains dataset information including embeddings, action probabilities, and reward probabilities
- **batch_size** (int): Batch size for training the policy model
- **num_epochs** (int): Number of training epochs for each experiment
- **lr** (float, default=0.001): Learning rate for the optimizer

### Process Flow
1. Initializes result structures and retrieval models
2. For each training size in `num_rounds_list`:
   - Builds a softmax logging policy from the current user/action embeddings and simulates logged data
   - Generates training data for offline learning
   - Fits regression and neighborhood models for reward estimation
   - Initializes and trains a counterfactual policy model
   - Evaluates policy performance using various estimators
   - Collects metrics on policy reward and embedding quality

### Returns
- **DataFrame**: Results table with rows indexed by training size and columns for various metrics:
  - `policy_rewards`: True expected reward of the learned policy
  - Various estimator errors (`ipw`, `reg_dm`, `conv_dm`, `conv_dr`, `conv_sndr`)
  - Variance metrics for each estimator
  - Embedding quality metrics comparing learned representations to ground truth

### Implementation Notes
- Uses a softmax logging policy over the current user/action embedding scores to collect offline data
- Employs Self-Normalized Doubly Robust (SNDR) policy learning
- Measures embedding quality via RMSE to original/ground truth embeddings

In [None]:
def trainer_trial(
    num_runs,
    num_neighbors,
    num_rounds_list,
    dataset,
    batch_size,
    val_size=2000,
    num_epochs=None,
    lr=None,
    n_trials=10,
):
    """Learn a policy from logged bandit data at several training-set sizes.

    For every size in ``num_rounds_list`` and every run, the function samples
    ``train_size + val_size`` logged interactions from a softmax logging
    policy built from the current embeddings, fits a regression reward model
    on the training part, chooses hyperparameters, trains a ``CFModel`` with
    ``SNDRPolicyLoss`` and records evaluation metrics. Before the very first
    training run it also records the metrics of the untrained starting policy
    under index ``0``.

    Args:
        num_runs (int): Number of repeated runs per training-set size.
        num_neighbors (int): ``num_neighbors`` passed to every
            ``NeighborhoodModel``.
        num_rounds_list (list[int]): Training-set sizes to evaluate; each one
            becomes a row of the result.
        dataset (dict): Must provide ``our_x``, ``our_a``, ``emb_x``,
            ``emb_a``, ``original_x``, ``original_a``, ``n_users``,
            ``n_actions`` and ``emb_dim``; it is also forwarded to
            ``create_simulation_data_from_pi`` and ``calc_reward``.
        batch_size (int): Batch size of the training ``DataLoader``.
        val_size (int): Number of extra simulated rounds held out for
            validation; they are served as a single batch.
        num_epochs (int | None): Fixed number of training epochs. ``None``
            (default) lets Optuna choose a value in ``[1, 10]``.
        lr (float | None): Fixed learning rate. ``None`` (default) lets Optuna
            choose a value in ``[1e-4, 1e-1]`` on a log scale.
        n_trials (int): Optuna trials per run; only used when at least one of
            ``num_epochs`` / ``lr`` is ``None``.

    Returns:
        pandas.DataFrame: One row for the starting policy (index ``0``) and
        one row per training-set size, with the columns produced by
        ``get_opl_results_dict``.

    Notes:
        The simulated data and the Optuna sampler are seeded with
        ``(run + 1) * train_size``. DataLoader shuffling and model training
        are not seeded here, so results are only repeatable if the caller
        seeds torch beforehand.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Only hyperparameters the caller did not pin are searched for.
    needs_search = lr is None or num_epochs is None

    dm = DM()
    results = {}

    our_x, our_a = dataset["our_x"], dataset["our_a"]
    emb_x, emb_a = dataset["emb_x"], dataset["emb_a"]
    original_x, original_a = dataset["original_x"], dataset["original_a"]
    n_users, n_actions, emb_dim = dataset["n_users"], dataset["n_actions"], dataset["emb_dim"]

    # The helpers below are closures: they read train_data, the data loaders,
    # the regression model and our_x / our_a as bound by whichever run is
    # executing at the moment they are called.

    def rmse(reference, current):
        """Root-mean-square difference between two embedding matrices."""
        return np.sqrt(np.mean((reference - current) ** 2))

    def build_neighborhood_model():
        """Create a NeighborhoodModel from the current run's training data."""
        return NeighborhoodModel(
            train_data['x_idx'],
            train_data['a'],
            our_a,
            our_x,
            train_data['r'],
            num_neighbors=num_neighbors
        )

    def build_policy_model():
        """Create a CFModel initialised with the current starting embeddings."""
        return CFModel(
            n_users,
            n_actions,
            emb_dim,
            initial_user_embeddings=torch.tensor(our_x, device=device),
            initial_actions_embeddings=torch.tensor(our_a, device=device)
        )

    def regression_dm(policy):
        """Direct-method value of ``policy`` using the regression reward model."""
        return dm.estimate_policy_value(
            policy[train_data['x_idx']],
            regression_model.predict(train_data['x'])
        )

    def metrics_row(policy, neighborhood_model, cur_x, cur_a):
        """Assemble one result row: reward, estimator outputs, embedding RMSEs."""
        row = eval_policy(neighborhood_model, train_data, original_policy_prob, policy)
        row = np.append(calc_reward(dataset, policy), row)
        row = np.append(row, [rmse(emb_a, cur_a), rmse(original_a, cur_a)])
        row = np.append(row, [rmse(emb_x, cur_x), rmse(original_x, cur_x)])
        return row

    def objective(trial):
        """Optuna objective: train a fresh model and return its validation score."""
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
        trial_lr = lr if lr is not None else trial.suggest_float("lr", 1e-4, 1e-1, log=True)
        trial_epochs = num_epochs if num_epochs is not None else trial.suggest_int("num_epochs", 1, 10)

        # Every trial gets its own models so trials cannot influence each other.
        trial_neigh_model = build_neighborhood_model()
        trial_model = build_policy_model()

        train(trial_model, train_loader, trial_neigh_model, criterion=SNDRPolicyLoss(), num_epochs=trial_epochs, lr=trial_lr, device=device)
        return validation_loop(trial_model, val_loader, trial_neigh_model)

    baseline_recorded = False

    for train_size in num_rounds_list:
        reg_results, conv_results = [], []

        for run in range(num_runs):
            run_seed = (run + 1) * train_size

            # Logging policy: softmax over user/action embedding scores.
            pi_0 = softmax(our_x @ our_a.T, axis=1)
            original_policy_prob = np.expand_dims(pi_0, -1)

            simulation_data = create_simulation_data_from_pi(
                dataset,
                pi_0,
                train_size + val_size,
                random_state=run_seed
            )

            # The first train_size rounds train the model, the next val_size validate it.
            train_data = get_train_data(n_actions, train_size, simulation_data, np.arange(train_size), our_x)
            val_data = get_train_data(n_actions, val_size, simulation_data, np.arange(val_size) + train_size, our_x)

            # NOTE(review): action_context receives the user embeddings (our_x);
            # confirm this is what RegressionModel expects.
            regression_model = RegressionModel(
                n_actions=n_actions,
                action_context=our_x,
                base_model=LogisticRegression(random_state=12345)
            )
            regression_model.fit(
                train_data['x'],
                train_data['a'],
                train_data['r'],
                original_policy_prob[train_data['x_idx'], train_data['a']].squeeze()
            )

            train_loader = DataLoader(
                CustomCFDataset(
                    train_data['x_idx'],
                    train_data['a'],
                    train_data['r'],
                    original_policy_prob
                ),
                batch_size=batch_size,
                shuffle=True
            )
            # The whole validation split is one batch, so shuffling adds nothing.
            val_loader = DataLoader(
                CustomCFDataset(
                    val_data['x_idx'],
                    val_data['a'],
                    val_data['r'],
                    original_policy_prob
                ),
                batch_size=val_size,
                shuffle=False
            )

            if not baseline_recorded:
                # Row 0: metrics of the untrained starting policy, recorded once.
                baseline_policy = original_policy_prob.copy()
                baseline_row = metrics_row(baseline_policy, build_neighborhood_model(), our_x, our_a)
                baseline_dm = regression_dm(baseline_policy)
                results[0] = get_opl_results_dict(np.array([baseline_dm]), np.array([baseline_row]))
                baseline_recorded = True

            if needs_search:
                study = optuna.create_study(
                    direction="maximize",
                    # Seeded so the proposed hyperparameters are repeatable.
                    sampler=optuna.samplers.TPESampler(seed=run_seed)
                )
                study.optimize(objective, n_trials=n_trials, show_progress_bar=True)
                best_params = study.best_params
            else:
                best_params = {}

            final_lr = lr if lr is not None else best_params["lr"]
            final_epochs = num_epochs if num_epochs is not None else best_params["num_epochs"]

            # Retrain from the starting embeddings with the selected hyperparameters.
            neighborhood_model = build_neighborhood_model()
            model = build_policy_model()
            train(model, train_loader, neighborhood_model, criterion=SNDRPolicyLoss(), num_epochs=final_epochs, lr=final_lr, device=device)

            learned_x, learned_a = model.get_params()
            learned_x = learned_x.detach().cpu().numpy()
            learned_a = learned_a.detach().cpu().numpy()

            policy = np.expand_dims(softmax(learned_x @ learned_a.T, axis=1), -1)

            reg_results.append(regression_dm(policy))
            conv_results.append(metrics_row(policy, neighborhood_model, learned_x, learned_a))

            # Every following run starts from the original embeddings.
            our_a, our_x = original_a.copy(), original_x.copy()

        reg_results = np.array(reg_results)
        conv_results = np.array(conv_results)

        results[train_size] = get_opl_results_dict(reg_results, conv_results)

    return pd.DataFrame.from_dict(results, orient='index')

## Learning

We will run several simulations on a generated dataset. The dataset is generated as follows: we have users $U$ and actions $A$ with embeddings
$$u_i \sim N(0, I_{EmbDim}), \qquad a_j \sim N(0, I_{EmbDim})$$
$$p_{ij} = 1 / (5 + e^{-(u_i^\top a_j)})$$
$$r_{ij} \sim \text{Bernoulli}(p_{ij})$$

We have a policy $\pi$,
and its ground-truth reward is calculated by
$$R_{gt} = \sum_{i}{\sum_{j}{\pi_{ij} * p_{ij}}} $$

Our parameters for the dataset will be
$$EmbDim = 4$$
$$NumActions = 500$$
$$NumUsers = 500$$
$$NeighborhoodSize = 6$$

to learn a new policy from $\pi$ we will sample from:
$$\pi_{start} = (1-\epsilon)*\pi + \epsilon * \pi_{random}$$

In [19]:
# Pick the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print('Using device:', device)

Using device: cuda


In [20]:
num_runs = 1

In [21]:
# Settings handed to generate_dataset for the synthetic users/actions problem.
dataset_params = {
    "n_actions": 500,
    "n_users": 500,
    "emb_dim": 4,
    # "sigma": 0.1,
    "eps": 0.4,  # this is the epsilon for the noise in the ground truth policy representation
    "ctr": 0.2,
}

train_dataset = generate_dataset(dataset_params)

Random Item CTR: 0.14805474537506452
Optimal greedy CTR: 0.1993706263086395
Optimal Stochastic CTR: 0.19237265135418052
Our Initial CTR: 0.17421144896729363


In [22]:
train_dataset.keys()

dict_keys(['emb_a', 'our_a', 'original_a', 'emb_x', 'our_x', 'original_x', 'q_x_a', 'n_actions', 'n_users', 'emb_dim', 'user_prior'])

In [23]:
# Experiment settings passed to the trainer_trial calls below.
num_runs = 1                           # repetitions per training-set size
batch_size = 200                       # batch size of the training DataLoader
num_neighbors = 6                      # num_neighbors given to NeighborhoodModel
num_rounds_list = [100, 1_000, 5_000]  # training-set sizes; one result row each

### 1

$$emb = 0.7 * gt + 0.3 * noise$$
$$lr = 0.005$$
$$n_{epochs} = 1$$
$$BatchSize=50$$

In [13]:
df4 = trainer_trial(num_runs, num_neighbors, num_rounds_list, train_dataset, batch_size, val_size=35000)

[I 2025-08-20 00:56:15,543] A new study created in memory with name: no-name-72b1b479-85f2-43ca-9141-3ae767c95a6c
Best trial: 0. Best value: 0.0597385:  10%|█         | 1/10 [00:03<00:29,  3.32s/it]

[I 2025-08-20 00:56:18,862] Trial 0 finished with value: 0.059738499466196124 and parameters: {'lr': 0.00010895008936317886, 'num_epochs': 2}. Best is trial 0 with value: 0.059738499466196124.


Best trial: 0. Best value: 0.0597385:  20%|██        | 2/10 [00:05<00:22,  2.85s/it]

[I 2025-08-20 00:56:21,385] Trial 1 finished with value: 0.059737989427549376 and parameters: {'lr': 0.0002640820740793505, 'num_epochs': 1}. Best is trial 0 with value: 0.059738499466196124.


Best trial: 0. Best value: 0.0597385:  30%|███       | 3/10 [00:08<00:18,  2.69s/it]

[I 2025-08-20 00:56:23,876] Trial 2 finished with value: 0.05972951024368253 and parameters: {'lr': 0.0001288513640104121, 'num_epochs': 8}. Best is trial 0 with value: 0.059738499466196124.


Best trial: 0. Best value: 0.0597385:  40%|████      | 4/10 [00:10<00:15,  2.61s/it]

[I 2025-08-20 00:56:26,360] Trial 3 finished with value: 0.059681123958409577 and parameters: {'lr': 0.000671615764072603, 'num_epochs': 8}. Best is trial 0 with value: 0.059738499466196124.


Best trial: 0. Best value: 0.0597385:  50%|█████     | 5/10 [00:13<00:13,  2.64s/it]

[I 2025-08-20 00:56:29,065] Trial 4 finished with value: 0.05973626383405789 and parameters: {'lr': 0.00010507512555306303, 'num_epochs': 4}. Best is trial 0 with value: 0.059738499466196124.


Best trial: 0. Best value: 0.0597385:  60%|██████    | 6/10 [00:16<00:10,  2.63s/it]

[I 2025-08-20 00:56:31,669] Trial 5 finished with value: 0.059716355951880074 and parameters: {'lr': 0.0003165569769377956, 'num_epochs': 7}. Best is trial 0 with value: 0.059738499466196124.


Best trial: 0. Best value: 0.0597385:  70%|███████   | 7/10 [00:18<00:07,  2.59s/it]

[I 2025-08-20 00:56:34,178] Trial 6 finished with value: 0.05972137075709563 and parameters: {'lr': 0.0002521050277769762, 'num_epochs': 7}. Best is trial 0 with value: 0.059738499466196124.


Best trial: 0. Best value: 0.0597385:  80%|████████  | 8/10 [00:21<00:05,  2.65s/it]

[I 2025-08-20 00:56:36,952] Trial 7 finished with value: 0.05972496580699756 and parameters: {'lr': 0.00028816280150273637, 'num_epochs': 5}. Best is trial 0 with value: 0.059738499466196124.


Best trial: 0. Best value: 0.0597385:  90%|█████████ | 9/10 [00:24<00:02,  2.63s/it]

[I 2025-08-20 00:56:39,555] Trial 8 finished with value: 0.05972290383313898 and parameters: {'lr': 0.00016266660437986273, 'num_epochs': 10}. Best is trial 0 with value: 0.059738499466196124.


Best trial: 0. Best value: 0.0597385: 100%|██████████| 10/10 [00:26<00:00,  2.66s/it]


[I 2025-08-20 00:56:42,160] Trial 9 finished with value: 0.05973503993935411 and parameters: {'lr': 0.00026551446572454444, 'num_epochs': 2}. Best is trial 0 with value: 0.059738499466196124.


[I 2025-08-20 00:56:45,920] A new study created in memory with name: no-name-309970e3-f742-44d3-be34-d3fbac95d798
Best trial: 0. Best value: 0.0796372:  10%|█         | 1/10 [00:05<00:46,  5.15s/it]

[I 2025-08-20 00:56:51,065] Trial 0 finished with value: 0.07963718864633709 and parameters: {'lr': 0.00031669020900083325, 'num_epochs': 9}. Best is trial 0 with value: 0.07963718864633709.


Best trial: 0. Best value: 0.0796372:  20%|██        | 2/10 [00:10<00:40,  5.06s/it]

[I 2025-08-20 00:56:56,072] Trial 1 finished with value: 0.0796174847990189 and parameters: {'lr': 0.0003325972397360591, 'num_epochs': 2}. Best is trial 0 with value: 0.07963718864633709.


Best trial: 2. Best value: 0.0796818:  30%|███       | 3/10 [00:15<00:35,  5.07s/it]

[I 2025-08-20 00:57:01,148] Trial 2 finished with value: 0.07968181354760832 and parameters: {'lr': 0.0009163032794591783, 'num_epochs': 7}. Best is trial 2 with value: 0.07968181354760832.


Best trial: 2. Best value: 0.0796818:  40%|████      | 4/10 [00:20<00:31,  5.26s/it]

[I 2025-08-20 00:57:06,695] Trial 3 finished with value: 0.07961508847744811 and parameters: {'lr': 0.00029569804678457746, 'num_epochs': 3}. Best is trial 2 with value: 0.07968181354760832.


Best trial: 2. Best value: 0.0796818:  50%|█████     | 5/10 [00:26<00:26,  5.33s/it]

[I 2025-08-20 00:57:12,161] Trial 4 finished with value: 0.07961253057442715 and parameters: {'lr': 0.00015786229636644586, 'num_epochs': 6}. Best is trial 2 with value: 0.07968181354760832.


Best trial: 2. Best value: 0.0796818:  60%|██████    | 6/10 [00:31<00:20,  5.17s/it]

[I 2025-08-20 00:57:17,003] Trial 5 finished with value: 0.07961328912649371 and parameters: {'lr': 0.00047794159557080877, 'num_epochs': 1}. Best is trial 2 with value: 0.07968181354760832.


Best trial: 2. Best value: 0.0796818:  70%|███████   | 7/10 [00:35<00:15,  5.07s/it]

[I 2025-08-20 00:57:21,876] Trial 6 finished with value: 0.07961642489081437 and parameters: {'lr': 0.0009076401165972231, 'num_epochs': 1}. Best is trial 2 with value: 0.07968181354760832.


Best trial: 2. Best value: 0.0796818:  80%|████████  | 8/10 [00:40<00:09,  4.99s/it]

[I 2025-08-20 00:57:26,685] Trial 7 finished with value: 0.07960997618507543 and parameters: {'lr': 0.000128626698878024, 'num_epochs': 5}. Best is trial 2 with value: 0.07968181354760832.


Best trial: 2. Best value: 0.0796818:  90%|█████████ | 9/10 [00:45<00:04,  4.98s/it]

[I 2025-08-20 00:57:31,635] Trial 8 finished with value: 0.07965698670679999 and parameters: {'lr': 0.0005414921606618851, 'num_epochs': 5}. Best is trial 2 with value: 0.07968181354760832.


Best trial: 2. Best value: 0.0796818: 100%|██████████| 10/10 [00:50<00:00,  5.08s/it]


[I 2025-08-20 00:57:36,708] Trial 9 finished with value: 0.07963042723537915 and parameters: {'lr': 0.0003509915763111679, 'num_epochs': 7}. Best is trial 2 with value: 0.07968181354760832.


[I 2025-08-20 00:58:00,050] A new study created in memory with name: no-name-285c23ff-aea3-45ba-8cee-a0e0d726373f
Best trial: 0. Best value: 0.115142:  10%|█         | 1/10 [00:22<03:21, 22.39s/it]

[I 2025-08-20 00:58:22,437] Trial 0 finished with value: 0.11514241973129649 and parameters: {'lr': 0.00021840401521487708, 'num_epochs': 9}. Best is trial 0 with value: 0.11514241973129649.


Best trial: 1. Best value: 0.115198:  20%|██        | 2/10 [00:44<02:57, 22.21s/it]

[I 2025-08-20 00:58:44,519] Trial 1 finished with value: 0.11519835250110828 and parameters: {'lr': 0.0008523001567980232, 'num_epochs': 7}. Best is trial 1 with value: 0.11519835250110828.


Best trial: 1. Best value: 0.115198:  30%|███       | 3/10 [01:06<02:34, 22.08s/it]

[I 2025-08-20 00:59:06,438] Trial 2 finished with value: 0.11516182042145146 and parameters: {'lr': 0.0005261080568123465, 'num_epochs': 6}. Best is trial 1 with value: 0.11519835250110828.


Best trial: 3. Best value: 0.11523:  40%|████      | 4/10 [01:29<02:14, 22.36s/it] 

[I 2025-08-20 00:59:29,231] Trial 3 finished with value: 0.11522962722463213 and parameters: {'lr': 0.0005515118413429057, 'num_epochs': 10}. Best is trial 3 with value: 0.11522962722463213.


Best trial: 3. Best value: 0.11523:  50%|█████     | 5/10 [01:51<01:51, 22.37s/it]

[I 2025-08-20 00:59:51,612] Trial 4 finished with value: 0.11511319983106702 and parameters: {'lr': 0.00017429541849356308, 'num_epochs': 9}. Best is trial 3 with value: 0.11522962722463213.


Best trial: 3. Best value: 0.11523:  60%|██████    | 6/10 [02:13<01:28, 22.18s/it]

[I 2025-08-20 01:00:13,420] Trial 5 finished with value: 0.11513460269641176 and parameters: {'lr': 0.0008037241817958571, 'num_epochs': 3}. Best is trial 3 with value: 0.11522962722463213.


Best trial: 3. Best value: 0.11523:  70%|███████   | 7/10 [02:34<01:05, 21.99s/it]

[I 2025-08-20 01:00:35,025] Trial 6 finished with value: 0.11511220541627182 and parameters: {'lr': 0.0002733015944364403, 'num_epochs': 3}. Best is trial 3 with value: 0.11522962722463213.


Best trial: 3. Best value: 0.11523:  80%|████████  | 8/10 [02:57<00:44, 22.12s/it]

[I 2025-08-20 01:00:57,433] Trial 7 finished with value: 0.11515818789949663 and parameters: {'lr': 0.0003414270092671607, 'num_epochs': 8}. Best is trial 3 with value: 0.11522962722463213.


Best trial: 3. Best value: 0.11523:  90%|█████████ | 9/10 [03:19<00:22, 22.24s/it]

[I 2025-08-20 01:01:19,924] Trial 8 finished with value: 0.11514077411491108 and parameters: {'lr': 0.00020208941733531894, 'num_epochs': 10}. Best is trial 3 with value: 0.11522962722463213.


Best trial: 3. Best value: 0.11523: 100%|██████████| 10/10 [03:41<00:00, 22.16s/it]


[I 2025-08-20 01:01:41,640] Trial 9 finished with value: 0.11508644752396731 and parameters: {'lr': 0.00010616194706309917, 'num_epochs': 5}. Best is trial 3 with value: 0.11522962722463213.


In [14]:
# Environment diagnostics (torch is already imported in the notebook's first cell).
print("Torch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
print("CUDA version:", torch.version.cuda)
# get_device_name raises when no CUDA device is present, so only query it on
# GPU machines; the rest of the notebook already supports the CPU fallback.
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))
else:
    print("GPU: none (running on CPU)")


Torch version: 2.4.0+cu124
CUDA available: True
CUDA version: 12.4
GPU: NVIDIA RTX 6000 Ada Generation


In [15]:
df4[['policy_rewards', 'ipw', 'reg_dm', 'conv_dm', 'conv_dr', 'conv_sndr', 'action_diff_to_real', 'action_delta', 'context_diff_to_real', 'context_delta']]

Unnamed: 0,policy_rewards,ipw,reg_dm,conv_dm,conv_dr,conv_sndr,action_diff_to_real,action_delta,context_diff_to_real,context_delta
0,0.17421145,0.21861102,0.18602298,0.17799735,0.18827766,0.21464792,0.55681423,0.0,0.47484491,0.0
100,0.17421366,0.21859848,0.18602487,0.17799862,0.18826666,0.21464062,0.5568063,0.00021777,0.4748423,8.877e-05
1000,0.17434445,0.18574171,0.17524174,0.17857121,0.18071052,0.18645199,0.55774376,0.01835192,0.4745455,0.01017083
5000,0.17448628,0.18943531,0.18122277,0.18266715,0.18193198,0.17994498,0.55813659,0.03636377,0.47434188,0.02381384


In [None]:
df4

Unnamed: 0,policy_rewards,ipw,reg_dm,conv_dm,conv_dr,conv_sndr,ipw_var,reg_dm_var,conv_dm_var,conv_dr_var,conv_sndr_var,action_diff_to_real,action_delta,context_diff_to_real,context_delta
0,0.14805475,0.1474,0.14741001,0.16662598,0.1474,0.1474,0.0,0.0,0.0,0.0,0.0,1.39203558,0.0,1.18711227,0.0
30000,0.14805615,0.14738587,0.14741001,0.16662598,0.14740833,0.14738587,0.0,0.0,0.0,0.0,0.0,1.39245616,0.00427159,1.18787022,0.00409119
60000,0.14804445,0.14670924,0.14675499,0.33325195,0.14722087,0.14670924,0.0,0.0,0.0,0.0,0.0,1.39336612,0.013553,1.19154604,0.01465414
80000,0.14805309,0.1465878,0.14654124,0.16662598,0.14661765,0.1465878,0.0,0.0,0.0,0.0,0.0,1.39277489,0.00881483,1.18945526,0.00949291
90000,0.14800507,0.14485716,0.14544778,0.0,0.14476698,0.14485716,0.0,0.0,0.0,0.0,0.0,1.39511598,0.01295597,1.18752366,0.00677315


In [None]:
num_rounds_list = [3000, 6000, 8000, 9000]

### 2

$$emb = 0.7 * gt + 0.3 * noise$$
$$lr = 0.001$$
$$n_{epochs} = 1$$
$$BatchSize=50$$

In [None]:
df5 = trainer_trial(num_runs, num_neighbors, num_rounds_list, train_dataset, batch_size, val_size=3500)

NameError: name 'trainer_trial' is not defined

In [None]:
df5

### 3

$$emb = 0.7 * gt + 0.3 * noise$$
$$lr = 0.003$$
$$n_{epochs} = 10$$
$$BatchSize=50$$

In [None]:
df6 = trainer_trial(num_runs, num_neighbors, num_rounds_list, train_dataset, batch_size)

In [None]:
df6

### 4

$$emb = 0.7 * gt + 0.3 * noise$$
$$lr = 0.05$$
$$n_{epochs} = 10$$
$$BatchSize=150$$

In [None]:
df7 = trainer_trial(num_runs, num_neighbors, num_rounds_list[:-3], train_dataset, batch_size+100, num_epochs=10, lr=0.05)

In [None]:
df7