In [36]:
import warnings
warnings.filterwarnings("ignore")
from copy import deepcopy
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd

from tqdm import tqdm
import torch
device = torch.device('cpu')


import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim

from sklearn.utils import check_random_state

# implementing OPE of the IPWLearner using synthetic bandit data
from sklearn.linear_model import LogisticRegression

import matplotlib.pyplot as plt

from scipy.special import softmax
from abc import ABCMeta


from from_saito import (
    DirectMethod as DM,
)

from my_utils import (
    eval_policy,
    generate_dataset,
    create_simluation_data_from_pi,
    get_train_data,
    CFModel,
    CustomCFDataset,
    NeighborhoodModel,
    # BPRModel
)
random_state=12345
random_ = check_random_state(random_state)

In [109]:
class BPRModel(nn.Module):
    def __init__(self, num_users, num_actions, embedding_dim, 
                 initial_user_embeddings=None, initial_actions_embeddings=None):
        super(BPRModel, self).__init__()

        self.actions = torch.arange(num_actions)
        self.users = torch.arange(num_users)
        
        # Initialize user and actions embeddings
        if initial_user_embeddings is None:
            self.user_embeddings = nn.Embedding(num_users, embedding_dim)
        else:
            # If initial embeddings are provided, set them as the embeddings
            self.user_embeddings = nn.Embedding.from_pretrained(initial_user_embeddings, freeze=False)
        
        if initial_actions_embeddings is None:
            self.actions_embeddings = nn.Embedding(num_actions, embedding_dim)
        else:
            # If initial embeddings are provided, set them as the embeddings
            self.actions_embeddings = nn.Embedding.from_pretrained(initial_actions_embeddings, freeze=False)


    def forward(self, user_ids, pos_action_ids, neg_action_ids):
        user_embeds = self.user_embeddings(user_ids)
        pos_action_embeds = self.actions_embeddings(pos_action_ids)
        neg_action_embeds = self.actions_embeddings(neg_action_ids)

        # Compute dot product between user and action embeddings
        pos_scores = (user_embeds * pos_action_embeds).sum(dim=1)
        neg_scores = (user_embeds * neg_action_embeds).sum(dim=1)

        return pos_scores, neg_scores
    
    def calc_scores(self, user_ids):
                # Get embeddings for users and actions
        user_embedding = self.user_embeddings(user_ids)
        actions_embedding = self.actions_embeddings
        
        # Calculate dot product between user and actions embeddings
        scores = user_embedding @ actions_embedding(self.actions).T
        
        # Apply softmax to get the predicted probability distribution
        return F.softmax(scores, dim=1).unsqueeze(-1)
    
    def to(self, device):
        # Move the module itself
        super().to(device)
        self.actions = self.actions.to(device)
        self.users = self.users.to(device)
        return self

## `calc_reward` Function
Calculates the expected reward of a policy by computing the weighted average of true reward probabilities.

### Parameters
- `dataset` (dict): Contains dataset information including `q_x_a`, the true reward probabilities for each user-action pair
- `policy` (numpy.ndarray): Policy probabilities with shape [n_users, n_actions, 1]

### Returns
- `numpy.ndarray`: A single-element array containing the expected policy reward

### Mathematical Formulation
Implements: $R_{gt} = \frac{1}{n}\sum_{i=1}^{n}\sum_{j=1}^{m}{\pi_{ij} \cdot p_{ij}}$

Where:
- $\pi_{ij}$ is the policy probability for user $i$ choosing action $j$
- $p_{ij}$ is the true reward probability for user $i$ choosing action $j$ (stored in `q_x_a`)

In [38]:
def calc_reward(dataset, policy):
    return np.array([np.sum(dataset['q_x_a'] * policy.squeeze(), axis=1).mean()])

In [39]:
pd.options.display.float_format = '{:,.4f}'.format

## `get_opl_results_dict` Function

This function processes evaluation results from various offline policy learning (OPL) estimators and computes summary statistics.

### Parameters
- **reg_results** (numpy.ndarray): Results from regression-based direct method estimator
- **conv_results** (numpy.ndarray): Results from various estimators including true rewards and embeddings quality metrics

### Returns
- **dict**: A dictionary containing the following metrics:
  - `policy_rewards`: Mean true reward of the learned policy
  - Error metrics (absolute difference between estimator and true reward):
    - `ipw`: Inverse Propensity Weighting estimator error
    - `reg_dm`: Regression-based Direct Method estimator error
    - `conv_dm`: Convolution-based Direct Method estimator error
    - `conv_dr`: Convolution-based Doubly Robust estimator error
    - `conv_sndr`: Convolution-based Self-Normalized Doubly Robust estimator error
  - Variance metrics for each estimator:
    - `ipw_var`, `reg_dm_var`, `conv_dm_var`, `conv_dr_var`, `conv_sndr_var`
  - Embedding quality metrics:
    - `action_diff_to_real`: RMSE between learned and real action embeddings
    - `action_delta`: RMSE between learned and original action embeddings
    - `context_diff_to_real`: RMSE between learned and real context embeddings
    - `context_delta`: RMSE between learned and original context embeddings

### Implementation Notes
- Uses the first column of `conv_results` as the ground truth reward
- Contains commented-out code for percentage error calculations
- Computes absolute errors rather than signed differences

In [40]:
def get_opl_results_dict(reg_results, conv_results):
    reward = conv_results[:, 0]
    return    dict(
                policy_rewards=np.mean(reward),
                ipw=np.mean(abs(conv_results[: ,3] - reward)),
                reg_dm=np.mean(abs(reg_results - reward)),
                conv_dm=np.mean(abs(conv_results[: ,1] - reward)),
                conv_dr=np.mean(abs(conv_results[: ,2] - reward)),
                conv_sndr=np.mean(abs(conv_results[: ,4] - reward)),

                ipw_var=np.var(conv_results[: ,3]),
                reg_dm_var=np.var(reg_results),
                conv_dm_var=np.var(conv_results[: ,1]),
                conv_dr_var=np.var(conv_results[: ,2]),
                conv_sndr_var=np.var(conv_results[: ,4]),

                                
                # ipw_p_err=np.mean(abs(conv_results[: ,3] - reward) / reward) * 100,
                # reg_dm_p_err=np.mean(abs(reg_results - reward) / reward) * 100,
                # conv_dm_p_err=np.mean(abs(conv_results[: ,1] - reward) / reward) * 100,
                # conv_dr_p_err=np.mean(abs(conv_results[: ,2] - reward) / reward) * 100,
                # conv_sndr_p_err=np.mean(abs(conv_results[: ,4] - reward) / reward) * 100,
                
                action_diff_to_real=np.mean(conv_results[: ,5]),
                action_delta=np.mean(conv_results[: ,6]),
                context_diff_to_real=np.mean(conv_results[: ,7]),
                context_delta=np.mean(conv_results[: ,8])
                )

## `IPWPolicyLoss` Class

This class implements an Inverse Propensity Weighting (IPW) loss function for counterfactual policy learning from offline bandit data.

### Mathematical Formulation
The loss implements the IPW estimator as a differentiable function:

$$\mathcal{L}_{IPW} = \frac{1}{n}\sum_{i=1}^{n} \frac{\pi_e(a_i|x_i)}{\pi_0(a_i|x_i)} \cdot r_i \cdot \log(\pi_e(a_i|x_i))$$

Where:
- $\pi_e(a_i|x_i)$ is the probability of the new policy taking action $a_i$ for context $x_i$
- $\pi_0(a_i|x_i)$ is the propensity score (probability of the logging policy)
- $r_i$ is the observed reward
- $n$ is the batch size

### Parameters
- **log_eps** (float): Small constant added to prevent numerical instability in log calculations

### Method: `forward`
- **pscore** (Tensor): Propensity scores from original logging policy
- **scores** (Tensor): Model-estimated reward predictions for each action (not being used)
- **policy_prob** (Tensor): Probabilities from current policy being optimized
- **original_policy_rewards** (Tensor): Observed rewards from logged data
- **original_policy_actions** (Tensor): Actions that were taken in the logged data

### Implementation Notes
- Importance weights (`iw`) are detached from the computation graph
- Uses the REINFORCE policy gradient method
- The implementation includes commented-out code for more advanced variants

In [41]:
class IPWPolicyLoss(nn.Module):
    def __init__(self, log_eps=1e-10):
        super(IPWPolicyLoss, self).__init__()
        self.log_eps = log_eps

    def forward(self, pscore, scores, policy_prob, original_policy_rewards, original_policy_actions):
        n = original_policy_actions.shape[0]

        pi_e_at_position = policy_prob[torch.arange(n), original_policy_actions].squeeze()
        iw = pi_e_at_position / pscore
        iw = iw.detach()
        # q_hat_at_position = scores[torch.arange(n), original_policy_actions].squeeze()
        # dm_grads = (scores * policy_prob.detach() * torch.log(policy_prob)).sum(dim=1)
        log_pi = torch.log(pi_e_at_position).squeeze()
        
        # reinforce trick step
        # reinforce_grad = ((iw * (original_policy_rewards - q_hat_at_position) * log_pi) / iw.sum()) + dm_grads
        reinforce_grad = iw * original_policy_rewards * log_pi
        
        return reinforce_grad.mean()


## `SNDRPolicyLoss` Class

This class implements a Self-Normalized Doubly Robust (SNDR) loss function for counterfactual policy learning from offline bandit data.

### Mathematical Formulation
The loss combines IPW with direct method estimates for variance reduction:

$$\mathcal{L}_{SNDR} = \frac{1}{n}\sum_{i=1}^{n} \left( \frac{\sum_{i=1}^{n}\frac{\pi_e(a_i|x_i)}{\pi_0(a_i|x_i)} \cdot (r_i - \hat{q}(x_i,a_i))}{\sum_{i=1}^{n}\frac{\pi_e(a_i|x_i)}{\pi_0(a_i|x_i)}} + \sum_{a}\pi_e(a|x_i)\hat{q}(x_i,a) \right) \cdot \log(\pi_e(a_i|x_i))$$

Where:
- $\pi_e(a_i|x_i)$ is the probability from the new policy
- $\pi_0(a_i|x_i)$ is the propensity score from the logging policy
- $r_i$ is the observed reward
- $\hat{q}(x_i,a_i)$ is the estimated reward from a direct model
- $n$ is the batch size

### Parameters
- **log_eps** (float): Small constant added to prevent numerical instability in log calculations

### Method: `forward`
- **pscore** (Tensor): Propensity scores from original logging policy
- **scores** (Tensor): Model-estimated reward predictions for each action
- **policy_prob** (Tensor): Probabilities from current policy being optimized
- **original_policy_rewards** (Tensor): Observed rewards from logged data
- **original_policy_actions** (Tensor): Actions that were taken in the logged data

### Implementation Notes
- Combines direct method rewards with importance-weighted corrections
- Self-normalizes the importance weights by dividing by their sum
- Generally provides lower variance estimates than pure IPW approaches

In [42]:
class SNDRPolicyLoss(nn.Module):
    def __init__(self, log_eps=1e-10):
        super(SNDRPolicyLoss, self).__init__()
        self.log_eps = log_eps

    def forward(self, pscore, scores, policy_prob, original_policy_rewards, original_policy_actions):
        n = original_policy_actions.shape[0]

        pi_e_at_position = policy_prob[torch.arange(n), original_policy_actions].squeeze()
        iw = pi_e_at_position / pscore
        iw = iw.detach()
        q_hat_at_position = scores[torch.arange(n), original_policy_actions].squeeze()
        dm_reward = (scores * policy_prob.detach()).sum(dim=1)
        log_pi = torch.log(pi_e_at_position).squeeze()
        
        # reinforce trick step
        r_hat = ((iw * (original_policy_rewards - q_hat_at_position)) / iw.sum()) + dm_reward
        reinforce_grad = r_hat * log_pi
        return reinforce_grad.mean()

In [43]:
class BPRLoss(nn.Module):
    def __init__(self, log_eps=1e-10):
        super(BPRLoss, self).__init__()

    def forward(self, pscore, scores, policy_prob, original_policy_rewards, original_policy_actions):
        num_items = policy_prob.shape[1]
        batch_size = scores.size(0)

        # Filter to only positive-reward samples (reward == 1)
        mask = original_policy_rewards > 0
        if mask.sum() == 0:
            return torch.tensor(0.0, device=scores.device)

        pos_idx = torch.arange(batch_size)[mask]
        pos_actions = original_policy_actions[mask]
        pos_scores = scores[pos_idx, pos_actions]
        pos_pscore = pscore[mask]

        # Sample negative actions not equal to the positive ones
        neg_actions = torch.randint(0, num_items, size=(pos_idx.size(0),), device=scores.device)
        conflict = neg_actions == pos_actions
        
        while conflict.any():
            neg_actions[conflict] = torch.randint(0, num_items, size=(conflict.sum(),), device=scores.device)
            conflict = neg_actions == pos_actions

        neg_scores = scores[pos_idx, neg_actions]

        # Compute pairwise BPR loss
        bpr = -torch.log(torch.sigmoid(pos_scores - neg_scores) + 1e-10)

        # Importance weighting using inverse propensity score
        loss = (bpr / (pos_pscore + 1e-6)).mean()

        return loss

## `train` Function

This function trains a policy model with Self-Normalized Doubly Robust (SNDR) loss for counterfactual policy learning.

### Parameters
- **model** (CFModel): The policy model to be trained, which maps users to action probabilities
- **train_loader** (DataLoader): PyTorch data loader containing training data with user indices, actions, rewards, and logging policy probabilities
- **neighborhood_model** (NeighborhoodModel): Model that provides reward estimates based on neighborhood information
- **num_epochs** (int, default=1): Number of training epochs
- **lr** (float, default=0.0001): Learning rate for the Adam optimizer
- **device** (str or torch.device, default='cpu'): Device to run the training on

### Process Flow
1. Initializes an Adam optimizer and SNDR loss criterion
2. For each epoch:
   - Iterates through batches from the data loader
   - Moves data to specified device (CPU/GPU)
   - Gets policy probabilities by running the model on user indices
   - Computes propensity scores from the logging policy
   - Gets reward predictions from neighborhood model
   - Calculates loss using the SNDR criterion
   - Performs backpropagation and optimization
   - Tracks and displays running loss statistics

### Implementation Notes
- Uses `tqdm` for progress visualization
- Contains commented-out code for neighborhood model updates

In [44]:
# 4. Define the training function
def train(model, train_loader, neighborhood_model, num_epochs=1, lr=0.0001, device='cpu'):

    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr) # here we can change the learning rate
    criterion = SNDRPolicyLoss()

    model.train() # Set the model to training mode
    tq = tqdm(range(num_epochs))
    for epoch in tq:
        running_loss = 0.0
        total_samples = 0
        
        for user_idx, action_idx, rewards, original_prob in train_loader:
            # Move data to GPU if available
            if torch.cuda.is_available():
                user_idx = user_idx.to(device) 
                action_idx = action_idx.to(device)
                rewards = rewards.to(device)
                original_prob = original_prob.to(device) 
            
            # Forward pass
            policy = model(user_idx)
            pscore = original_prob[torch.arange(user_idx.shape[0]), action_idx.type(torch.long)]
            
            scores = torch.tensor(neighborhood_model.predict(user_idx.cpu().numpy()), device='cpu')
            
            loss = criterion(
                              pscore,
                              scores,
                              policy, 
                              rewards, 
                              action_idx.type(torch.long), 
                              )
            
            # Zero the gradients Backward pass and optimization
            optimizer.zero_grad()

            loss.backward()                        
            optimizer.step()
            
            # update neighborhood
            # action_emb, context_emb = model.get_params()
            
            # Calculate running loss and accuracy
            running_loss += loss.item()
            total_samples += 1

            # Print statistics after each epoch
            epoch_loss = running_loss / total_samples
            tq.set_description(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}")
            
        # neighborhood_model.update(action_emb.detach().numpy(), context_emb.detach().numpy())


In [126]:
def fit_bpr(model, loss_fn, data_loader, num_epochs=5, lr=0.0001, device='cpu'):
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr) # here we can change the learning rate

    model.train() # Set the model to training mode
    tq = tqdm(range(num_epochs))
    for epoch in tq:
        running_loss = 0.0
        total_samples = 0
        
        for user_idx, action_idx, rewards, original_prob in data_loader:
            # Move data to GPU if available
            if torch.cuda.is_available():
                user_idx = user_idx.to(device) 
                action_idx = action_idx.to(device)
                rewards = rewards.to(device)
                original_prob = original_prob.to(device) 
            
            # Forward pass
            policy = model.calc_scores(user_idx)
            pscore = original_prob[torch.arange(user_idx.shape[0]), action_idx.type(torch.long)]
            
            # scores = torch.tensor(model.calc_scores(user_idx.cpu().numpy()), device='cpu')
            scores = policy.clone()
            
            loss = loss_fn(
                            pscore,
                            scores,
                            policy, 
                            rewards, 
                            action_idx.type(torch.long), 
                            )
            
            # Zero the gradients Backward pass and optimization
            optimizer.zero_grad()

            loss.backward()                        
            optimizer.step()
            
            # update neighborhood
            # action_emb, context_emb = model.get_params()
            
            # Calculate running loss and accuracy
            running_loss += loss.item()
            total_samples += 1

            # Print statistics after each epoch
            epoch_loss = running_loss / total_samples
            tq.set_description(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}")

## `trainer_trial` Function

This function runs policy learning experiments using offline bandit data and evaluates various estimators.

### Parameters
- **num_runs** (int): Number of experimental runs per training size
- **num_neighbors** (int): Number of neighbors to consider in the neighborhood model
- **num_rounds_list** (list): List of training set sizes to evaluate
- **dataset** (dict): Contains dataset information including embeddings, action probabilities, and reward probabilities
- **batch_size** (int): Batch size for training the policy model
- **num_epochs** (int): Number of training epochs for each experiment
- **lr** (float, default=0.001): Learning rate for the optimizer

### Process Flow
1. Initializes result structures and retrieval models
2. For each training size in `num_rounds_list`:
   - Creates a uniform logging policy and simulates data
   - Generates training data for offline learning
   - Fits regression and neighborhood models for reward estimation
   - Initializes and trains a counterfactual policy model
   - Evaluates policy performance using various estimators
   - Collects metrics on policy reward and embedding quality

### Returns
- **DataFrame**: Results table with rows indexed by training size and columns for various metrics:
  - `policy_rewards`: True expected reward of the learned policy
  - Various estimator errors (`ipw`, `reg_dm`, `conv_dm`, `conv_dr`, `conv_sndr`)
  - Variance metrics for each estimator
  - Embedding quality metrics comparing learned representations to ground truth

### Implementation Notes
- Uses uniform random logging policy for collecting offline data
- Employs Self-Normalized Doubly Robust (SNDR) policy learning
- Measures embedding quality via RMSE to original/ground truth embeddings

In [None]:
def trainer_trial(
                  num_runs,
                  num_neighbors,
                  num_rounds_list,
                  dataset,
                  batch_size,
                  num_epochs,
                  lr=0.001
                  ):
    # Define device at the beginning
    device = torch.device('cpu')  # Force CPU usage
    
    dm = DM()
    results = {}

    our_x, our_a = dataset["our_x"], dataset["our_a"]
    emb_x, emb_a = dataset["emb_x"], dataset["emb_a"]
    
    original_x, original_a = dataset["original_x"], dataset["original_a"]
    n_users, n_actions, emb_dim = dataset["n_users"], dataset["n_actions"], dataset["emb_dim"]
    first = True
    zero = True
    for train_size in num_rounds_list:
        reg_results, conv_results = [], []
        for run in range(num_runs):

            pi_0 = np.ones_like(dataset["q_x_a"])/(dataset["n_actions"])
            original_policy_prob = np.expand_dims(pi_0, -1)
            simulation_data = create_simluation_data_from_pi(
                                                            pi_0,
                                                            dataset["q_x_a"],
                                                            dataset["n_users"],
                                                            dataset["n_actions"],
                                                            random_state=train_size*(run+1)
                                                            )
            
            # test_data = get_test_data(dataset, simulation_data, n_test_data)
            
            # idx = np.arange(train_size) + n_test_data
            idx = np.arange(train_size)
            train_data = get_train_data(n_actions, train_size, simulation_data, idx, our_x)
            
            bpr_model = BPRModel(
                                 dataset["n_users"],
                                 dataset["n_actions"],
                                 dataset["emb_x"].shape[1], 
                                 torch.tensor(our_x, device='cpu'), 
                                 torch.tensor(our_a, device='cpu'))
            
            neighberhoodmodel = NeighborhoodModel(
                                                    train_data['x_idx'],
                                                    train_data['a'], 
                                                    our_a,
                                                    our_x, 
                                                    train_data['r'], 
                                                    num_neighbors=num_neighbors
                                                )
            

            model = CFModel(
                            n_users, 
                            n_actions, 
                            emb_dim, 
                            initial_user_embeddings=torch.tensor(our_x, device='cpu'), 
                            initial_actions_embeddings=torch.tensor(our_a, device='cpu')
                            )
            
            cf_dataset =  CustomCFDataset(
                                       train_data['x_idx'], 
                                       train_data['a'], 
                                       train_data['r'], 
                                       original_policy_prob[train_data['x_idx']]
                                       )
            
            train_loader = DataLoader(cf_dataset, batch_size=batch_size, shuffle=False)
            
            if first:
                policy = np.expand_dims(softmax(our_x @ our_a.T, axis=1), -1)
                conv_results.append(eval_policy(neighberhoodmodel, train_data, original_policy_prob[train_data['x_idx']], policy))
                conv_results[-1] = np.append(calc_reward(dataset, policy), conv_results[-1])
                conv_results[-1] = np.append(conv_results[-1], [np.sqrt(np.mean((emb_a-our_a)**2)), np.sqrt(np.mean((original_a-our_a)**2))])
                conv_results[-1] = np.append(conv_results[-1], [np.sqrt(np.mean((emb_x-our_x)**2)), np.sqrt(np.mean((original_x-our_x)**2))])
                bpr_scores = bpr_model.calc_scores(torch.tensor(train_data['x_idx'], device='cpu', dtype=torch.long)).detach().cpu().numpy()
                reg_dm = dm.estimate_policy_value(policy[train_data['x_idx']], bpr_scores)
                reg_results.append(reg_dm)
                first = False
                reg_results = np.array(reg_results)
                conv_results = np.array(conv_results)
                results[0] = get_opl_results_dict(reg_results, conv_results)
                reg_results, conv_results = [], []
            
            Bloss = BPRLoss()
            train(model, train_loader, neighberhoodmodel, num_epochs=num_epochs, lr=lr, device='cpu')
            fit_bpr(bpr_model, Bloss, train_loader, num_epochs=num_epochs, lr=lr, device='cpu')
            # neighborhood_model.update(model.get_params()[0].detach().numpy(), model.get_params()[1].detach().numpy())'

            our_a, our_x = model.get_params()
            our_a, our_x = our_a.detach().cpu().numpy(), our_x.detach().cpu().numpy()

            policy = np.expand_dims(softmax(our_x @ our_a.T, axis=1), -1)

            # reg_dm = dm.estimate_policy_value(policy[test_data['x_idx']], regression_model.predict(test_data['x']))
            bpr_scores = bpr_model.calc_scores(torch.tensor(train_data['x_idx'], device='cpu', dtype=torch.long)).detach().cpu().numpy()
            reg_dm = dm.estimate_policy_value(policy[train_data['x_idx']], bpr_scores)

            reg_results.append(reg_dm)

            # conv_results.append(eval_policy(neighberhoodmodel, test_data, original_policy_prob[test_data['x_idx']], policy))
            conv_results.append(eval_policy(neighberhoodmodel, train_data, original_policy_prob[train_data['x_idx']], policy))

            conv_results[-1] = np.append(calc_reward(dataset, policy), conv_results[-1])
            conv_results[-1] = np.append(conv_results[-1], [np.sqrt(np.mean((emb_a-our_a)**2)), np.sqrt(np.mean((original_a-our_a)**2))])

            # temp.append(np.mean((emb_a-our_a)**2, axis=0))

            conv_results[-1] = np.append(conv_results[-1], [np.sqrt(np.mean((emb_x-our_x)**2)), np.sqrt(np.mean((original_x-our_x)**2))])
            
            our_a, our_x = original_a.copy(), original_x.copy()

        reg_results = np.array(reg_results)
        conv_results = np.array(conv_results)

        results[train_size] = get_opl_results_dict(reg_results, conv_results)
    
    return pd.DataFrame.from_dict(results, orient='index')

## Learning

We will run several simulations on a generated dataset, the dataset is generated like this:
$$ \text{We have users U and actions A } u_i \sim N(0, I_{emb_dim}) \ a_i \sim N(0, I_{emb_dim})$$
$$ p_{ij} = 1 / (5 + e^{-(u_i.T a_j)}) $$
$$r_{ij} \sim Bin(p_{ij})$$

We have a policy $\pi$
and it's ground truth reward is calculated by
$$R_{gt} = \sum_{i}{\sum_{j}{\pi_{ij} * p_{ij}}} $$

Our parameters for the dataset will be
$$EmbDim = 5$$
$$NumActions= 150$$
$$NumUsers = 150$$
$$NeighborhoodSize = 6$$

to learn a new policy from $\pi$ we will sample from:
$$\pi_{start} = (1-\epsilon)*\pi + \epsilon * \pi_{random}$$

In [134]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

Using device: cpu


In [135]:
num_runs = 1

In [136]:
dataset_params = dict(
                    n_actions= 150,
                    n_users = 150,
                    emb_dim = 5,
                    # sigma = 0.1,
                    eps = 0.3 # this is the epsilon for the noise in the ground truth policy representation
                    )

train_dataset = generate_dataset(dataset_params)

In [137]:
num_runs = 1
batch_size = 50
num_neighbors = 6
num_rounds_list = [1, 2, 3, 4, 5, 10, 20]

### 1

$$emb = 0.7 * gt + 0.3 * noise$$
$$lr = 0.005$$
$$n_{epochs} = 1$$
$$BatchSize=50$$

In [138]:
df4 = trainer_trial(num_runs, num_neighbors, num_rounds_list, train_dataset, batch_size, num_epochs=1, lr=0.005)

Epoch [1/1], Loss: -0.9694: 100%|██████████| 1/1 [00:00<00:00, 51.31it/s]
Epoch [1/1], Loss: 104.6335: 100%|██████████| 1/1 [00:00<00:00, 21.35it/s]
Epoch [1/1], Loss: -1.1042: 100%|██████████| 1/1 [00:00<00:00, 26.39it/s]
Epoch [1/1], Loss: 103.7447: 100%|██████████| 1/1 [00:00<00:00, 25.19it/s]
Epoch [1/1], Loss: -1.0640: 100%|██████████| 1/1 [00:00<00:00, 19.56it/s]
Epoch [1/1], Loss: 103.9204: 100%|██████████| 1/1 [00:00<00:00, 18.92it/s]
Epoch [1/1], Loss: -0.8721: 100%|██████████| 1/1 [00:00<00:00, 14.95it/s]
Epoch [1/1], Loss: 104.0008: 100%|██████████| 1/1 [00:00<00:00,  9.03it/s]
Epoch [1/1], Loss: -0.9946: 100%|██████████| 1/1 [00:00<00:00, 11.27it/s]
Epoch [1/1], Loss: 103.7537: 100%|██████████| 1/1 [00:00<00:00, 11.52it/s]
Epoch [1/1], Loss: -1.0438: 100%|██████████| 1/1 [00:00<00:00,  4.69it/s]
Epoch [1/1], Loss: 103.8728: 100%|██████████| 1/1 [00:00<00:00,  4.99it/s]
Epoch [1/1], Loss: -1.1014: 100%|██████████| 1/1 [00:00<00:00,  2.14it/s]
Epoch [1/1], Loss: 103.9160: 100

In [139]:
df4[['policy_rewards', 'ipw', 'reg_dm', 'conv_dm', 'conv_dr', 'conv_sndr', 'action_diff_to_real', 'action_delta', 'context_diff_to_real', 'context_delta']]

Unnamed: 0,policy_rewards,ipw,reg_dm,conv_dm,conv_dr,conv_sndr,action_diff_to_real,action_delta,context_diff_to_real,context_delta
0,0.1815,0.0035,0.1541,0.0128,0.0317,0.0333,0.3386,0.0,0.5364,0.0
1,0.1452,0.0443,0.1382,0.0032,0.0211,0.0221,1.2613,1.0678,1.1922,1.0657
2,0.1453,0.0118,0.1383,0.0242,0.0361,0.0302,1.2626,1.0696,1.1944,1.0672
3,0.1453,0.0018,0.1382,0.0133,0.0149,0.0147,1.2653,1.0727,1.1965,1.069
4,0.1453,0.0008,0.1383,0.0104,0.002,0.0029,1.2672,1.075,1.2,1.0725
5,0.1452,0.0513,0.1382,0.0087,0.0509,0.0529,1.2689,1.0768,1.1997,1.072
10,0.1452,0.0229,0.1381,0.0065,0.0008,0.0006,1.2821,1.0921,1.2127,1.0857
20,0.1452,0.0098,0.1381,0.0034,0.0127,0.0124,1.3096,1.1244,1.2361,1.1148


### 2

$$emb = 0.7 * gt + 0.3 * noise$$
$$lr = 0.001$$
$$n_{epochs} = 1$$
$$BatchSize=50$$

In [None]:
df5 = trainer_trial(num_runs, num_neighbors, num_rounds_list, train_dataset, batch_size, num_epochs=1, lr=0.001)

TypeError: DoublyRobust.estimate_policy_value() got multiple values for argument 'pscore'

In [None]:
df5

Unnamed: 0,policy_rewards,ipw,reg_dm,conv_dm,conv_dr,conv_sndr,ipw_var,reg_dm_var,conv_dm_var,conv_dr_var,conv_sndr_var,action_diff_to_real,action_delta,context_diff_to_real,context_delta
0,0.1814,0.0215,0.0313,0.0228,0.0108,0.0128,0.0,0.0,0.0,0.0,0.0,0.4007,0.0,0.4486,0.0
1,0.1456,0.048,0.0132,0.0063,0.0409,0.0467,0.0,0.0,0.0,0.0,0.0,1.2246,1.0978,1.2253,1.0979
2,0.1456,0.0487,0.0025,0.0074,0.0387,0.0351,0.0,0.0,0.0,0.0,0.0,1.225,1.0983,1.2257,1.0982
3,0.1456,0.0546,0.0093,0.0041,0.0514,0.0499,0.0,0.0,0.0,0.0,0.0,1.2251,1.0986,1.2259,1.0985
4,0.1456,0.0243,0.0064,0.0051,0.0048,0.0053,0.0,0.0,0.0,0.0,0.0,1.2258,1.0993,1.2266,1.0992
5,0.1456,0.029,0.0089,0.0079,0.0172,0.0179,0.0,0.0,0.0,0.0,0.0,1.2263,1.0999,1.227,1.0996
10,0.1456,0.0043,0.0083,0.0086,0.0061,0.0061,0.0,0.0,0.0,0.0,0.0,1.2285,1.1025,1.2295,1.1023
20,0.1456,0.0134,0.0038,0.0015,0.0169,0.0164,0.0,0.0,0.0,0.0,0.0,1.2323,1.1077,1.233,1.1065


### 3

$$emb = 0.7 * gt + 0.3 * noise$$
$$lr = 0.003$$
$$n_{epochs} = 10$$
$$BatchSize=50$$

In [None]:
df6 = trainer_trial(num_runs, num_neighbors, num_rounds_list, train_dataset, batch_size, num_epochs=10, lr=0.003)

Epoch [10/10], Loss: -0.9765: 100%|██████████| 10/10 [00:00<00:00, 163.96it/s]
Epoch [10/10], Loss: -0.8647: 100%|██████████| 10/10 [00:00<00:00, 77.37it/s]
Epoch [10/10], Loss: -1.2434: 100%|██████████| 10/10 [00:00<00:00, 59.94it/s]
Epoch [10/10], Loss: -1.3606: 100%|██████████| 10/10 [00:00<00:00, 16.78it/s]
Epoch [10/10], Loss: -1.0743: 100%|██████████| 10/10 [00:00<00:00, 38.25it/s]
Epoch [10/10], Loss: -1.7894: 100%|██████████| 10/10 [00:00<00:00, 16.28it/s]
Epoch [10/10], Loss: -4.7742: 100%|██████████| 10/10 [00:01<00:00,  8.01it/s]


In [None]:
df6

Unnamed: 0,policy_rewards,ipw,reg_dm,conv_dm,conv_dr,conv_sndr,ipw_var,reg_dm_var,conv_dm_var,conv_dr_var,conv_sndr_var,action_diff_to_real,action_delta,context_diff_to_real,context_delta
0,0.1814,0.0215,0.0313,0.0228,0.0108,0.0128,0.0,0.0,0.0,0.0,0.0,0.4007,0.0,0.4486,0.0
1,0.1454,0.0562,0.0129,0.0068,0.0443,0.0517,0.0,0.0,0.0,0.0,0.0,1.2389,1.1145,1.2475,1.1167
2,0.1457,0.0578,0.0016,0.0068,0.0473,0.0426,0.0,0.0,0.0,0.0,0.0,1.2544,1.1338,1.26,1.1312
3,0.1458,0.068,0.0103,0.0038,0.0654,0.0632,0.0,0.0,0.0,0.0,0.0,1.2676,1.1515,1.2751,1.1494
4,0.1456,0.029,0.0077,0.0056,0.0039,0.0046,0.0,0.0,0.0,0.0,0.0,1.2919,1.1806,1.2954,1.1743
5,0.1456,0.0399,0.009,0.0094,0.021,0.0223,0.0,0.0,0.0,0.0,0.0,1.3156,1.2095,1.3143,1.1948
10,0.1458,0.0435,0.0051,0.0048,0.0082,0.0077,0.0,0.0,0.0,0.0,0.0,1.4639,1.3886,1.4277,1.3326
20,0.1432,0.0033,0.003,0.0015,0.021,0.0184,0.0,0.0,0.0,0.0,0.0,1.9382,1.9374,1.6665,1.6388


### 4

$$emb = 0.7 * gt + 0.3 * noise$$
$$lr = 0.05$$
$$n_{epochs} = 10$$
$$BatchSize=150$$

In [None]:
df7 = trainer_trial(num_runs, num_neighbors, num_rounds_list[:-3], train_dataset, batch_size+100, num_epochs=10, lr=0.05)

Epoch [10/10], Loss: -1.6573: 100%|██████████| 10/10 [00:00<00:00, 163.72it/s]
Epoch [10/10], Loss: -1.9076: 100%|██████████| 10/10 [00:00<00:00, 68.72it/s]
Epoch [10/10], Loss: -5.8958: 100%|██████████| 10/10 [00:00<00:00, 74.85it/s]
Epoch [10/10], Loss: -7.6978: 100%|██████████| 10/10 [00:00<00:00, 46.63it/s]


In [None]:
df7

Unnamed: 0,policy_rewards,ipw,reg_dm,conv_dm,conv_dr,conv_sndr,ipw_var,reg_dm_var,conv_dm_var,conv_dr_var,conv_sndr_var,action_diff_to_real,action_delta,context_diff_to_real,context_delta
0,0.1814,0.0215,0.0313,0.0228,0.0108,0.0128,0.0,0.0,0.0,0.0,0.0,0.4007,0.0,0.4486,0.0
1,0.1434,0.1078,0.0174,0.0123,0.06,0.0972,0.0,0.0,0.0,0.0,0.0,1.4477,1.3507,1.4613,1.337
2,0.1466,0.1031,0.0007,0.0028,0.0412,0.0332,0.0,0.0,0.0,0.0,0.0,1.7244,1.6647,1.6791,1.5858
3,0.1444,0.0987,0.0024,0.0108,0.0138,0.0129,0.0,0.0,0.0,0.0,0.0,2.0512,2.0279,1.9344,1.8751
4,0.1464,0.0082,0.0066,0.0067,0.0616,0.0964,0.0,0.0,0.0,0.0,0.0,2.5031,2.5001,2.1634,2.1428
