In [1]:
import warnings
warnings.filterwarnings("ignore")
from copy import deepcopy
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd

from tqdm import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim

from sklearn.utils import check_random_state

# implementing OPE of the IPWLearner using synthetic bandit data
from sklearn.linear_model import LogisticRegression

import matplotlib.pyplot as plt

from scipy.special import softmax
from abc import ABCMeta


from obp.ope import (
    RegressionModel,
    DirectMethod as DM,
)

from my_utils import (
    eval_policy,
    generate_dataset,
    create_simluation_data_from_pi,
    get_train_data,
    CFModel,
    CustomCFDataset,
    NeighborhoodModel
)
random_state=12345
random_ = check_random_state(random_state)

In [2]:
def calc_reward(dataset, policy):
    return np.array([np.sum(dataset['q_x_a'] * policy.squeeze(), axis=1).mean()])

In [3]:
pd.options.display.float_format = '{:,.4f}'.format

In [4]:
class IPWPolicyLoss(nn.Module):
    def __init__(self, log_eps=1e-10):
        super(IPWPolicyLoss, self).__init__()
        self.log_eps = log_eps

    def forward(self, pscore, scores, policy_prob, original_policy_rewards, original_policy_actions):
        n = original_policy_actions.shape[0]

        pi_e_at_position = policy_prob[torch.arange(n), original_policy_actions].squeeze()
        iw = pi_e_at_position / pscore
        iw = iw.detach()
        # q_hat_at_position = scores[torch.arange(n), original_policy_actions].squeeze()
        # dm_grads = (scores * policy_prob.detach() * torch.log(policy_prob)).sum(dim=1)
        log_pi = torch.log(pi_e_at_position).squeeze()
        
        # reinforce trick step
        # reinforce_grad = ((iw * (original_policy_rewards - q_hat_at_position) * log_pi) / iw.sum()) + dm_grads
        reinforce_grad = iw * original_policy_rewards * log_pi
        
        return reinforce_grad.mean()

In [5]:
class SNDRPolicyLoss(nn.Module):
    def __init__(self, log_eps=1e-10):
        super(SNDRPolicyLoss, self).__init__()
        self.log_eps = log_eps

    def forward(self, pscore, scores, policy_prob, original_policy_rewards, original_policy_actions):
        n = original_policy_actions.shape[0]

        pi_e_at_position = policy_prob[torch.arange(n), original_policy_actions].squeeze()
        iw = pi_e_at_position / pscore
        iw = iw.detach()
        q_hat_at_position = scores[torch.arange(n), original_policy_actions].squeeze()
        dm_reward = (scores * policy_prob.detach()).sum(dim=1)
        log_pi = torch.log(pi_e_at_position).squeeze()
        
        # reinforce trick step
        r_hat = ((iw * (original_policy_rewards - q_hat_at_position)) / iw.sum()) + dm_reward
        reinforce_grad = r_hat * log_pi
        return reinforce_grad.mean()

In [9]:
# 4. Define the training function
def train(model, train_loader, neighborhood_model, num_epochs=1, lr=0.0001):

    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr) # here we can change the learning rate
    criterion = SNDRPolicyLoss()

    model.train() # Set the model to training mode
    tq = tqdm(range(num_epochs))
    for epoch in tq:
        running_loss = 0.0
        total_samples = 0
        
        for user_idx, action_idx, rewards, original_prob in train_loader:
            # Move data to GPU if available
            if torch.cuda.is_available():
                user_idx = user_idx.to(device) 
                action_idx = action_idx.to(device)
                rewards = rewards.to(device)
                original_prob = original_prob.to(device) 
            
            # Forward pass
            policy = model(user_idx)
            pscore = original_prob[torch.arange(user_idx.shape[0]), action_idx.type(torch.long)]
            
            scores = torch.tensor(neighborhood_model.predict(user_idx.cpu().numpy()))
            
            loss = criterion(
                              pscore,
                              scores,
                              policy, 
                              rewards, 
                              action_idx.type(torch.long), 
                              )
            
            # Zero the gradients Backward pass and optimization
            optimizer.zero_grad()

            loss.backward()                        
            optimizer.step()
            
            # update neighborhood
            # action_emb, context_emb = model.get_params()
            
            # Calculate running loss and accuracy
            running_loss += loss.item()
            total_samples += 1

            # Print statistics after each epoch
            epoch_loss = running_loss / total_samples
            tq.set_description(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}")
            
        # neighborhood_model.update(action_emb.detach().numpy(), context_emb.detach().numpy())


In [10]:
def trainer_trial(
                  num_runs,
                  num_neighbors,
                  num_rounds_list,
                  dataset,
                  batch_size,
                  num_epochs,
                  lr=0.001
                  ):
    dm = DM()
    results = {}

    our_x, our_a = dataset["our_x"], dataset["our_a"]
    emb_x, emb_a = dataset["emb_x"], dataset["emb_a"]
    
    original_x, original_a = dataset["original_x"], dataset["original_a"]
    n_users, n_actions, emb_dim = dataset["n_users"], dataset["n_actions"], dataset["emb_dim"]
    first = True
    zero = True
    for train_size in num_rounds_list:
        reg_results, conv_results = [], []
        for run in range(num_runs):

            pi_0 = np.ones_like(dataset["q_x_a"])/(dataset["n_actions"])
            original_policy_prob = np.expand_dims(pi_0, -1)
            simulation_data = create_simluation_data_from_pi(
                                                            pi_0,
                                                            dataset["q_x_a"],
                                                            dataset["n_users"],
                                                            dataset["n_actions"],
                                                            random_state=train_size*(run+1)
                                                            )
            
            # test_data = get_test_data(dataset, simulation_data, n_test_data)
            
            # idx = np.arange(train_size) + n_test_data
            idx = np.arange(train_size)
            train_data = get_train_data(n_actions, train_size, simulation_data, idx, our_x)
            
            regression_model = RegressionModel(
                                                n_actions=n_actions,
                                                action_context=our_x,
                                                base_model=LogisticRegression(random_state=12345)
                                                )
            
            regression_model.fit(train_data['x'], 
                        train_data['a'],
                        train_data['r'],
                        original_policy_prob[train_data['x_idx'],
                        train_data['a']].squeeze()
                        )

            neighberhoodmodel = NeighborhoodModel(
                                                    train_data['x_idx'],
                                                    train_data['a'], 
                                                    our_a,
                                                    our_x, 
                                                    train_data['r'], 
                                                    num_neighbors=num_neighbors
                                                )
            

            model = CFModel(
                            n_users, 
                            n_actions, 
                            emb_dim, 
                            initial_user_embeddings=torch.tensor(our_x), 
                            initial_actions_embeddings=torch.tensor(our_a)
                            )
            
            cf_dataset =  CustomCFDataset(
                                       train_data['x_idx'], 
                                       train_data['a'], 
                                       train_data['r'], 
                                       original_policy_prob[train_data['x_idx']]
                                       )
            
            train_loader = DataLoader(cf_dataset, batch_size=batch_size, shuffle=False)
            if first:
                policy = np.expand_dims(softmax(our_x @ our_a.T, axis=1), -1)
                conv_results.append(eval_policy(neighberhoodmodel, train_data, original_policy_prob[train_data['x_idx']], policy))
                conv_results[-1] = np.append(calc_reward(dataset, policy), conv_results[-1])
                conv_results[-1] = np.append(conv_results[-1], [np.sqrt(np.mean((emb_a-our_a)**2)), np.sqrt(np.mean((original_a-our_a)**2))])
                conv_results[-1] = np.append(conv_results[-1], [np.sqrt(np.mean((emb_x-our_x)**2)), np.sqrt(np.mean((original_x-our_x)**2))])
                reg_dm = dm.estimate_policy_value(policy[train_data['x_idx']], regression_model.predict(train_data['x']))
                reg_results.append(reg_dm)
                first = False
                reg_results = np.array(reg_results)
                conv_results = np.array(conv_results)
                results[0] = get_opl_results_dict(reg_results, conv_results)
                reg_results, conv_results = [], []
            
            train(model, train_loader, neighberhoodmodel, num_epochs=num_epochs, lr=lr)

            our_a, our_x = model.get_params()
            our_a, our_x = our_a.detach().cpu().numpy(), our_x.detach().cpu().numpy()

            policy = np.expand_dims(softmax(our_x @ our_a.T, axis=1), -1)

            # reg_dm = dm.estimate_policy_value(policy[test_data['x_idx']], regression_model.predict(test_data['x']))
            reg_dm = dm.estimate_policy_value(policy[train_data['x_idx']], regression_model.predict(train_data['x']))

            reg_results.append(reg_dm)

            # conv_results.append(eval_policy(neighberhoodmodel, test_data, original_policy_prob[test_data['x_idx']], policy))
            conv_results.append(eval_policy(neighberhoodmodel, train_data, original_policy_prob[train_data['x_idx']], policy))

            conv_results[-1] = np.append(calc_reward(dataset, policy), conv_results[-1])
            conv_results[-1] = np.append(conv_results[-1], [np.sqrt(np.mean((emb_a-our_a)**2)), np.sqrt(np.mean((original_a-our_a)**2))])

            # temp.append(np.mean((emb_a-our_a)**2, axis=0))

            conv_results[-1] = np.append(conv_results[-1], [np.sqrt(np.mean((emb_x-our_x)**2)), np.sqrt(np.mean((original_x-our_x)**2))])
            
            our_a, our_x = original_a.copy(), original_x.copy()

        reg_results = np.array(reg_results)
        conv_results = np.array(conv_results)

        results[train_size] = get_opl_results_dict(reg_results, conv_results)
    
    return pd.DataFrame.from_dict(results, orient='index')

## Learning

We will run several simulations on a generated dataset, the dataset is generated like this:
$$ \text{We have users U and actions A } u_i \sim N(0, I_{emb_dim}) \ a_i \sim N(0, I_{emb_dim})$$
$$ p_{ij} = 1 / (5 + e^{-(u_i.T a_j)}) $$
$$r_{ij} \sim Bin(p_{ij})$$

We have a policy $\pi$
and it's ground truth reward is calculated by
$$R_{gt} = \sum_{i}{\sum_{j}{\pi_{ij} * p_{ij}}} $$

Our parameters for the dataset will be
$$EmbDim = 5$$
$$NumActions= 150$$
$$NumUsers = 150$$
$$NeighborhoodSize = 6$$

to learn a new policy from $\pi$ we will sample from:
$$\pi_{start} = (1-\epsilon)*\pi + \epsilon * \pi_{random}$$

In [6]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

Using device: cpu


In [None]:
num_runs = 1

In [8]:
def get_opl_results_dict(reg_results, conv_results):
    reward = conv_results[:, 0]
    return    dict(
                policy_rewards=np.mean(reward),
                ipw=np.mean(abs(conv_results[: ,3] - reward)),
                reg_dm=np.mean(abs(reg_results - reward)),
                conv_dm=np.mean(abs(conv_results[: ,1] - reward)),
                conv_dr=np.mean(abs(conv_results[: ,2] - reward)),
                conv_sndr=np.mean(abs(conv_results[: ,4] - reward)),

                ipw_var=np.var(conv_results[: ,3]),
                reg_dm_var=np.var(reg_results),
                conv_dm_var=np.var(conv_results[: ,1]),
                conv_dr_var=np.var(conv_results[: ,2]),
                conv_sndr_var=np.var(conv_results[: ,4]),

                                
                # ipw_p_err=np.mean(abs(conv_results[: ,3] - reward) / reward) * 100,
                # reg_dm_p_err=np.mean(abs(reg_results - reward) / reward) * 100,
                # conv_dm_p_err=np.mean(abs(conv_results[: ,1] - reward) / reward) * 100,
                # conv_dr_p_err=np.mean(abs(conv_results[: ,2] - reward) / reward) * 100,
                # conv_sndr_p_err=np.mean(abs(conv_results[: ,4] - reward) / reward) * 100,
                
                action_diff_to_real=np.mean(conv_results[: ,5]),
                action_delta=np.mean(conv_results[: ,6]),
                context_diff_to_real=np.mean(conv_results[: ,7]),
                context_delta=np.mean(conv_results[: ,8])
                )

In [27]:
dataset_params = dict(
                    n_actions= 150,
                    n_users = 150,
                    emb_dim = 5,
                    # sigma = 0.1,
                    eps = 0.3 # this is the epsilon for the noise in the ground truth policy representation
                    )

train_dataset = generate_dataset(dataset_params)

In [15]:
num_runs = 1
batch_size = 50
num_neighbors = 6
num_rounds_list = [1, 2, 3, 4, 5, 10, 20]

### 1

$$emb = 0.7 * gt + 0.3 * noise$$
$$lr = 0.005$$
$$n_{epochs} = 1$$
$$BatchSize=50$$

In [16]:
df4 = trainer_trial(num_runs, num_neighbors, num_rounds_list, train_dataset, batch_size, num_epochs=1, lr=0.005)

Epoch [1/1], Loss: -0.9694: 100%|██████████| 1/1 [00:00<00:00,  7.99it/s]
Epoch [1/1], Loss: -1.1042: 100%|██████████| 1/1 [00:00<00:00, 34.57it/s]
Epoch [1/1], Loss: -1.0640: 100%|██████████| 1/1 [00:00<00:00, 20.51it/s]
Epoch [1/1], Loss: -0.8721: 100%|██████████| 1/1 [00:00<00:00, 16.93it/s]
Epoch [1/1], Loss: -0.9946: 100%|██████████| 1/1 [00:00<00:00, 10.74it/s]
Epoch [1/1], Loss: -1.0438: 100%|██████████| 1/1 [00:00<00:00,  6.43it/s]
Epoch [1/1], Loss: -1.1014: 100%|██████████| 1/1 [00:00<00:00,  2.77it/s]


In [17]:
df4[['policy_rewards', 'ipw', 'reg_dm', 'conv_dm', 'conv_dr', 'conv_sndr', 'action_diff_to_real', 'action_delta', 'context_diff_to_real', 'context_delta']]

Unnamed: 0,policy_rewards,ipw,reg_dm,conv_dm,conv_dr,conv_sndr,action_diff_to_real,action_delta,context_diff_to_real,context_delta
0,0.1815,0.0035,0.0541,0.0128,0.0317,0.0333,0.3386,0.0,0.5364,0.0
1,0.1452,0.0443,0.0161,0.0032,0.0211,0.0221,1.2613,1.0678,1.1922,1.0657
2,0.1453,0.0118,0.0233,0.0242,0.0361,0.0302,1.2626,1.0696,1.1944,1.0672
3,0.1453,0.0018,0.0032,0.0133,0.0149,0.0147,1.2653,1.0727,1.1965,1.069
4,0.1453,0.0008,0.005,0.0104,0.002,0.0029,1.2672,1.075,1.2,1.0725
5,0.1452,0.0513,0.0068,0.0087,0.0509,0.0529,1.2689,1.0768,1.1997,1.072
10,0.1452,0.0229,0.0085,0.0065,0.0008,0.0006,1.2821,1.0921,1.2127,1.0857
20,0.1452,0.0098,0.0005,0.0034,0.0127,0.0124,1.3096,1.1244,1.2361,1.1148


### 2

$$emb = 0.7 * gt + 0.3 * noise$$
$$lr = 0.001$$
$$n_{epochs} = 1$$
$$BatchSize=50$$

In [20]:
df5 = trainer_trial(num_runs, num_neighbors, num_rounds_list, train_dataset, batch_size, num_epochs=1, lr=0.001)

Epoch [1/1], Loss: -0.9690: 100%|██████████| 1/1 [00:00<00:00, 57.10it/s]
Epoch [1/1], Loss: -1.1012: 100%|██████████| 1/1 [00:00<00:00, 29.97it/s]
Epoch [1/1], Loss: -1.0617: 100%|██████████| 1/1 [00:00<00:00, 22.95it/s]
Epoch [1/1], Loss: -0.8681: 100%|██████████| 1/1 [00:00<00:00, 17.00it/s]
Epoch [1/1], Loss: -0.9873: 100%|██████████| 1/1 [00:00<00:00, 11.62it/s]
Epoch [1/1], Loss: -1.0310: 100%|██████████| 1/1 [00:00<00:00,  5.76it/s]
Epoch [1/1], Loss: -1.0543: 100%|██████████| 1/1 [00:00<00:00,  2.93it/s]


In [21]:
df5

Unnamed: 0,policy_rewards,ipw,reg_dm,conv_dm,conv_dr,conv_sndr,ipw_var,reg_dm_var,conv_dm_var,conv_dr_var,conv_sndr_var,action_diff_to_real,action_delta,context_diff_to_real,context_delta
0,0.1815,0.0035,0.0541,0.0128,0.0317,0.0333,0.0,0.0,0.0,0.0,0.0,0.3386,0.0,0.5364,0.0
1,0.1453,0.0442,0.016,0.0031,0.021,0.0218,0.0,0.0,0.0,0.0,0.0,1.2599,1.0661,1.1906,1.0657
2,0.1453,0.0123,0.0235,0.0241,0.037,0.0309,0.0,0.0,0.0,0.0,0.0,1.2601,1.0665,1.1911,1.066
3,0.1453,0.0027,0.0033,0.0132,0.0154,0.0151,0.0,0.0,0.0,0.0,0.0,1.2606,1.0671,1.1914,1.0663
4,0.1453,0.0002,0.005,0.0102,0.0026,0.0034,0.0,0.0,0.0,0.0,0.0,1.261,1.0675,1.1921,1.067
5,0.1453,0.0503,0.0066,0.0086,0.05,0.0517,0.0,0.0,0.0,0.0,0.0,1.2613,1.0679,1.1921,1.0669
10,0.1453,0.0201,0.0082,0.0064,0.0007,0.0006,0.0,0.0,0.0,0.0,0.0,1.2638,1.0708,1.1945,1.0694
20,0.1453,0.0089,0.0006,0.0035,0.0114,0.0112,0.0,0.0,0.0,0.0,0.0,1.2685,1.0763,1.1984,1.0743


### 3

$$emb = 0.7 * gt + 0.3 * noise$$
$$lr = 0.003$$
$$n_{epochs} = 10$$
$$BatchSize=50$$

In [22]:
df6 = trainer_trial(num_runs, num_neighbors, num_rounds_list, train_dataset, batch_size, num_epochs=10, lr=0.003)

Epoch [10/10], Loss: -1.0324: 100%|██████████| 10/10 [00:00<00:00, 59.68it/s]
Epoch [10/10], Loss: -1.2277: 100%|██████████| 10/10 [00:00<00:00, 30.11it/s]
Epoch [10/10], Loss: -1.2147: 100%|██████████| 10/10 [00:00<00:00, 21.02it/s]
Epoch [10/10], Loss: -1.0216: 100%|██████████| 10/10 [00:00<00:00, 15.69it/s]
Epoch [10/10], Loss: -1.2476: 100%|██████████| 10/10 [00:00<00:00, 10.79it/s]
Epoch [10/10], Loss: -1.5742: 100%|██████████| 10/10 [00:01<00:00,  5.90it/s]
Epoch [10/10], Loss: -5.8588: 100%|██████████| 10/10 [00:03<00:00,  2.89it/s]


In [23]:
df6

Unnamed: 0,policy_rewards,ipw,reg_dm,conv_dm,conv_dr,conv_sndr,ipw_var,reg_dm_var,conv_dm_var,conv_dr_var,conv_sndr_var,action_diff_to_real,action_delta,context_diff_to_real,context_delta
0,0.1815,0.0035,0.0541,0.0128,0.0317,0.0333,0.0,0.0,0.0,0.0,0.0,0.3386,0.0,0.5364,0.0
1,0.1451,0.0462,0.0162,0.0028,0.0221,0.024,0.0,0.0,0.0,0.0,0.0,1.273,1.0816,1.2032,1.0708
2,0.1453,0.0158,0.0232,0.0246,0.0427,0.0336,0.0,0.0,0.0,0.0,0.0,1.2871,1.0995,1.2186,1.0893
3,0.1453,0.0045,0.0036,0.0137,0.0115,0.0118,0.0,0.0,0.0,0.0,0.0,1.3095,1.1251,1.239,1.1124
4,0.1453,0.0006,0.0053,0.0117,0.001,0.0004,0.0,0.0,0.0,0.0,0.0,1.3261,1.1443,1.26,1.1368
5,0.1449,0.0336,0.0069,0.0085,0.0356,0.0389,0.0,0.0,0.0,0.0,0.0,1.3455,1.1664,1.2692,1.15
10,0.1448,0.0237,0.0039,0.0036,0.0004,0.0007,0.0,0.0,0.0,0.0,0.0,1.4982,1.3404,1.3652,1.2808
20,0.1476,0.0105,0.0151,0.0032,0.0068,0.0062,0.0,0.0,0.0,0.0,0.0,2.0346,1.9255,1.6285,1.644


### 4

$$emb = 0.7 * gt + 0.3 * noise$$
$$lr = 0.05$$
$$n_{epochs} = 10$$
$$BatchSize=150$$

In [24]:
df7 = trainer_trial(num_runs, num_neighbors, num_rounds_list[:-3], train_dataset, batch_size+100, num_epochs=10, lr=0.05)

Epoch [10/10], Loss: -1.7198: 100%|██████████| 10/10 [00:00<00:00, 88.73it/s]
Epoch [10/10], Loss: -3.0079: 100%|██████████| 10/10 [00:00<00:00, 46.00it/s]
Epoch [10/10], Loss: -4.7455: 100%|██████████| 10/10 [00:00<00:00, 32.24it/s]
Epoch [10/10], Loss: -5.4046: 100%|██████████| 10/10 [00:00<00:00, 23.10it/s]


In [25]:
df7

Unnamed: 0,policy_rewards,ipw,reg_dm,conv_dm,conv_dr,conv_sndr,ipw_var,reg_dm_var,conv_dm_var,conv_dr_var,conv_sndr_var,action_diff_to_real,action_delta,context_diff_to_real,context_delta
0,0.1815,0.0035,0.0541,0.0128,0.0317,0.0333,0.0,0.0,0.0,0.0,0.0,0.3386,0.0,0.5364,0.0
1,0.143,0.0497,0.0142,0.0009,0.0148,0.0294,0.0,0.0,0.0,0.0,0.0,1.4746,1.3141,1.3637,1.2353
2,0.1449,0.101,0.0281,0.0187,0.1998,0.1172,0.0,0.0,0.0,0.0,0.0,1.7363,1.6121,1.594,1.5158
3,0.145,0.1332,0.0102,0.0072,0.1406,0.116,0.0,0.0,0.0,0.0,0.0,2.1355,2.0325,1.8826,1.8401
4,0.1486,0.0274,0.0123,0.0127,0.0727,0.1267,0.0,0.0,0.0,0.0,0.0,2.5199,2.433,2.178,2.1835
