In [13]:
from sklearn.ensemble import RandomForestClassifier as RandomForest
from sklearn.linear_model import LogisticRegression
import sys
import numpy as np
import torch
#sys.path.append ('/Users/tanvikapoor/zr-obp')
#print(sys.path)

# import open bandit pipeline (obp)
import obp
from obp.dataset import (
    SyntheticBanditDataset,
    logistic_reward_function,
    linear_reward_function
)
from obp.policy import (
    IPWLearner, 
    QLearner,
    NNPolicyLearner, 
    Random
)

import loss_translation

## (1) Generating Synthetic Data
`obp.dataset.SyntheticBanditDataset` is an easy-to-use synthetic data generator.

It takes 
- number of actions (`n_actions`, $|\mathcal{A}|$)
- dimension of context vectors (`dim_context`, $d$)
- reward function (`reward_function`, $q(x,a)=\mathbb{E}[r|x,a]$)

as inputs and generates synthetic logged bandit data that can be used to evaluate the performance of decision making policies (obtained by `off-policy learning`).

In [14]:
# generate synthetic logged bandit data with 10 actions
# we use `logistic function` as the reward function and control the behavior policy with `beta`
# one can define their own reward function and behavior policy function such as nonlinear ones. 
dataset = SyntheticBanditDataset(
    n_actions=10,
    dim_context=5,
    beta=-2, # inverse temperature parameter to control the optimality and entropy of the behavior policy
    reward_type="binary", # "binary" or "continuous"
    reward_function=logistic_reward_function,
    random_state=12345,
)

In [15]:
# obtain training and test sets of synthetic logged bandit data
n_rounds_train, n_rounds_test = 10000, 10000
bandit_feedback_train = dataset.obtain_batch_bandit_feedback(n_rounds=int(0.8*n_rounds_train))
bandit_feedback_val = dataset.obtain_batch_bandit_feedback(n_rounds=int(0.2*n_rounds_train))
bandit_feedback_test = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds_test)

## (2) Off-Policy Learning
After generating synthetic data, we now train some decision making policies.

To train policies on logged bandit data, we use

- `obp.policy.NNPolicyLearner` (Neural Network Policy Learner)
- `obp.policy.IPWLearner`

For `NN Learner`, we use 
- Direct Method ("dm")
- InverseProbabilityWeighting ("ipw")
- DoublyRobust ("dr") 

as its objective functions (`off_policy_objective`). 

For `IPW Learner`, we use `RandomForestClassifier` and *LogisticRegression* implemented in scikit-learn for base ML methods.

A policy is trained by maximizing an OPE estimator as an objective function as follows.

$$ \hat{\pi} \in \arg \max_{\pi \in \Pi} \hat{V} (\pi; \mathcal{D}_{tr}) - \lambda \cdot \Omega (\pi)  $$

where $\hat{V}(\cdot; \mathcal{D})$ is an off-policy objective and $\mathcal{D}_{tr}$ is a training bandit dataset. $\Omega (\cdot)$ is a regularization term.

In [16]:
max_iters = [200, 400, 600]
random_states = [12345, 54321, 11111]

with open("evaluation.txt", "a") as o:

  for m in max_iters:
    for r in random_states:

        print(dataset)
        
        den_l = loss_translation.binarySearch (m, r, 0, 0.0001, dataset, bandit_feedback_train, 0, 100, 0.01, 1, 0.05)
        
        def black_box_function(l):
        #l: Hyperparameter to optimize for, which is l
          f = loss_translation.predict_value_ratio_train (m, r, 0, 0.0001, dataset, l, bandit_feedback_train)
          return f
          
        pbounds = {"l": [0, 1]}
        init_pnts = 15
        num_iter = 20
        (ratio_train_l, ratio_train_val) = loss_translation.bayesOpt(black_box_function, pbounds, init_pnts, num_iter)

        def black_box_function(l):
        # l: Hyperparameter to optimize for, which is l
          f = loss_translation.predict_value_ratio_val (m, r , 0, 0.0001, dataset, l, bandit_feedback_train, bandit_feedback_val)
          return f

        pbounds = {"l": [0, 1]}
        init_pnts = 15
        num_iter = 20
        (ratio_val_l, ratio_val_val) = loss_translation.bayesOpt(black_box_function, pbounds, init_pnts, num_iter)

        # Policy with no loss
        nn_ipw = loss_translation.generateModel(dataset, m,0,r, 0, 0.0001)

        nn_ipw.fit(
              context=bandit_feedback_train["context"],
              action=bandit_feedback_train["action"],
              reward=bandit_feedback_train["reward"],
              pscore=bandit_feedback_train["pscore"],
            )

        action_dist_nn_ipw_no_loss = nn_ipw.predict_proba(
              context=bandit_feedback_test["context"]
            )

      # Policy with denominator = 1 for training data
        nn_ipw = loss_translation.generateModel(dataset, m,den_l,r, 0, 0.0001)

        nn_ipw.fit(
              context=bandit_feedback_train["context"],
              action=bandit_feedback_train["action"],
              reward=bandit_feedback_train["reward"],
              pscore=bandit_feedback_train["pscore"],
            )

        action_dist_nn_ipw_den_1 = nn_ipw.predict_proba(
              context=bandit_feedback_test["context"]
            )

      # Policy with max ratio for training data
        nn_ipw = loss_translation.generateModel(dataset, m,ratio_train_l ["l"],r, 0, 0.0001)

        nn_ipw.fit(
              context=bandit_feedback_train["context"],
              action=bandit_feedback_train["action"],
              reward=bandit_feedback_train["reward"],
              pscore=bandit_feedback_train["pscore"],
            )

        action_dist_nn_ipw_max_train_ratio = nn_ipw.predict_proba(
              context=bandit_feedback_test["context"]
          )

      # Policy with max ratio for validation data
        nn_ipw = loss_translation.generateModel(dataset, m,ratio_val_l ["l"],r, 0, 0.0001)

        nn_ipw.fit(
              context=bandit_feedback_train["context"],
              action=bandit_feedback_train["action"],
              reward=bandit_feedback_train["reward"],
              pscore=bandit_feedback_train["pscore"],
            )

        action_dist_nn_ipw_max_val_ratio = nn_ipw.predict_proba(
              context=bandit_feedback_test["context"]
          )

        policy_names = [
        "NN Policy Learner with no loss",
        "NN Policy Learner with training denominator = 1",
        "NN Policy Learner with max training ratio",
        "NN Policy Learner with max validation ratio"
          ]
        action_dist_list = [
          action_dist_nn_ipw_no_loss,
          action_dist_nn_ipw_den_1,
          action_dist_nn_ipw_max_train_ratio,
          action_dist_nn_ipw_max_val_ratio
          ]

        with open("evaluation.txt", "a") as o:

          o.write("With param values max_iter: " + str(m) +" and random state: " + str(r) + "\n")

          for name, action_dist in zip(policy_names, action_dist_list):
            true_policy_value = dataset.calc_ground_truth_policy_value(
            expected_reward=bandit_feedback_test["expected_reward"],
            action_dist=action_dist,
            )
            o.write(f'policy value of {name}: {true_policy_value}')

o.close()

policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

SyntheticBanditDataset(n_actions=10, dim_context=5, reward_type='binary', reward_function=<function logistic_reward_function at 0x7fdd524998b0>, reward_std=1.0, action_context=array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1]]), behavior_policy_function=None, beta=-2, n_deficient_actions=0, random_state=12345, dataset_name='synthetic_bandit_dataset')
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:44<00:00,  4.51it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

0.771697911507026
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:43<00:00,  4.60it/s]
policy learning:   0%|          | 1/200 [00:00<00:38,  5.13it/s]

0.7704121494847396
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:38<00:00,  5.15it/s]
policy learning:   0%|          | 1/200 [00:00<00:37,  5.24it/s]

0.7677255932108342
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:38<00:00,  5.21it/s]
policy learning:   0%|          | 1/200 [00:00<00:38,  5.11it/s]

0.7714383906473012
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:38<00:00,  5.17it/s]
policy learning:   0%|          | 1/200 [00:00<00:39,  5.07it/s]

0.7719953233253929
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:45<00:00,  4.42it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

0.8409593970288217
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:40<00:00,  4.98it/s]
policy learning:   0%|          | 1/200 [00:00<00:38,  5.17it/s]

0.9745351492043558
|   iter    |  target   |     l     |
-------------------------------------
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:36<00:00,  5.41it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 1       [0m | [0m 0.8492  [0m | [0m 0.967   [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:37<00:00,  5.30it/s]
policy learning:   0%|          | 1/200 [00:00<00:37,  5.33it/s]

| [0m 2       [0m | [0m 0.8395  [0m | [0m 0.5472  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:35<00:00,  5.57it/s]
policy learning:   0%|          | 1/200 [00:00<00:38,  5.22it/s]

| [0m 3       [0m | [0m 0.849   [0m | [0m 0.9727  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:35<00:00,  5.63it/s]
policy learning:   0%|          | 1/200 [00:00<00:36,  5.49it/s]

| [95m 4       [0m | [95m 0.8526  [0m | [95m 0.7148  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:35<00:00,  5.56it/s]
policy learning:   0%|          | 1/200 [00:00<00:37,  5.35it/s]

| [0m 5       [0m | [0m 0.8517  [0m | [0m 0.6977  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:34<00:00,  5.73it/s]
policy learning:   0%|          | 1/200 [00:00<00:36,  5.46it/s]

| [0m 6       [0m | [0m 0.8115  [0m | [0m 0.2161  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:35<00:00,  5.61it/s]
policy learning:   0%|          | 1/200 [00:00<00:37,  5.26it/s]

| [0m 7       [0m | [0m 0.8488  [0m | [0m 0.9763  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:36<00:00,  5.49it/s]
policy learning:   0%|          | 1/200 [00:00<00:37,  5.34it/s]

| [0m 8       [0m | [0m 0.7963  [0m | [0m 0.00623 [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:35<00:00,  5.71it/s]
policy learning:   0%|          | 1/200 [00:00<00:37,  5.36it/s]

| [0m 9       [0m | [0m 0.8146  [0m | [0m 0.253   [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:34<00:00,  5.76it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 10      [0m | [0m 0.8324  [0m | [0m 0.4348  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:35<00:00,  5.70it/s]
policy learning:   0%|          | 1/200 [00:00<00:36,  5.52it/s]

| [95m 11      [0m | [95m 0.854   [0m | [95m 0.7794  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:35<00:00,  5.65it/s]
policy learning:   0%|          | 1/200 [00:00<00:37,  5.25it/s]

| [0m 12      [0m | [0m 0.8101  [0m | [0m 0.1977  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:37<00:00,  5.39it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 13      [0m | [0m 0.852   [0m | [0m 0.863   [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:35<00:00,  5.65it/s]
policy learning:   0%|          | 1/200 [00:00<00:39,  5.10it/s]

| [0m 14      [0m | [0m 0.8485  [0m | [0m 0.9834  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:35<00:00,  5.70it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 15      [0m | [0m 0.8075  [0m | [0m 0.1638  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:36<00:00,  5.53it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 16      [0m | [0m 0.8526  [0m | [0m 0.7148  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:35<00:00,  5.71it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [95m 17      [0m | [95m 0.854   [0m | [95m 0.767   [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:34<00:00,  5.74it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 18      [0m | [0m 0.854   [0m | [0m 0.7691  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:35<00:00,  5.56it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 19      [0m | [0m 0.854   [0m | [0m 0.7705  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:35<00:00,  5.61it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [95m 20      [0m | [95m 0.854   [0m | [95m 0.7709  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:34<00:00,  5.74it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [95m 21      [0m | [95m 0.854   [0m | [95m 0.771   [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:34<00:00,  5.74it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 22      [0m | [0m 0.854   [0m | [0m 0.772   [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:38<00:00,  5.21it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 23      [0m | [0m 0.854   [0m | [0m 0.7692  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:36<00:00,  5.44it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [95m 24      [0m | [95m 0.854   [0m | [95m 0.7743  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:34<00:00,  5.80it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 25      [0m | [0m 0.854   [0m | [0m 0.7745  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:35<00:00,  5.62it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 26      [0m | [0m 0.854   [0m | [0m 0.7746  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:36<00:00,  5.46it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [95m 27      [0m | [95m 0.854   [0m | [95m 0.7743  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:35<00:00,  5.64it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 28      [0m | [0m 0.854   [0m | [0m 0.7744  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:34<00:00,  5.75it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 29      [0m | [0m 0.854   [0m | [0m 0.7706  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:35<00:00,  5.61it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 30      [0m | [0m 0.854   [0m | [0m 0.7751  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:35<00:00,  5.68it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 31      [0m | [0m 0.854   [0m | [0m 0.7746  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:35<00:00,  5.66it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 32      [0m | [0m 0.854   [0m | [0m 0.7709  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:36<00:00,  5.49it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 33      [0m | [0m 0.854   [0m | [0m 0.7746  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:37<00:00,  5.35it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 34      [0m | [0m 0.854   [0m | [0m 0.7714  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:35<00:00,  5.63it/s]
policy learning:   0%|          | 1/200 [00:00<00:34,  5.72it/s]

| [0m 35      [0m | [0m 0.854   [0m | [0m 0.7747  [0m |
Best result: {'l': 0.7742531182900111}; f(x) = 0.8540112182111371.
|   iter    |  target   |     l     |
-------------------------------------
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:35<00:00,  5.71it/s]
policy learning:   0%|          | 1/200 [00:00<00:35,  5.53it/s]

| [0m 1       [0m | [0m 0.8288  [0m | [0m 0.967   [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [02:51<00:00,  1.16it/s]
policy learning:   0%|          | 1/200 [00:00<00:34,  5.79it/s]

| [0m 2       [0m | [0m 0.8156  [0m | [0m 0.5472  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [09:22<00:00,  2.81s/it]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [95m 3       [0m | [95m 0.8293  [0m | [95m 0.9727  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:36<00:00,  5.44it/s]
policy learning:   0%|          | 1/200 [00:00<00:35,  5.55it/s]

| [0m 4       [0m | [0m 0.817   [0m | [0m 0.7148  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:34<00:00,  5.80it/s]
policy learning:   0%|          | 1/200 [00:00<00:35,  5.54it/s]

| [0m 5       [0m | [0m 0.8175  [0m | [0m 0.6977  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:34<00:00,  5.80it/s]
policy learning:   0%|          | 1/200 [00:00<00:37,  5.30it/s]

| [0m 6       [0m | [0m 0.7706  [0m | [0m 0.2161  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:34<00:00,  5.79it/s]
policy learning:   0%|          | 1/200 [00:00<00:37,  5.30it/s]

| [95m 7       [0m | [95m 0.8295  [0m | [95m 0.9763  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:34<00:00,  5.73it/s]
policy learning:   0%|          | 1/200 [00:00<00:38,  5.17it/s]

| [0m 8       [0m | [0m 0.7644  [0m | [0m 0.00623 [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:34<00:00,  5.73it/s]
policy learning:   0%|          | 1/200 [00:00<00:33,  5.89it/s]

| [0m 9       [0m | [0m 0.7722  [0m | [0m 0.253   [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:40<00:00,  4.95it/s]
policy learning:   0%|          | 1/200 [00:00<00:37,  5.31it/s]

| [0m 10      [0m | [0m 0.8019  [0m | [0m 0.4348  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:37<00:00,  5.34it/s]
policy learning:   0%|          | 1/200 [00:00<00:37,  5.32it/s]

| [0m 11      [0m | [0m 0.8177  [0m | [0m 0.7794  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:35<00:00,  5.64it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 12      [0m | [0m 0.7697  [0m | [0m 0.1977  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:40<00:00,  4.89it/s]
policy learning:   0%|          | 1/200 [00:00<00:38,  5.15it/s]

| [0m 13      [0m | [0m 0.8192  [0m | [0m 0.863   [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:38<00:00,  5.24it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [95m 14      [0m | [95m 0.83    [0m | [95m 0.9834  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:38<00:00,  5.17it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 15      [0m | [0m 0.7684  [0m | [0m 0.1638  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:37<00:00,  5.29it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 16      [0m | [0m 0.8295  [0m | [0m 0.9763  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:36<00:00,  5.50it/s]


| [95m 17      [0m | [95m 0.831   [0m | [95m 1.0     [0m |
| [0m 18      [0m | [0m 0.831   [0m | [0m 1.0     [0m |
| [0m 19      [0m | [0m 0.831   [0m | [0m 1.0     [0m |
| [0m 20      [0m | [0m 0.831   [0m | [0m 1.0     [0m |


policy learning:   0%|          | 1/200 [00:00<00:36,  5.46it/s]

[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:37<00:00,  5.37it/s]


| [0m 21      [0m | [0m 0.831   [0m | [0m 1.0     [0m |
| [0m 22      [0m | [0m 0.831   [0m | [0m 1.0     [0m |
| [0m 23      [0m | [0m 0.831   [0m | [0m 1.0     [0m |
| [0m 24      [0m | [0m 0.831   [0m | [0m 1.0     [0m |
| [0m 25      [0m | [0m 0.831   [0m | [0m 1.0     [0m |
| [0m 26      [0m | [0m 0.831   [0m | [0m 1.0     [0m |
| [0m 27      [0m | [0m 0.831   [0m | [0m 1.0     [0m |
| [0m 28      [0m | [0m 0.831   [0m | [0m 1.0     [0m |
| [0m 29      [0m | [0m 0.831   [0m | [0m 1.0     [0m |
| [0m 30      [0m | [0m 0.831   [0m | [0m 1.0     [0m |
| [0m 31      [0m | [0m 0.831   [0m | [0m 1.0     [0m |
| [0m 32      [0m | [0m 0.831   [0m | [0m 1.0     [0m |
| [0m 33      [0m | [0m 0.831   [0m | [0m 1.0     [0m |


policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 34      [0m | [0m 0.831   [0m | [0m 1.0     [0m |
| [0m 35      [0m | [0m 0.831   [0m | [0m 1.0     [0m |
Best result: {'l': 1.0}; f(x) = 0.8309898267969767.
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:36<00:00,  5.43it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:36<00:00,  5.45it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:38<00:00,  5.19it/s]
policy learning:   0%|          | 1/200 [00:00<00:38,  5.11it/s]

[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:38<00:00,  5.24it/s]
policy learning:   0%|          | 1/200 [00:00<00:38,  5.10it/s]

SyntheticBanditDataset(n_actions=10, dim_context=5, reward_type='binary', reward_function=<function logistic_reward_function at 0x7fdd524998b0>, reward_std=1.0, action_context=array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1]]), behavior_policy_function=None, beta=-2, n_deficient_actions=0, random_state=12345, dataset_name='synthetic_bandit_dataset')
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:37<00:00,  5.30it/s]
policy learning:   0%|          | 1/200 [00:00<00:37,  5.36it/s]

0.7501746070542116
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:36<00:00,  5.44it/s]
policy learning:   0%|          | 1/200 [00:00<00:38,  5.16it/s]

0.7487915583230071
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:38<00:00,  5.20it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

0.7458433840249197
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:37<00:00,  5.39it/s]
policy learning:   0%|          | 1/200 [00:00<00:38,  5.14it/s]

0.7448337764597577
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:37<00:00,  5.38it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

0.7551473756911044
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:36<00:00,  5.42it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

0.8300149010916543
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:35<00:00,  5.58it/s]
policy learning:   0%|          | 1/200 [00:00<00:36,  5.51it/s]

0.9857803229488626
|   iter    |  target   |     l     |
-------------------------------------
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:35<00:00,  5.71it/s]
policy learning:   0%|          | 1/200 [00:00<00:38,  5.17it/s]

| [0m 1       [0m | [0m 0.8489  [0m | [0m 0.967   [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:36<00:00,  5.42it/s]
policy learning:   0%|          | 1/200 [00:00<00:37,  5.38it/s]

| [0m 2       [0m | [0m 0.8423  [0m | [0m 0.5472  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:35<00:00,  5.66it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 3       [0m | [0m 0.8483  [0m | [0m 0.9727  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:35<00:00,  5.61it/s]
policy learning:   0%|          | 1/200 [00:00<00:36,  5.45it/s]

| [95m 4       [0m | [95m 0.8546  [0m | [95m 0.7148  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:35<00:00,  5.66it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 5       [0m | [0m 0.8534  [0m | [0m 0.6977  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:36<00:00,  5.55it/s]
policy learning:   0%|          | 1/200 [00:00<00:38,  5.18it/s]

| [0m 6       [0m | [0m 0.8161  [0m | [0m 0.2161  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:35<00:00,  5.71it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 7       [0m | [0m 0.848   [0m | [0m 0.9763  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:37<00:00,  5.33it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 8       [0m | [0m 0.7991  [0m | [0m 0.00623 [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:41<00:00,  4.77it/s]
policy learning:   0%|          | 1/200 [00:00<00:38,  5.19it/s]

| [0m 9       [0m | [0m 0.819   [0m | [0m 0.253   [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:35<00:00,  5.61it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 10      [0m | [0m 0.8355  [0m | [0m 0.4348  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:37<00:00,  5.28it/s]
policy learning:   0%|          | 1/200 [00:00<00:38,  5.18it/s]

| [95m 11      [0m | [95m 0.8577  [0m | [95m 0.7794  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:40<00:00,  4.98it/s]
policy learning:   0%|          | 1/200 [00:00<00:37,  5.37it/s]

| [0m 12      [0m | [0m 0.8149  [0m | [0m 0.1977  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:36<00:00,  5.49it/s]
policy learning:   0%|          | 1/200 [00:00<00:37,  5.33it/s]

| [95m 13      [0m | [95m 0.8586  [0m | [95m 0.863   [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:36<00:00,  5.43it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 14      [0m | [0m 0.8472  [0m | [0m 0.9834  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:35<00:00,  5.58it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 15      [0m | [0m 0.8126  [0m | [0m 0.1638  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:35<00:00,  5.69it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 16      [0m | [0m 0.8546  [0m | [0m 0.7148  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:41<00:00,  4.80it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [95m 17      [0m | [95m 0.8591  [0m | [95m 0.8315  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:35<00:00,  5.57it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 18      [0m | [0m 0.859   [0m | [0m 0.8404  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:34<00:00,  5.73it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 19      [0m | [0m 0.859   [0m | [0m 0.8281  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:39<00:00,  5.10it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 20      [0m | [0m 0.859   [0m | [0m 0.834   [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:37<00:00,  5.37it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 21      [0m | [0m 0.859   [0m | [0m 0.8328  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:37<00:00,  5.36it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 22      [0m | [0m 0.859   [0m | [0m 0.8303  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:36<00:00,  5.49it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 23      [0m | [0m 0.859   [0m | [0m 0.8352  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:34<00:00,  5.72it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 24      [0m | [0m 0.859   [0m | [0m 0.8278  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:41<00:00,  4.79it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 25      [0m | [0m 0.859   [0m | [0m 0.8281  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:37<00:00,  5.40it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 26      [0m | [0m 0.859   [0m | [0m 0.8323  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:36<00:00,  5.42it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 27      [0m | [0m 0.859   [0m | [0m 0.8291  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:38<00:00,  5.24it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 28      [0m | [0m 0.859   [0m | [0m 0.8329  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:37<00:00,  5.34it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 29      [0m | [0m 0.859   [0m | [0m 0.8327  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:35<00:00,  5.62it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 30      [0m | [0m 0.859   [0m | [0m 0.8289  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:35<00:00,  5.63it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 31      [0m | [0m 0.859   [0m | [0m 0.833   [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:35<00:00,  5.67it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 32      [0m | [0m 0.859   [0m | [0m 0.8327  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:35<00:00,  5.70it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 33      [0m | [0m 0.859   [0m | [0m 0.829   [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:36<00:00,  5.54it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 34      [0m | [0m 0.859   [0m | [0m 0.833   [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:35<00:00,  5.67it/s]
policy learning:   0%|          | 1/200 [00:00<00:37,  5.24it/s]

| [0m 35      [0m | [0m 0.859   [0m | [0m 0.8328  [0m |
Best result: {'l': 0.8314872756843112}; f(x) = 0.8590509404120681.
|   iter    |  target   |     l     |
-------------------------------------
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:35<00:00,  5.63it/s]
policy learning:   0%|          | 1/200 [00:00<00:37,  5.32it/s]

| [0m 1       [0m | [0m 0.8173  [0m | [0m 0.967   [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:35<00:00,  5.70it/s]
policy learning:   0%|          | 1/200 [00:00<00:36,  5.38it/s]

| [0m 2       [0m | [0m 0.8128  [0m | [0m 0.5472  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:36<00:00,  5.49it/s]
policy learning:   0%|          | 1/200 [00:00<00:38,  5.15it/s]

| [0m 3       [0m | [0m 0.8171  [0m | [0m 0.9727  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:35<00:00,  5.60it/s]
policy learning:   0%|          | 1/200 [00:00<00:38,  5.20it/s]

| [95m 4       [0m | [95m 0.8176  [0m | [95m 0.7148  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:35<00:00,  5.71it/s]
policy learning:   0%|          | 1/200 [00:00<00:38,  5.17it/s]

| [0m 5       [0m | [0m 0.8159  [0m | [0m 0.6977  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:35<00:00,  5.66it/s]
policy learning:   0%|          | 1/200 [00:00<00:39,  5.08it/s]

| [0m 6       [0m | [0m 0.7786  [0m | [0m 0.2161  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:36<00:00,  5.49it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 7       [0m | [0m 0.817   [0m | [0m 0.9763  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:39<00:00,  5.05it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 8       [0m | [0m 0.7761  [0m | [0m 0.00623 [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:39<00:00,  5.02it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 9       [0m | [0m 0.7803  [0m | [0m 0.253   [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:38<00:00,  5.26it/s]
policy learning:   0%|          | 1/200 [00:00<00:38,  5.13it/s]

| [0m 10      [0m | [0m 0.803   [0m | [0m 0.4348  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:36<00:00,  5.52it/s]
policy learning:   0%|          | 1/200 [00:00<00:36,  5.41it/s]

| [95m 11      [0m | [95m 0.823   [0m | [95m 0.7794  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:36<00:00,  5.52it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 12      [0m | [0m 0.7781  [0m | [0m 0.1977  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:35<00:00,  5.60it/s]
policy learning:   0%|          | 1/200 [00:00<00:38,  5.15it/s]

| [95m 13      [0m | [95m 0.8254  [0m | [95m 0.863   [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:36<00:00,  5.54it/s]
policy learning:   0%|          | 1/200 [00:00<00:37,  5.31it/s]

| [0m 14      [0m | [0m 0.8168  [0m | [0m 0.9834  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:36<00:00,  5.55it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 15      [0m | [0m 0.7774  [0m | [0m 0.1638  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:35<00:00,  5.68it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 16      [0m | [0m 0.8176  [0m | [0m 0.7148  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:35<00:00,  5.70it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [95m 17      [0m | [95m 0.8256  [0m | [95m 0.8343  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:35<00:00,  5.56it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 18      [0m | [0m 0.8255  [0m | [0m 0.8474  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:37<00:00,  5.39it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [95m 19      [0m | [95m 0.8256  [0m | [95m 0.8399  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:35<00:00,  5.70it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 20      [0m | [0m 0.8255  [0m | [0m 0.8301  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:35<00:00,  5.63it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [95m 21      [0m | [95m 0.8256  [0m | [95m 0.8408  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:38<00:00,  5.23it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 22      [0m | [0m 0.8256  [0m | [0m 0.8408  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:37<00:00,  5.30it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 23      [0m | [0m 0.8256  [0m | [0m 0.8407  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:38<00:00,  5.25it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 24      [0m | [0m 0.8256  [0m | [0m 0.8406  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:37<00:00,  5.35it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 25      [0m | [0m 0.8256  [0m | [0m 0.8405  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:37<00:00,  5.37it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 26      [0m | [0m 0.8256  [0m | [0m 0.8403  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:37<00:00,  5.32it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 27      [0m | [0m 0.8256  [0m | [0m 0.8402  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:37<00:00,  5.34it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 28      [0m | [0m 0.8255  [0m | [0m 0.8432  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:37<00:00,  5.35it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 29      [0m | [0m 0.8256  [0m | [0m 0.8377  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:38<00:00,  5.25it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 30      [0m | [0m 0.8256  [0m | [0m 0.838   [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:36<00:00,  5.48it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 31      [0m | [0m 0.8256  [0m | [0m 0.8381  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:37<00:00,  5.39it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 32      [0m | [0m 0.8256  [0m | [0m 0.8382  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:35<00:00,  5.69it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 33      [0m | [0m 0.8256  [0m | [0m 0.8383  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:35<00:00,  5.68it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 34      [0m | [0m 0.8256  [0m | [0m 0.8385  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:36<00:00,  5.54it/s]
policy learning:   0%|          | 1/200 [00:00<00:36,  5.46it/s]

| [0m 35      [0m | [0m 0.8256  [0m | [0m 0.8386  [0m |
Best result: {'l': 0.840788282220842}; f(x) = 0.8255967153202908.
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:35<00:00,  5.68it/s]
policy learning:   0%|          | 1/200 [00:00<00:37,  5.32it/s]

[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:35<00:00,  5.61it/s]
policy learning:   0%|          | 1/200 [00:00<00:36,  5.44it/s]

[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:36<00:00,  5.43it/s]
policy learning:   0%|          | 1/200 [00:00<00:37,  5.25it/s]

[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:35<00:00,  5.62it/s]
policy learning:   0%|          | 1/200 [00:00<00:37,  5.31it/s]

SyntheticBanditDataset(n_actions=10, dim_context=5, reward_type='binary', reward_function=<function logistic_reward_function at 0x7fdd524998b0>, reward_std=1.0, action_context=array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1]]), behavior_policy_function=None, beta=-2, n_deficient_actions=0, random_state=12345, dataset_name='synthetic_bandit_dataset')
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:34<00:00,  5.74it/s]
policy learning:   0%|          | 1/200 [00:00<00:36,  5.39it/s]

0.7517748038610049
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:35<00:00,  5.66it/s]
policy learning:   0%|          | 1/200 [00:00<00:37,  5.31it/s]

0.7518739733662526
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:36<00:00,  5.42it/s]
policy learning:   0%|          | 1/200 [00:00<00:37,  5.24it/s]

0.7487798316567015
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:35<00:00,  5.64it/s]
policy learning:   0%|          | 1/200 [00:00<00:38,  5.16it/s]

0.7519757967130666
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:37<00:00,  5.37it/s]
policy learning:   0%|          | 1/200 [00:00<00:37,  5.30it/s]

0.7981672056028496
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:35<00:00,  5.67it/s]
policy learning:   0%|          | 1/200 [00:00<00:37,  5.34it/s]

0.8301080686143557
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:35<00:00,  5.61it/s]
policy learning:   0%|          | 1/200 [00:00<00:36,  5.38it/s]

0.9835739624573727
|   iter    |  target   |     l     |
-------------------------------------
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:35<00:00,  5.64it/s]
policy learning:   0%|          | 1/200 [00:00<00:36,  5.45it/s]

| [0m 1       [0m | [0m 0.8475  [0m | [0m 0.967   [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:34<00:00,  5.75it/s]
policy learning:   0%|          | 1/200 [00:00<00:37,  5.24it/s]

| [0m 2       [0m | [0m 0.8394  [0m | [0m 0.5472  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:35<00:00,  5.57it/s]
policy learning:   0%|          | 1/200 [00:00<00:37,  5.38it/s]

| [0m 3       [0m | [0m 0.8473  [0m | [0m 0.9727  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:35<00:00,  5.69it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [95m 4       [0m | [95m 0.8532  [0m | [95m 0.7148  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:35<00:00,  5.70it/s]
policy learning:   0%|          | 1/200 [00:00<00:36,  5.41it/s]

| [0m 5       [0m | [0m 0.8522  [0m | [0m 0.6977  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:34<00:00,  5.72it/s]
policy learning:   0%|          | 1/200 [00:00<00:37,  5.37it/s]

| [0m 6       [0m | [0m 0.8137  [0m | [0m 0.2161  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:38<00:00,  5.21it/s]
policy learning:   0%|          | 1/200 [00:00<00:39,  5.08it/s]

| [0m 7       [0m | [0m 0.8472  [0m | [0m 0.9763  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:38<00:00,  5.21it/s]
policy learning:   0%|          | 1/200 [00:00<00:38,  5.14it/s]

| [0m 8       [0m | [0m 0.8002  [0m | [0m 0.00623 [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:40<00:00,  4.96it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 9       [0m | [0m 0.8157  [0m | [0m 0.253   [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:38<00:00,  5.13it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 10      [0m | [0m 0.8331  [0m | [0m 0.4348  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:36<00:00,  5.51it/s]
policy learning:   0%|          | 1/200 [00:00<00:38,  5.18it/s]

| [95m 11      [0m | [95m 0.8558  [0m | [95m 0.7794  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:36<00:00,  5.42it/s]
policy learning:   0%|          | 1/200 [00:00<00:36,  5.51it/s]

| [0m 12      [0m | [0m 0.8127  [0m | [0m 0.1977  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:36<00:00,  5.53it/s]
policy learning:   0%|          | 1/200 [00:00<00:36,  5.50it/s]

| [95m 13      [0m | [95m 0.8561  [0m | [95m 0.863   [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:36<00:00,  5.53it/s]
policy learning:   0%|          | 1/200 [00:00<00:38,  5.20it/s]

| [0m 14      [0m | [0m 0.8469  [0m | [0m 0.9834  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:36<00:00,  5.45it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 15      [0m | [0m 0.8107  [0m | [0m 0.1638  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:35<00:00,  5.63it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 16      [0m | [0m 0.8532  [0m | [0m 0.7148  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:37<00:00,  5.40it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [95m 17      [0m | [95m 0.8563  [0m | [95m 0.8265  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:37<00:00,  5.35it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 18      [0m | [0m 0.8562  [0m | [0m 0.8436  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:36<00:00,  5.41it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 19      [0m | [0m 0.8562  [0m | [0m 0.8074  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:35<00:00,  5.69it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 20      [0m | [0m 0.8563  [0m | [0m 0.819   [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:35<00:00,  5.57it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 21      [0m | [0m 0.8562  [0m | [0m 0.8209  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:35<00:00,  5.62it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 22      [0m | [0m 0.8563  [0m | [0m 0.8284  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:34<00:00,  5.86it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 23      [0m | [0m 0.8562  [0m | [0m 0.829   [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:35<00:00,  5.71it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [95m 24      [0m | [95m 0.8563  [0m | [95m 0.8164  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:34<00:00,  5.72it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 25      [0m | [0m 0.8563  [0m | [0m 0.8158  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:35<00:00,  5.60it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 26      [0m | [0m 0.8563  [0m | [0m 0.8171  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:37<00:00,  5.36it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 27      [0m | [0m 0.8563  [0m | [0m 0.8167  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:37<00:00,  5.34it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 28      [0m | [0m 0.8563  [0m | [0m 0.8165  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:37<00:00,  5.34it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 29      [0m | [0m 0.8563  [0m | [0m 0.8159  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:36<00:00,  5.41it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 30      [0m | [0m 0.8563  [0m | [0m 0.8193  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:37<00:00,  5.36it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 31      [0m | [0m 0.8563  [0m | [0m 0.8132  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:37<00:00,  5.38it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 32      [0m | [0m 0.8563  [0m | [0m 0.8123  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:37<00:00,  5.39it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 33      [0m | [0m 0.8563  [0m | [0m 0.8122  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:37<00:00,  5.37it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 34      [0m | [0m 0.8563  [0m | [0m 0.8117  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:37<00:00,  5.30it/s]
policy learning:   0%|          | 1/200 [00:00<00:38,  5.11it/s]

| [0m 35      [0m | [0m 0.8563  [0m | [0m 0.8116  [0m |
Best result: {'l': 0.8163600254555555}; f(x) = 0.8562973805293275.
|   iter    |  target   |     l     |
-------------------------------------
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:37<00:00,  5.37it/s]
policy learning:   0%|          | 1/200 [00:00<00:38,  5.15it/s]

| [0m 1       [0m | [0m 0.8324  [0m | [0m 0.967   [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:37<00:00,  5.33it/s]
policy learning:   0%|          | 1/200 [00:00<00:36,  5.41it/s]

| [0m 2       [0m | [0m 0.8132  [0m | [0m 0.5472  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:37<00:00,  5.33it/s]
policy learning:   0%|          | 1/200 [00:00<00:37,  5.30it/s]

| [95m 3       [0m | [95m 0.8327  [0m | [95m 0.9727  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:36<00:00,  5.45it/s]
policy learning:   0%|          | 1/200 [00:00<00:38,  5.13it/s]

| [0m 4       [0m | [0m 0.8147  [0m | [0m 0.7148  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:37<00:00,  5.40it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 5       [0m | [0m 0.8134  [0m | [0m 0.6977  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:37<00:00,  5.29it/s]
policy learning:   0%|          | 1/200 [00:00<00:37,  5.24it/s]

| [0m 6       [0m | [0m 0.7795  [0m | [0m 0.2161  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:37<00:00,  5.33it/s]
policy learning:   0%|          | 1/200 [00:00<00:37,  5.26it/s]

| [95m 7       [0m | [95m 0.8328  [0m | [95m 0.9763  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:37<00:00,  5.38it/s]
policy learning:   0%|          | 1/200 [00:00<00:38,  5.18it/s]

| [0m 8       [0m | [0m 0.7866  [0m | [0m 0.00623 [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:38<00:00,  5.14it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 9       [0m | [0m 0.7809  [0m | [0m 0.253   [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:37<00:00,  5.35it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 10      [0m | [0m 0.8091  [0m | [0m 0.4348  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:37<00:00,  5.38it/s]
policy learning:   0%|          | 1/200 [00:00<00:37,  5.37it/s]

| [0m 11      [0m | [0m 0.8223  [0m | [0m 0.7794  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:37<00:00,  5.34it/s]
policy learning:   0%|          | 1/200 [00:00<00:37,  5.28it/s]

| [0m 12      [0m | [0m 0.7791  [0m | [0m 0.1977  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:37<00:00,  5.34it/s]
policy learning:   0%|          | 1/200 [00:00<00:37,  5.27it/s]

| [0m 13      [0m | [0m 0.8268  [0m | [0m 0.863   [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:37<00:00,  5.38it/s]
policy learning:   0%|          | 1/200 [00:00<00:37,  5.27it/s]

| [95m 14      [0m | [95m 0.833   [0m | [95m 0.9834  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:37<00:00,  5.36it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 15      [0m | [0m 0.78    [0m | [0m 0.1638  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:37<00:00,  5.33it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 16      [0m | [0m 0.8329  [0m | [0m 0.9763  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:35<00:00,  5.70it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [95m 17      [0m | [95m 0.8332  [0m | [95m 1.0     [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:35<00:00,  5.64it/s]


| [95m 18      [0m | [95m 0.8332  [0m | [95m 0.9958  [0m |
| [0m 19      [0m | [0m 0.8332  [0m | [0m 0.9958  [0m |
| [0m 20      [0m | [0m 0.8332  [0m | [0m 0.9958  [0m |
| [0m 21      [0m | [0m 0.8332  [0m | [0m 0.9958  [0m |
| [0m 22      [0m | [0m 0.8332  [0m | [0m 0.9958  [0m |
| [0m 23      [0m | [0m 0.8332  [0m | [0m 0.9958  [0m |
| [0m 24      [0m | [0m 0.8332  [0m | [0m 0.9958  [0m |
| [0m 25      [0m | [0m 0.8332  [0m | [0m 0.9958  [0m |
| [0m 26      [0m | [0m 0.8332  [0m | [0m 0.9958  [0m |
| [0m 27      [0m | [0m 0.8332  [0m | [0m 0.9958  [0m |
| [0m 28      [0m | [0m 0.8332  [0m | [0m 0.9958  [0m |
| [0m 29      [0m | [0m 0.8332  [0m | [0m 0.9958  [0m |


policy learning:   0%|          | 1/200 [00:00<00:37,  5.37it/s]

[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:34<00:00,  5.76it/s]
policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 30      [0m | [0m 0.8332  [0m | [0m 1.0     [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:37<00:00,  5.28it/s]


| [0m 31      [0m | [0m 0.8332  [0m | [0m 0.9954  [0m |
| [0m 32      [0m | [0m 0.8332  [0m | [0m 0.9954  [0m |
| [0m 33      [0m | [0m 0.8332  [0m | [0m 0.9954  [0m |


policy learning:   0%|          | 0/200 [00:00<?, ?it/s]

| [0m 34      [0m | [0m 0.8332  [0m | [0m 0.9954  [0m |
| [0m 35      [0m | [0m 0.8332  [0m | [0m 0.9954  [0m |
Best result: {'l': 0.9958112656357565}; f(x) = 0.833221498344664.
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:37<00:00,  5.29it/s]
policy learning:   0%|          | 1/200 [00:00<00:39,  5.05it/s]

[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:37<00:00,  5.39it/s]
policy learning:   0%|          | 1/200 [00:00<00:37,  5.26it/s]

[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:37<00:00,  5.36it/s]
policy learning:   0%|          | 1/200 [00:00<00:38,  5.14it/s]

[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 200/200 [00:37<00:00,  5.37it/s]
policy learning:   0%|          | 1/400 [00:00<01:16,  5.24it/s]

SyntheticBanditDataset(n_actions=10, dim_context=5, reward_type='binary', reward_function=<function logistic_reward_function at 0x7fdd524998b0>, reward_std=1.0, action_context=array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1]]), behavior_policy_function=None, beta=-2, n_deficient_actions=0, random_state=12345, dataset_name='synthetic_bandit_dataset')
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 400/400 [01:14<00:00,  5.37it/s]
policy learning:   0%|          | 1/400 [00:00<01:17,  5.14it/s]

0.6843270402506461
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 400/400 [01:14<00:00,  5.36it/s]
policy learning:   0%|          | 1/400 [00:00<01:14,  5.33it/s]

0.6812404064935456
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 400/400 [01:14<00:00,  5.35it/s]
policy learning:   0%|          | 1/400 [00:00<01:17,  5.12it/s]

0.6749966840944641
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 400/400 [01:13<00:00,  5.41it/s]
policy learning:   0%|          | 0/400 [00:00<?, ?it/s]

0.6659334401529351
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 400/400 [01:14<00:00,  5.36it/s]
policy learning:   0%|          | 1/400 [00:00<01:15,  5.25it/s]

0.6997448966548544
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 400/400 [01:14<00:00,  5.36it/s]
policy learning:   0%|          | 0/400 [00:00<?, ?it/s]

0.7606271163691508
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 400/400 [01:14<00:00,  5.35it/s]
policy learning:   0%|          | 1/400 [00:00<01:14,  5.34it/s]

0.9914707787947077
|   iter    |  target   |     l     |
-------------------------------------
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 400/400 [01:14<00:00,  5.35it/s]
policy learning:   0%|          | 1/400 [00:00<01:17,  5.17it/s]

| [0m 1       [0m | [0m 0.8786  [0m | [0m 0.967   [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 400/400 [01:13<00:00,  5.45it/s]
policy learning:   0%|          | 0/400 [00:00<?, ?it/s]

| [0m 2       [0m | [0m 0.8654  [0m | [0m 0.5472  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 400/400 [01:13<00:00,  5.45it/s]
policy learning:   0%|          | 1/400 [00:00<01:18,  5.07it/s]

| [0m 3       [0m | [0m 0.8782  [0m | [0m 0.9727  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 400/400 [01:13<00:00,  5.43it/s]
policy learning:   0%|          | 1/400 [00:00<01:15,  5.30it/s]

| [95m 4       [0m | [95m 0.8841  [0m | [95m 0.7148  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 400/400 [01:13<00:00,  5.42it/s]
policy learning:   0%|          | 1/400 [00:00<01:16,  5.22it/s]

| [0m 5       [0m | [0m 0.8825  [0m | [0m 0.6977  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 400/400 [01:13<00:00,  5.43it/s]
policy learning:   0%|          | 0/400 [00:00<?, ?it/s]

| [0m 6       [0m | [0m 0.8279  [0m | [0m 0.2161  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 400/400 [01:15<00:00,  5.33it/s]
policy learning:   0%|          | 1/400 [00:00<01:17,  5.14it/s]

| [0m 7       [0m | [0m 0.878   [0m | [0m 0.9763  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 400/400 [01:15<00:00,  5.33it/s]
policy learning:   0%|          | 1/400 [00:00<01:16,  5.22it/s]

| [0m 8       [0m | [0m 0.8093  [0m | [0m 0.00623 [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 400/400 [01:20<00:00,  4.99it/s]
policy learning:   0%|          | 1/400 [00:00<01:07,  5.92it/s]

| [0m 9       [0m | [0m 0.8346  [0m | [0m 0.253   [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 400/400 [36:01<00:00,  5.40s/it]
policy learning:   0%|          | 1/400 [00:00<01:14,  5.36it/s]

| [0m 10      [0m | [0m 0.8555  [0m | [0m 0.4348  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 400/400 [04:01<00:00,  1.66it/s]
policy learning:   0%|          | 1/400 [00:00<01:17,  5.13it/s]

| [95m 11      [0m | [95m 0.8866  [0m | [95m 0.7794  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 400/400 [01:13<00:00,  5.42it/s]
policy learning:   0%|          | 1/400 [00:00<01:15,  5.26it/s]

| [0m 12      [0m | [0m 0.8249  [0m | [0m 0.1977  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 400/400 [01:13<00:00,  5.44it/s]
policy learning:   0%|          | 1/400 [00:00<01:15,  5.26it/s]

| [0m 13      [0m | [0m 0.8832  [0m | [0m 0.863   [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 400/400 [08:46<00:00,  1.32s/it]
policy learning:   0%|          | 1/400 [00:00<01:16,  5.25it/s]

| [0m 14      [0m | [0m 0.8775  [0m | [0m 0.9834  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 400/400 [01:12<00:00,  5.54it/s]
policy learning:   0%|          | 0/400 [00:00<?, ?it/s]

| [0m 15      [0m | [0m 0.8219  [0m | [0m 0.1638  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 400/400 [01:15<00:00,  5.28it/s]
policy learning:   0%|          | 0/400 [00:00<?, ?it/s]

| [0m 16      [0m | [0m 0.8841  [0m | [0m 0.7148  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 400/400 [01:20<00:00,  4.94it/s]
policy learning:   0%|          | 0/400 [00:00<?, ?it/s]

| [0m 17      [0m | [0m 0.8863  [0m | [0m 0.7614  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 400/400 [01:15<00:00,  5.30it/s]
policy learning:   0%|          | 0/400 [00:00<?, ?it/s]

| [0m 18      [0m | [0m 0.8863  [0m | [0m 0.8021  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 400/400 [01:10<00:00,  5.68it/s]
policy learning:   0%|          | 0/400 [00:00<?, ?it/s]

| [95m 19      [0m | [95m 0.8867  [0m | [95m 0.7847  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 400/400 [01:11<00:00,  5.56it/s]
policy learning:   0%|          | 0/400 [00:00<?, ?it/s]

| [95m 20      [0m | [95m 0.8867  [0m | [95m 0.7856  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 400/400 [01:12<00:00,  5.53it/s]
policy learning:   0%|          | 0/400 [00:00<?, ?it/s]

| [0m 21      [0m | [0m 0.8867  [0m | [0m 0.7863  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 400/400 [01:11<00:00,  5.58it/s]
policy learning:   0%|          | 0/400 [00:00<?, ?it/s]

| [95m 22      [0m | [95m 0.8867  [0m | [95m 0.7866  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 400/400 [01:10<00:00,  5.65it/s]
policy learning:   0%|          | 0/400 [00:00<?, ?it/s]

| [95m 23      [0m | [95m 0.8867  [0m | [95m 0.7869  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 400/400 [01:10<00:00,  5.64it/s]
policy learning:   0%|          | 0/400 [00:00<?, ?it/s]

| [95m 24      [0m | [95m 0.8867  [0m | [95m 0.7873  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 400/400 [01:16<00:00,  5.21it/s]
policy learning:   0%|          | 0/400 [00:00<?, ?it/s]

| [0m 25      [0m | [0m 0.8867  [0m | [0m 0.7876  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 400/400 [01:18<00:00,  5.09it/s]
policy learning:   0%|          | 0/400 [00:00<?, ?it/s]

| [0m 26      [0m | [0m 0.8867  [0m | [0m 0.7877  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 400/400 [01:11<00:00,  5.59it/s]
policy learning:   0%|          | 0/400 [00:00<?, ?it/s]

| [0m 27      [0m | [0m 0.8867  [0m | [0m 0.7878  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 400/400 [01:14<00:00,  5.34it/s]
policy learning:   0%|          | 0/400 [00:00<?, ?it/s]

| [0m 28      [0m | [0m 0.8867  [0m | [0m 0.7879  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 400/400 [01:11<00:00,  5.60it/s]
policy learning:   0%|          | 0/400 [00:00<?, ?it/s]

| [0m 29      [0m | [0m 0.8867  [0m | [0m 0.7879  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 400/400 [01:10<00:00,  5.66it/s]
policy learning:   0%|          | 0/400 [00:00<?, ?it/s]

| [0m 30      [0m | [0m 0.8867  [0m | [0m 0.7879  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 400/400 [01:08<00:00,  5.80it/s]
policy learning:   0%|          | 0/400 [00:00<?, ?it/s]

| [0m 31      [0m | [0m 0.8867  [0m | [0m 0.7879  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 400/400 [01:08<00:00,  5.80it/s]
policy learning:   0%|          | 0/400 [00:00<?, ?it/s]

| [0m 32      [0m | [0m 0.8867  [0m | [0m 0.788   [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 400/400 [01:08<00:00,  5.83it/s]
policy learning:   0%|          | 0/400 [00:00<?, ?it/s]

| [0m 33      [0m | [0m 0.8867  [0m | [0m 0.7881  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 400/400 [01:11<00:00,  5.57it/s]
policy learning:   0%|          | 0/400 [00:00<?, ?it/s]

| [0m 34      [0m | [0m 0.8867  [0m | [0m 0.7882  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 400/400 [01:14<00:00,  5.40it/s]
policy learning:   0%|          | 1/400 [00:00<01:16,  5.25it/s]

| [0m 35      [0m | [0m 0.8867  [0m | [0m 0.7882  [0m |
Best result: {'l': 0.7872862909592927}; f(x) = 0.8866788344241824.
|   iter    |  target   |     l     |
-------------------------------------
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 400/400 [01:19<00:00,  5.05it/s]
policy learning:   0%|          | 0/400 [00:00<?, ?it/s]

| [0m 1       [0m | [0m 0.836   [0m | [0m 0.967   [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 400/400 [01:18<00:00,  5.10it/s]
policy learning:   0%|          | 1/400 [00:00<01:15,  5.30it/s]

| [0m 2       [0m | [0m 0.8302  [0m | [0m 0.5472  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 400/400 [01:10<00:00,  5.64it/s]
policy learning:   0%|          | 1/400 [00:00<01:07,  5.88it/s]

| [95m 3       [0m | [95m 0.8362  [0m | [95m 0.9727  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 400/400 [01:10<00:00,  5.65it/s]
policy learning:   0%|          | 1/400 [00:00<01:08,  5.79it/s]

| [0m 4       [0m | [0m 0.8302  [0m | [0m 0.7148  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 400/400 [01:18<00:00,  5.11it/s]
policy learning:   0%|          | 1/400 [00:00<01:16,  5.23it/s]

| [0m 5       [0m | [0m 0.8297  [0m | [0m 0.6977  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 400/400 [01:17<00:00,  5.18it/s]
policy learning:   0%|          | 0/400 [00:00<?, ?it/s]

| [0m 6       [0m | [0m 0.7743  [0m | [0m 0.2161  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 400/400 [01:19<00:00,  5.05it/s]
policy learning:   0%|          | 0/400 [00:00<?, ?it/s]

| [95m 7       [0m | [95m 0.8363  [0m | [95m 0.9763  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 400/400 [01:13<00:00,  5.48it/s]
policy learning:   0%|          | 1/400 [00:00<01:08,  5.81it/s]

| [0m 8       [0m | [0m 0.7595  [0m | [0m 0.00623 [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 400/400 [01:10<00:00,  5.69it/s]
policy learning:   0%|          | 1/400 [00:00<01:08,  5.79it/s]

| [0m 9       [0m | [0m 0.783   [0m | [0m 0.253   [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 400/400 [01:20<00:00,  4.94it/s]
policy learning:   0%|          | 0/400 [00:00<?, ?it/s]

| [0m 10      [0m | [0m 0.8035  [0m | [0m 0.4348  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 400/400 [01:12<00:00,  5.53it/s]
policy learning:   0%|          | 1/400 [00:00<01:07,  5.91it/s]

| [95m 11      [0m | [95m 0.8387  [0m | [95m 0.7794  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 400/400 [01:18<00:00,  5.06it/s]
policy learning:   0%|          | 0/400 [00:00<?, ?it/s]

| [0m 12      [0m | [0m 0.7695  [0m | [0m 0.1977  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 400/400 [01:29<00:00,  4.46it/s]
policy learning:   0%|          | 0/400 [00:00<?, ?it/s]

| [0m 13      [0m | [0m 0.8312  [0m | [0m 0.863   [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 400/400 [01:17<00:00,  5.17it/s]
policy learning:   0%|          | 1/400 [00:00<01:14,  5.38it/s]

| [0m 14      [0m | [0m 0.8364  [0m | [0m 0.9834  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 400/400 [01:14<00:00,  5.35it/s]
policy learning:   0%|          | 0/400 [00:00<?, ?it/s]

| [0m 15      [0m | [0m 0.7657  [0m | [0m 0.1638  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 400/400 [01:17<00:00,  5.16it/s]
policy learning:   0%|          | 0/400 [00:00<?, ?it/s]

| [0m 16      [0m | [0m 0.8302  [0m | [0m 0.7148  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 400/400 [01:16<00:00,  5.23it/s]
policy learning:   0%|          | 0/400 [00:00<?, ?it/s]

| [0m 17      [0m | [0m 0.8258  [0m | [0m 0.6096  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 400/400 [01:15<00:00,  5.33it/s]
policy learning:   0%|          | 0/400 [00:00<?, ?it/s]

| [95m 18      [0m | [95m 0.8401  [0m | [95m 0.809   [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 400/400 [01:15<00:00,  5.30it/s]
policy learning:   0%|          | 0/400 [00:00<?, ?it/s]

| [0m 19      [0m | [0m 0.8398  [0m | [0m 0.7979  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 400/400 [01:15<00:00,  5.29it/s]
policy learning:   0%|          | 0/400 [00:00<?, ?it/s]

| [0m 20      [0m | [0m 0.8391  [0m | [0m 0.8177  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 400/400 [01:15<00:00,  5.32it/s]
policy learning:   0%|          | 0/400 [00:00<?, ?it/s]

| [95m 21      [0m | [95m 0.8401  [0m | [95m 0.8056  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 400/400 [01:13<00:00,  5.44it/s]
policy learning:   0%|          | 0/400 [00:00<?, ?it/s]

| [0m 22      [0m | [0m 0.8401  [0m | [0m 0.8056  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 400/400 [01:11<00:00,  5.56it/s]
policy learning:   0%|          | 0/400 [00:00<?, ?it/s]

| [0m 23      [0m | [0m 0.8401  [0m | [0m 0.8058  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 400/400 [01:11<00:00,  5.57it/s]


| [95m 24      [0m | [95m 0.8401  [0m | [95m 0.8059  [0m |


policy learning:   0%|          | 0/400 [00:00<?, ?it/s]

[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning: 100%|██████████| 400/400 [01:20<00:00,  5.00it/s]
policy learning:   0%|          | 0/400 [00:00<?, ?it/s]

| [0m 25      [0m | [0m 0.8401  [0m | [0m 0.8059  [0m |
[0 1 0 ... 0 0 0]
[-0.5  0.5 -0.5 ... -0.5 -0.5 -0.5]
Here


policy learning:  97%|█████████▋| 388/400 [01:17<00:02,  5.39it/s]

With param values max_iter: 200 and random state: 12345 <br> 

policy value of NN Policy Learner with no loss: 0.7386491310472931<br>
policy value of NN Policy Learner with training denominator = 1: 0.7747569465293875<br>
policy value of NN Policy Learner with max training ratio: 0.7748265861941667<br>
policy value of NN Policy Learner with max validation ratio: 0.7680523661659651<br>

With param values max_iter: 200 and random state: 54321 <br>

policy value of NN Policy Learner with no loss: 0.7426535159127408 <br>
policy value of NN Policy Learner with training denominator = 1: 0.7715344364047754 <br>
policy value of NN Policy Learner with max training ratio: 0.7731315919182911 <br>
policy value of NN Policy Learner with max validation ratio: 0.7732276796432124 <br>

With param values max_iter: 200 and random state: 11111<br>

policy value of NN Policy Learner with no loss: 0.7525011085808299<br>
policy value of NN Policy Learner with training denominator = 1: 0.7720505838029407<br>
policy value of NN Policy Learner with max training ratio: 0.7730628901592789<br>
policy value of NN Policy Learner with max validation ratio: 0.7668315147195144<br>

In [None]:
policy_reg_param = [0, 0.001]
learning_rate = [0.0001, 0.001]
random_states = [12345, 54321, 11111, 47801, 60737]

with open("validation_evaluation.txt", "a") as o:

  for p in policy_reg_param:
    for lr in learning_rate:
      for r in random_states:

          print(dataset)
          
          den_l = loss_translation.binarySearch (200, r, p, lr, dataset, bandit_feedback_train, 0, 100, 0.01, 1, 0.05)
          
          def black_box_function(l):
          #l: Hyperparameter to optimize for, which is l
            f = loss_translation.predict_value_ratio_train (200, r, p, lr, dataset, l, bandit_feedback_train)
            return f
            
          pbounds = {"l": [0, 1]}
          init_pnts = 15
          num_iter = 20
          (ratio_train_l, ratio_train_val) = loss_translation.bayesOpt(black_box_function, pbounds, init_pnts, num_iter)

          def black_box_function(l):
          # l: Hyperparameter to optimize for, which is l
            f = loss_translation.predict_value_ratio_val (200, r , p, lr, dataset, l, bandit_feedback_train, bandit_feedback_val)
            return f

          pbounds = {"l": [0, 1]}
          init_pnts = 15
          num_iter = 20
          (ratio_val_l, ratio_val_val) = loss_translation.bayesOpt(black_box_function, pbounds, init_pnts, num_iter)

          # Policy with no loss
          nn_ipw = loss_translation.generateModel(dataset, 200, 0, r, p, lr)

          nn_ipw.fit(
                context=bandit_feedback_train["context"],
                action=bandit_feedback_train["action"],
                reward=bandit_feedback_train["reward"],
                pscore=bandit_feedback_train["pscore"],
              )

          action_dist_nn_ipw_no_loss = nn_ipw.predict_proba(
                context=bandit_feedback_val["context"]
              )

        # Policy with denominator = 1 for training data
          nn_ipw = loss_translation.generateModel(dataset, 200, den_l, r, p, lr)

          nn_ipw.fit(
                context=bandit_feedback_train["context"],
                action=bandit_feedback_train["action"],
                reward=bandit_feedback_train["reward"],
                pscore=bandit_feedback_train["pscore"],
              )

          action_dist_nn_ipw_den_1 = nn_ipw.predict_proba(
                context=bandit_feedback_val["context"]
              )

        # Policy with max ratio for training data
          nn_ipw = loss_translation.generateModel(dataset, 200, ratio_train_l ["l"], r, p, lr)

          nn_ipw.fit(
                context=bandit_feedback_train["context"],
                action=bandit_feedback_train["action"],
                reward=bandit_feedback_train["reward"],
                pscore=bandit_feedback_train["pscore"],
              )

          action_dist_nn_ipw_max_train_ratio = nn_ipw.predict_proba(
                context=bandit_feedback_val["context"]
            )

        # Policy with max ratio for validation data
          nn_ipw = loss_translation.generateModel(dataset, 200, ratio_val_l ["l"], r, p, lr)

          nn_ipw.fit(
                context=bandit_feedback_train["context"],
                action=bandit_feedback_train["action"],
                reward=bandit_feedback_train["reward"],
                pscore=bandit_feedback_train["pscore"],
              )

          action_dist_nn_ipw_max_val_ratio = nn_ipw.predict_proba(
                context=bandit_feedback_val["context"]
            )

          policy_names = [
          "NN Policy Learner with no loss",
          "NN Policy Learner with training denominator = 1",
          "NN Policy Learner with max training ratio",
          "NN Policy Learner with max validation ratio"
            ]
          action_dist_list = [
            action_dist_nn_ipw_no_loss,
            action_dist_nn_ipw_den_1,
            action_dist_nn_ipw_max_train_ratio,
            action_dist_nn_ipw_max_val_ratio
            ]

          o.write("With param values policy regularization: " + str(p) +" and learning rate: " + str(lr) + " and random state: " + str(r) + "\n")

          for name, action_dist in zip(policy_names, action_dist_list):
            true_policy_value = dataset.calc_ground_truth_policy_value(
            expected_reward=bandit_feedback_test["expected_reward"],
            action_dist=action_dist,
            )
            o.write(f'policy value of {name}: {true_policy_value}')

o.close()