The purpose of this notebook is to implement a simple linear contextual bandit using the TF research library.


*We create a recommendation system bandit problem as follows. The Jester Dataset (Goldberg et al., 2001) provides continuous ratings in [-10, 10] for 100 jokes from a total of 73421 users. We find a complete subset of n = 19181 users rating all 40 jokes. Following Riquelme et al. (2017), we take d = 32 of the ratings as the context of the user, and k = 8 as the arms. The agent recommends one joke, and obtains the reward corresponding to the rating of the user for the selected joke.*

In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import sys
import time

import numpy as np
import pandas as pd
import tensorflow as tf

from absl import app
from absl import flags

from linear_bandit.sample_jester_data import sample_jester_data
from linear_bandit.sample_retail_data import sample_retail_data

from linear_bandit.contextual_bandit import run_contextual_bandit

from linear_bandit.linear_full_posterior_sampling import LinearFullPosteriorSampling
from linear_bandit.neural_bandit_model import NeuralBanditModel
from linear_bandit.neural_linear_sampling import NeuralLinearPosteriorSampling

from linear_bandit.bandit_algorithm import BanditAlgorithm
from linear_bandit.contextual_bandit import ContextualBandit

import matplotlib.pyplot as plt

# Root directory for datasets and logs. Set the BANDITS_DATA_DIR environment
# variable to relocate it; the fallback is the path originally hardcoded here.
# Joining with '' guarantees a trailing separator, so `data_route + 'name'`
# concatenation keeps working whatever value the variable holds.
data_route = os.path.join(
    os.environ.get('BANDITS_DATA_DIR', '/Users/tmo/Data/bandits/'), '')

FLAGS = flags.FLAGS

# Dummy `f` flag (help text: 'kernel'); presumably it absorbs the `-f <file>`
# argument the Jupyter kernel is launched with, so that parsing sys.argv
# below does not fail on an unknown flag.
# The membership guards keep this cell re-runnable: defining an existing flag
# a second time raises absl's DuplicateFlagError.
if 'f' not in FLAGS:
    tf.app.flags.DEFINE_string('f', '', 'kernel')

FLAGS.set_default('alsologtostderr', True)
if 'logdir' not in FLAGS:
    flags.DEFINE_string('logdir', os.path.join(data_route, 'logs', ''),
                        'Base directory to save output')

# Parse the command line; the return value is the list of leftover arguments.
FLAGS(sys.argv)

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


['/usr/local/anaconda3/envs/tf/lib/python3.6/site-packages/ipykernel_launcher.py']

In [19]:
# Sample the bandit problem data; the argument is presumably the number of
# user contexts to draw (the resulting dataset has 2000 rows).
# NOTE(review): `get_jester_data` is neither imported nor defined in the
# visible cells (only `sample_jester_data` is imported) -- confirm it is
# defined before this cell runs on a fresh kernel.
sampled_vals = get_jester_data(2000)

In [20]:
# Unpack the sampled bundle into the module-level names used by later cells.
(
    dataset,
    opt_rewards,
    opt_actions,
    num_actions,
    context_dim,
) = sampled_vals

In [21]:
# Sanity check: display the dimensions of the sampled dataset.
dataset.shape

(2000, 40)

In [22]:
# Hyper-parameters for the linear full-posterior-sampling bandit.
hparams_linear = tf.contrib.training.HParams(
    num_actions=num_actions,
    context_dim=context_dim,
    a0=6,
    b0=6,
    lambda_prior=0.25,
    initial_pulls=2,
)

# Linear bandit agent configured with the hyper-parameters above.
linear_bandit = LinearFullPosteriorSampling(
    name='linear_bandit',
    hparams=hparams_linear,
)

In [23]:
# Hyper-parameters for the neural-linear posterior-sampling bandit.
hparams_nlinear = tf.contrib.training.HParams(
    num_actions=num_actions,
    context_dim=context_dim,
    init_scale=0.3,
    activation=tf.nn.relu,
    layer_sizes=[50],
    batch_size=512,
    activate_decay=True,
    initial_lr=0.1,
    max_grad_norm=5.0,
    show_training=False,
    freq_summary=1000,
    buffer_s=-1,
    initial_pulls=2,
    reset_lr=True,
    lr_decay_rate=0.5,
    training_freq=1,
    training_freq_network=50,
    training_epochs=100,
    a0=6,
    b0=6,
    lambda_prior=0.25,
    verbose=False,
)

# Neural-linear bandit agent configured with the hyper-parameters above.
neural_bandit = NeuralLinearPosteriorSampling('neural_bandit', hparams_nlinear)

In [24]:
def _action_frequencies(actions):
    """Return ``[[action, count], ...]`` for ``actions``, sorted by action."""
    from collections import Counter

    # A single O(n) counting pass instead of one `list.count` scan per
    # distinct action.
    return [[action, count] for action, count in sorted(Counter(actions).items())]


def run_bandit(model, hparams, plot=True, plot_freq=500):
    """Play `model` through every row of the sampled dataset, one context at a time.

    Relies on notebook-level globals: `dataset`, `context_dim`, `opt_actions`,
    `opt_rewards` and, when `plot` is true, `plot_optimal_model_actions`.
    The model is updated in place, so calling this again with the same model
    continues from its already-trained state.

    Args:
        model: Bandit exposing `action(context)`, `update(context, action,
            reward)` and a step counter `t`.
        hparams: Not used by this function; kept so existing calls keep working.
        plot: If true, plot optimal vs. chosen action frequencies whenever
            `model.t` is a multiple of `plot_freq`.
        plot_freq: Plotting period, measured in `model.t` steps.

    Returns:
        Tuple `(optimal_action_frequencies, model_action_frequencies)`; each
        is a list of `[action, count]` pairs sorted by action.
    """
    num_contexts = dataset.shape[0]

    h_actions = []
    h_rewards = []

    # The optimal actions do not change while the bandit runs, so their
    # frequency table is computed once rather than on every plot tick.
    optimal_action_frequencies = _action_frequencies(opt_actions)

    # Run the contextual bandit process
    for i in range(num_contexts):
        # The first `context_dim` columns are the user's context ...
        context = dataset[i, :context_dim]
        # ... the model picks one arm for that context ...
        action = model.action(context)
        # ... and the reward is the rating stored in the chosen arm's column.
        reward = dataset[i, context_dim + action]

        model.update(context, action, reward)

        h_actions.append(action)
        h_rewards.append(reward)

        if plot and model.t % plot_freq == 0:
            plot_optimal_model_actions(optimal_action_frequencies,
                                       _action_frequencies(h_actions),
                                       model.t)

    print('Optimal total reward = {}.'.format(np.sum(opt_rewards)))
    print('Total reward from bandit = {}.'.format(np.sum(h_rewards)))
    print('Reward ratio = {}'.format(np.sum(h_rewards)/np.sum(opt_rewards)))

    return optimal_action_frequencies, _action_frequencies(h_actions)

In [27]:
%%timeit
oaf, maf = run_bandit(linear_bandit, hparams_linear, plot=False)

Optimal total reward = 11341.560000000001.
Total reward from bandit = 4533.889999999999.
Reward ratio = 0.39975893968730924
Optimal total reward = 11341.560000000001.
Total reward from bandit = 4735.59.
Reward ratio = 0.4175430893104652
Optimal total reward = 11341.560000000001.
Total reward from bandit = 5065.33.
Reward ratio = 0.44661669117828584
Optimal total reward = 11341.560000000001.
Total reward from bandit = 5061.74.
Reward ratio = 0.44630015623952957
Optimal total reward = 11341.560000000001.
Total reward from bandit = 5047.9400000000005.
Reward ratio = 0.4450833924080991
Optimal total reward = 11341.560000000001.
Total reward from bandit = 5270.5599999999995.
Reward ratio = 0.46471208546266995
Optimal total reward = 11341.560000000001.
Total reward from bandit = 5295.6.
Reward ratio = 0.46691989461767164
Optimal total reward = 11341.560000000001.
Total reward from bandit = 5188.59.
Reward ratio = 0.4574846846465565
19.2 s ± 3.42 s per loop (mean ± std. dev. of 7 runs, 1 loop