In [1]:
import numpy as np

In [2]:
class GaussianBandit(object):
    """A single slot machine whose rewards follow a Gaussian distribution."""

    def __init__(self, mean=0, stdev=1):
        # Mean and standard deviation of this machine's reward distribution.
        self.mean, self.stdev = mean, stdev

    def pull_lever(self):
        """Draw one reward from the machine and return it rounded to one decimal."""
        return np.round(np.random.normal(loc=self.mean, scale=self.stdev), 1)

In [3]:
# Three example machines with different reward distributions.
g1 = GaussianBandit(mean=5, stdev=2)
g2 = GaussianBandit(mean=6, stdev=2)
g3 = GaussianBandit(mean=1, stdev=5)

In [8]:
# Sample one reward from the second machine (mean 6, stdev 2).
g2.pull_lever()

6.3

In [9]:
class GaussianBanditGame(object):
    """Interactive multi-armed bandit game over a set of slot machines.

    Machines are addressed by their 1-based position in ``self.bandits``.
    The game records every reward, the running total and the number of
    rounds played since the last reset.
    """

    def __init__(self, bandits):
        """Create a game from ``bandits``, an iterable of machines.

        Each machine only needs a ``pull_lever()`` method that returns a
        reward. The machines are shuffled so that their position gives no
        hint about their payout; the shuffle is done on a private copy so
        the caller's own sequence keeps its order.
        """
        self.bandits = list(bandits)
        np.random.shuffle(self.bandits)
        self.reset_game()

    def play(self, choice):
        """Pull the lever of machine number ``choice`` and record the result.

        Args:
            choice: 1-based machine number, from 1 to ``len(self.bandits)``.

        Returns:
            The reward produced by the chosen machine.

        Raises:
            IndexError: If ``choice`` is not a valid machine number.
        """
        # Reject 0 and negatives explicitly: plain list indexing would wrap
        # around and silently play a machine the caller did not ask for.
        if not 1 <= choice <= len(self.bandits):
            raise IndexError(
                f"choice must be between 1 and {len(self.bandits)}, "
                f"got {choice}")
        reward = self.bandits[choice - 1].pull_lever()
        self.rewards.append(reward)
        self.total_reward += reward
        self.n_played += 1
        return reward

    def user_play(self):
        """Run an interactive session on standard input/output.

        The game is reset first. Each round asks for a machine number,
        plays it and prints the reward and the running average. Any integer
        outside the valid range (0 is the advertised one) ends the session
        and prints a summary; non-numeric input is rejected and asked again.
        """
        self.reset_game()
        print("Game started. " +
              "Enter 0 as input to end the game.")
        n_machines = len(self.bandits)
        while True:
            print(f"\n -- Round {self.n_played}")
            answer = input(f"Choose a machine from 1 to {n_machines}: ")
            try:
                choice = int(answer)
            except ValueError:
                # A typo should not abort the session with a traceback.
                print("Please enter a whole number.")
                continue
            if not 1 <= choice <= n_machines:
                break
            reward = self.play(choice)
            print(f"Machine {choice} gave a reward of {reward}.")
            avg_rew = self.total_reward / self.n_played
            print(f"Your average reward so far is {avg_rew}.")
        print("Game has ended.")
        if self.n_played > 0:
            print(f"Total reward is {self.total_reward}"
                  f" after {self.n_played} round(s).")
            avg_rew = self.total_reward / self.n_played
            print(f"Average reward is {avg_rew}.")

    def reset_game(self):
        """Forget all recorded rewards and set the counters back to zero."""
        self.rewards = []
        self.total_reward = 0
        self.n_played = 0

In [10]:
# Three machines with different reward distributions, handed to a new game.
slotA = GaussianBandit(mean=5, stdev=3)
slotB = GaussianBandit(mean=6, stdev=2)
slotC = GaussianBandit(mean=1, stdev=5)
game = GaussianBanditGame(bandits=[slotA, slotB, slotC])

In [11]:
# Start an interactive session; entering 0 at the prompt ends it.
game.user_play()

Game started. Enter 0 as input to end the game.

 -- Round 0
Choose a machine from 1 to 3: 1
Machine 1 gave a reward of 10.6.
Your average reward so far is 10.6.

 -- Round 1
Choose a machine from 1 to 3: 2
Machine 2 gave a reward of 6.5.
Your average reward so far is 8.55.

 -- Round 2
Choose a machine from 1 to 3: 3
Machine 3 gave a reward of -5.3.
Your average reward so far is 3.9333333333333336.

 -- Round 3
Choose a machine from 1 to 3: 1
Machine 1 gave a reward of 2.8.
Your average reward so far is 3.6500000000000004.

 -- Round 4
Choose a machine from 1 to 3: 2
Machine 2 gave a reward of 6.0.
Your average reward so far is 4.12.

 -- Round 5
Choose a machine from 1 to 3: 2
Machine 2 gave a reward of 5.5.
Your average reward so far is 4.3500000000000005.

 -- Round 6
Choose a machine from 1 to 3: 2
Machine 2 gave a reward of 6.8.
Your average reward so far is 4.7.

 -- Round 7
Choose a machine from 1 to 3: 1
Machine 1 gave a reward of 5.9.
Your average reward so far is 4.85.

 -- 

In [12]:
# Play a second session; user_play() resets the rewards and counters first.
game.user_play()

Game started. Enter 0 as input to end the game.

 -- Round 0
Choose a machine from 1 to 3: 2
Machine 2 gave a reward of 8.0.
Your average reward so far is 8.0.

 -- Round 1
Choose a machine from 1 to 3: 2
Machine 2 gave a reward of 6.2.
Your average reward so far is 7.1.

 -- Round 2
Choose a machine from 1 to 3: 2
Machine 2 gave a reward of 5.1.
Your average reward so far is 6.433333333333333.

 -- Round 3
Choose a machine from 1 to 3: 2
Machine 2 gave a reward of 5.7.
Your average reward so far is 6.249999999999999.

 -- Round 4
Choose a machine from 1 to 3: 2
Machine 2 gave a reward of 7.5.
Your average reward so far is 6.5.

 -- Round 5
Choose a machine from 1 to 3: 2
Machine 2 gave a reward of 0.1.
Your average reward so far is 5.433333333333334.

 -- Round 6
Choose a machine from 1 to 3: 2
Machine 2 gave a reward of 5.4.
Your average reward so far is 5.428571428571429.

 -- Round 7
Choose a machine from 1 to 3: 2
Machine 2 gave a reward of 7.1.
Your average reward so far is 5.637

In [13]:
# Reveal the true mean of machine 2 (index 1 in the shuffled list).
game.bandits[1].mean

6

# Online Ads

In [14]:
class BernoulliBandit(object):
    """An ad modeled as a Bernoulli arm: each display yields a reward of 0 or 1."""

    def __init__(self, p):
        # Probability that a single display yields a reward of 1.
        self.p = p

    def display_ad(self):
        """Display the ad once and return the outcome of a single Bernoulli(p) trial."""
        return np.random.binomial(n=1, p=self.p)

In [15]:
# Five ads, listed in order of increasing reward probability.
adA, adB, adC, adD, adE = (
    BernoulliBandit(p) for p in (0.004, 0.016, 0.02, 0.028, 0.031)
)

In [16]:
# All ads in a fixed order; the experiments below refer to an ad by its index in this list.
ads = [adA, adB, adC, adD, adE]

## A/B/n Testing

In [None]:
# Impression budget: a random-assignment test phase followed by a production phase.
n_test, n_prod = 10_000, 90_000
n_ads = len(ads)
Q = np.zeros(n_ads)  # action values: running mean reward of each ad
N = np.zeros(n_ads)  # number of impressions served per ad
total_reward = 0
avg_rewards = []  # average reward after each impression, kept for plotting

In [None]:
# A/B/n test: every test impression shows an ad picked uniformly at random.
for i in range(n_test):
    ad_chosen = np.random.randint(n_ads)
    R = ads[ad_chosen].display_ad()  # Observe reward
    N[ad_chosen] += 1
    # Incremental mean: move the estimate toward R by a 1/N step, which keeps
    # Q[ad_chosen] equal to that ad's average reward without storing history.
    Q[ad_chosen] += (1 / N[ad_chosen]) * (R - Q[ad_chosen])
    total_reward += R
    # Running average over all impressions served so far (i is 0-based).
    avg_reward_so_far = total_reward / (i + 1)
    avg_rewards.append(avg_reward_so_far)

In [None]:
# The ad with the highest estimated action value wins the test.
best_ad_index = np.argmax(Q)
# Map index 0, 1, 2, ... to the letters A, B, C, ... used in the ad names.
print(f"The best performing ad is {chr(ord('A') + best_ad_index)}")

In [None]:
# Production phase: serve only the ad that won the test for the remaining impressions.
ad_chosen = best_ad_index
for i in range(n_prod):
    R = ads[ad_chosen].display_ad()
    total_reward += R
    # The denominator counts the n_test impressions already served, so the
    # curve continues the test-phase average instead of restarting.
    avg_reward_so_far = total_reward / (n_test + i + 1)
    avg_rewards.append(avg_reward_so_far)

In [None]:
import pandas as pd

# One column per strategy; the later experiments add their own columns to this frame.
df_reward_comparison = pd.DataFrame({'A/B/n': avg_rewards})

In [None]:
import cufflinks as cf
import plotly.offline

# Configure cufflinks for offline plotting with a white theme.
cf.go_offline()
cf.set_config_file(world_readable=True, theme="white")

# The title reports the average reward after the last impression.
df_reward_comparison['A/B/n'].iplot(
    title=f"A/B/n Test Avg. Reward: {avg_reward_so_far:.4f}",
    xTitle='Impressions',
    yTitle='Avg. Reward',
)

## ε-Greedy

In [None]:
eps = 0.1  # probability of showing a random ad instead of the current best
n_prod = 100_000
n_ads = len(ads)
# Fresh estimates and counters for this experiment.
Q = np.zeros(n_ads)
N = np.zeros(n_ads)
total_reward = 0
avg_rewards = []

In [None]:
# Start from a random ad, then alternate between observing a reward and picking the next ad.
ad_chosen = np.random.randint(n_ads)
for i in range(n_prod):
    R = ads[ad_chosen].display_ad()
    N[ad_chosen] += 1
    # Incremental mean update of the chosen ad's estimated value.
    Q[ad_chosen] += (1 / N[ad_chosen]) * (R - Q[ad_chosen])
    total_reward += R
    avg_reward_so_far = total_reward / (i + 1)
    avg_rewards.append(avg_reward_so_far)
    
    # Select the next ad to display
    # Explore a uniformly random ad when the draw falls at or below eps;
    # otherwise exploit the ad with the highest current estimate.
    if np.random.uniform() <= eps:
        ad_chosen = np.random.randint(n_ads)
    else:
        ad_chosen = np.argmax(Q)

# Store the curve under a column name that records the eps value used.
df_reward_comparison['e-greedy: {}'.format(eps)] = avg_rewards

In [None]:
# Columns to plot, with one dash style per listed column.
greedy_list = ['e-greedy: 0.1']
df_reward_comparison[greedy_list].iplot(
    title="ε-Greedy Actions",
    dash=['solid'],
    xTitle='Impressions',
    yTitle='Avg. Reward',
)

## Upper Confidence Bounds (UCB)

In [None]:
c = 0.1  # weight of the uncertainty term added to Q when ranking ads
n_prod = 100_000
n_ads = len(ads)
ad_indices = np.arange(n_ads)
# Fresh estimates and counters for this experiment.
Q = np.zeros(n_ads)
N = np.zeros(n_ads)
total_reward = 0
avg_rewards = []

In [None]:
# UCB action selection; t is the 1-based impression counter.
for t in range(1, n_prod + 1):
    # Indices of ads that have never been shown, found in one vectorized pass.
    untried = np.flatnonzero(N == 0)
    if untried.size > 0:
        # Show every ad once first: the uncertainty term below divides by N.
        ad_chosen = np.random.choice(untried)
    else:
        # The bonus grows with log(t) and shrinks as an ad gets more impressions.
        uncertainty = np.sqrt(np.log(t) / N)
        ad_chosen = np.argmax(Q + c * uncertainty)

    R = ads[ad_chosen].display_ad()
    N[ad_chosen] += 1
    # Incremental mean update of the chosen ad's estimated value.
    Q[ad_chosen] += (1 / N[ad_chosen]) * (R - Q[ad_chosen])
    total_reward += R
    avg_reward_so_far = total_reward / t
    avg_rewards.append(avg_reward_so_far)

# Store the curve under a column name that records the c value used.
df_reward_comparison['UCB, c={}'.format(c)] = avg_rewards

In [None]:
# Plot every UCB run recorded so far (one column per value of c), so the cell
# works whether one or several values of c have been run.
ucb_list = [col for col in df_reward_comparison.columns
            if str(col).startswith('UCB, c=')]
# The last row of each curve is that run's average reward over all impressions.
best_reward = df_reward_comparison[ucb_list].iloc[-1].max()
# Cycle through the dash styles so each plotted column gets one.
dash_styles = ['solid', 'dash', 'dashdot', 'dot']
df_reward_comparison[ucb_list].iplot(title="Action Selection using UCB. Best avg. reward: {:.4f}"
                                    .format(best_reward),
                                    dash=[dash_styles[k % len(dash_styles)]
                                          for k in range(len(ucb_list))],
                                    xTitle='Impressions',
                                    yTitle='Avg. Reward')

## Thompson Sampling

In [None]:
n_prod = 100_000
n_ads = len(ads)
# Beta parameters per ad, both starting at 1.
alphas, betas = np.ones(n_ads), np.ones(n_ads)
total_reward = 0
avg_rewards = []

In [None]:
# Thompson Sampling: show the ad whose sampled reward probability is highest.
for i in range(n_prod):
    # One draw per ad from Beta(alphas[k], betas[k]), taken in a single vectorized call.
    theta_samples = np.random.beta(alphas, betas)
    ad_chosen = np.argmax(theta_samples)
    R = ads[ad_chosen].display_ad()
    # Update the chosen ad's Beta parameters: a reward of 1 increments alpha,
    # a reward of 0 increments beta.
    alphas[ad_chosen] += R
    betas[ad_chosen] += 1 - R
    total_reward += R
    avg_reward_so_far = total_reward / (i + 1)
    avg_rewards.append(avg_reward_so_far)
df_reward_comparison['Thompson Sampling'] = avg_rewards

In [None]:
# The title reports the average reward after the last impression.
df_reward_comparison['Thompson Sampling'].iplot(
    title=f"Thompson Sampling Avg. Reward: {avg_reward_so_far:.4f}",
    xTitle='Impressions',
    yTitle='Avg. Reward',
)