# Chapter 10

## 10.6

In [12]:
import cvxpy as cp
import numpy as np
import math

def UCBAnytime(bandit, n, seed=541):
    """
    Implementation of the asymptotically optimal UCB algorithm (algorithm 6 in Lattimore 2018) for 1/2-subgaussian arms
    -----
    bandit: the bandit instance
    n: time horizon
    """

    rng = np.random.default_rng(seed)
    
    K = bandit.K()

    reward_history = {}

    # pull each arm once
    arm_UCB = np.zeros(K)
    for a in range(K):
        reward_history[a] = [bandit.pull(a)]
        arm_UCB[a] = np.mean(reward_history[a]) + math.sqrt(math.log(1 + (a + 1) * (math.log(a + 1) ** 2)) / (2 * len(reward_history[a])))

    for t in range(K, n):
        arm_to_pull = np.argmax(arm_UCB)
        reward_history[arm_to_pull].append(bandit.pull(arm_to_pull))
        arm_UCB[arm_to_pull] = np.mean(reward_history[arm_to_pull]) + math.sqrt(math.log(1 + (t + 1) * (math.log(t + 1) ** 2)) / (2 * len(reward_history[a])))

def KLUCB(bandit, n, seed=541):
    """
    Implementation of the KL-UCB algorithm (algorithm 8 in Lattimore 2018)
    -----
    bandit: the bandit instance
    n: time horizon
    """

    rng = np.random.default_rng(seed)
    
    K = bandit.K()

    reward_history = {}

    # pull each arm once
    arm_UCB = np.zeros(K)
    for a in range(K):
        reward_history[a] = [bandit.pull(a)]

        # solve the for arm index
        past_mean = np.mean(reward_history[a])
        RHS_ratio = (math.log(1 + (a + 1) * (math.log(a + 1) ** 2)) / len(reward_history[a]))
        
        mu = cp.Variable()
        objective = cp.Maximize(mu)
        constraints = [
            mu >= 0, 
            mu <= 1, 
            past_mean * cp.log(past_mean) - past_mean * cp.log(mu) + (1 - past_mean) * cp.log(1 - past_mean) - (1 - past_mean) * cp.log(1 - mu) <= RHS_ratio
        ]
        prob = cp.Problem(objective, constraints)
        prob.solve()
        arm_UCB[a] = mu.value

    for t in range(K, n):
        arm_to_pull = np.argmax(arm_UCB)
        reward_history[arm_to_pull].append(bandit.pull(arm_to_pull))
        
        # solve the for arm index
        past_mean = np.mean(reward_history[arm_to_pull])
        RHS_ratio = (math.log(1 + (t + 1) * (math.log(t + 1) ** 2)) / len(reward_history[arm_to_pull]))

        mu = cp.Variable()
        objective = cp.Maximize(mu)
        constraints = [
            mu >= 0, 
            mu <= 1, 
            past_mean * cp.log(past_mean) - past_mean * cp.log(mu) + (1 - past_mean) * cp.log(1 - past_mean) - (1 - past_mean) * cp.log(1 - mu) <= RHS_ratio
        ]
        prob = cp.Problem(objective, constraints)
        prob.solve()
        arm_UCB[arm_to_pull] = mu.value

In [13]:
from bandit_instance import BernoulliBandit

step = 0.005
gaps = np.arange(0.01, 0.5 + step, step)
mu_1 = 0.5
n = 10000
N = 5000
Anytime_avg_regret_log = []
KL_avg_regret_log = []

for gap in gaps:
    Anytime_regret_log = []
    KL_regret_log = []
    for seed in range(N):
        bandit = BernoulliBandit([mu_1, mu_1+gap], seed)
        UCBAnytime(bandit, n, seed)
        Anytime_regret_log.append(bandit.regret()["pseudo"])

        bandit = BernoulliBandit([mu_1, mu_1+gap], seed)
        KLUCB(bandit, n, seed)
        KL_regret_log.append(bandit.regret()["pseudo"])

    Anytime_avg_regret_log.append(np.mean(Anytime_regret_log))
    KL_avg_regret_log.append(np.mean(KL_regret_log))
    # print(f"average regret is {avg_regret_log[-1]}")

KeyboardInterrupt: 

In [6]:
Anytime_regret_log

[99.98999999999978]