# Reinforcement Learning: An Introduction

### Exercise 2.8

A k-armed bandit problem, used to compare different methods for tracking nonstationary problems.

In [1]:
import numpy as np



### Define Actors

In [82]:
# Bandit: one fixed reward per arm, drawn uniformly from [-1.5, 1.5).
np.random.seed(14)  # Seed the global generator so the rewards are reproducible.
B = 3 * np.random.rand(10) - 1.5
print("Bandit rewards:", B)

def Bandit(arm, B):
    """Return the reward for pulling `arm`, i.e. the entry of `B` at that index."""
    reward = B[arm]
    return reward

Bandit rewards: [ 0.04183003  0.81949516  1.11128306 -1.47585915 -0.57079222  1.37281122
  0.03935014 -0.54514673  0.11759981 -0.83623517]


In [62]:
# UCB.
def UCB(Q, N, t, c):
    """Pick the arm with the largest upper-confidence-bound score.

    Each arm is scored as Q + c * sqrt(log(t) / N), and the index of the
    highest score is returned.

    Q: value estimate per arm.
    N: selection count per arm.
    t: current time step (passed to np.log).
    c: weight of the exploration bonus.
    """
    # Floor the counts at a tiny positive value to prevent division by zero.
    safe_counts = np.clip(N, 0.0000001, None)
    exploration_bonus = c * np.sqrt(np.log(t) / safe_counts)
    return np.argmax(Q + exploration_bonus)

In [74]:
# Gradient Bandit.
def GradientBandit(Q):
    """Sample an arm from the softmax distribution over the preferences `Q`.

    Q: one preference value per arm.
    Returns the index of the sampled arm, drawn with probability
    exp(Q[a]) / sum(exp(Q)).
    """
    prefs = np.asarray(Q, dtype=float)
    # Shifting by the maximum leaves the softmax unchanged but keeps np.exp
    # from overflowing to inf (which would turn the probabilities into NaN).
    e_Q = np.exp(prefs - np.max(prefs))
    P_a = e_Q / np.sum(e_Q)
    # Size the draw from Q itself rather than a notebook-level `k`, so the
    # function works regardless of which cells have been run.
    return np.random.choice(len(P_a), p=P_a)

In [79]:
# Epsilon Greedy.
def EpsilonGreedy(Q, epsilon):
    """Choose an arm with the epsilon-greedy rule.

    Q: value estimate per arm.
    epsilon: exploration threshold; a uniform draw in [0, 1) that is not
        greater than epsilon triggers exploration.
    Returns the index of the chosen arm.
    """
    # A single scalar draw; compared directly instead of via a 1-element array.
    if np.random.rand() > epsilon:
        # Exploit
        return np.argmax(Q)
    # Explore: pick uniformly among the arms of Q. The arm count comes from Q
    # itself, not from a notebook-level `k` defined in a later cell.
    return np.random.choice(len(Q))

In [80]:
# Greedy with optimistic Q_0 is just EpsilonGreedy with different initial Q values.
def OptimisiticInit(Q, Q_0):
    """Add the offset `Q_0` to the estimates `Q` and return `Q`.

    The addition uses `+=`, so when `Q` is a mutable object such as a NumPy
    array it is updated in place and the returned object is the one passed in.
    """
    Q += Q_0
    return Q

### Run Simulation

In [163]:
# Run Simulation

# Setup.
k = 10               # Number of arms (B holds 10 rewards).
total_rewards = 0    # Sum over epochs of each epoch's mean per-step reward.

epsilon = 1/128      # Exploration threshold passed to EpsilonGreedy.
alpha = 0.1          # Constant step size of the value update.

epochs = 500         # Number of independent repetitions.
runs = 1000          # Number of steps within each repetition.


for e in range(epochs):
    # Every epoch starts from fresh estimates and fresh pull counts.
    epoch_rewards = 0
    Q = np.zeros(k)
    N = np.zeros(k, dtype=np.int32)
    for t in range(1, runs+1):
        a = EpsilonGreedy(Q, epsilon)
        r = Bandit(a, B)
        # Record the pull so N holds the per-arm selection counts; it was
        # previously allocated but never updated.
        N[a] += 1
        # Constant step-size update of the chosen arm's estimate.
        Q[a] = Q[a] + alpha* (r - Q[a])
        epoch_rewards += r
    total_rewards += epoch_rewards/runs

# Mean per-step reward averaged over all epochs.
print(total_rewards/(epochs))

0.677021745594
