In [5]:
import copy
import matplotlib.pyplot as plt
import numpy as np

plt.style.use("seaborn-v0_8")

In [6]:
# Copying eps-greedy code from the previous notebook

class Environment:
    def __init__(self, thetas: [float]):
        self.thetas = thetas


class State:
    def __init__(self, n_arms: int):
        self.n_arms = n_arms
        self.counts = np.zeros(n_arms)
        self.values = np.zeros(n_arms)


class EpsilonGreedy:
    """Epsilon-greedy policy"""

    @classmethod
    def get_arm(cls, state: State, eps: float = 0.1) -> int:
        # Decide to either explore or exploit
        if np.random.random() < eps:
            arm = np.random.randint(state.n_arms)
        else:
            arm = np.argmax(state.values)
        return arm


class Oracle:
    """Oracle policy"""

    @classmethod
    def get_arm(cls, env: Environment) -> int:
        return np.argmax(env.thetas)


def react(env: Environment, arm: int) -> float:
    if np.random.random() < env.thetas[arm]:
        return 1.0
    else:
        return 0


def update(state: State, arm: int, reward: float) -> State:
    s = copy.deepcopy(state)
    s.counts[arm] += 1
    s.values[arm] = ((s.counts[arm] - 1) * s.values[arm] + reward) / s.counts[arm]
    return s

# Q1. Implement softmax policy

In [7]:
class Softmax:
    @classmethod
    def get_arm(cls, state: State, tau: float = 0.10) -> int:
        logit = state.values / tau
        raise NotImplementedError("Implement this line")  # p = ...
        arm = np.random.choice(state.n_arms, p=p)
        return arm

# Q2. Evaluate the Regrets

In [8]:
import matplotlib.pyplot as plt

thetas = [0.80, 0.50, 0.35, 0.60]
env = Environment(thetas)
num_trials = 1000

# Oracle
opt_rewards = []
for i in range(num_trials):
    arm = Oracle.get_arm(env)
    reward = react(env, arm)
    opt_rewards.append(reward)

# Eps-greedy.
eg_state = State(len(thetas))
eg_rewards = []
for i in range(num_trials):
    arm = EpsilonGreedy.get_arm(eg_state)
    reward = react(env, arm)
    eg_rewards.append(reward)
    eg_state = update(eg_state, arm, reward)
    raise NotImplementedError("Keep track of rewards here")

# Softmax
sm_state = State(len(thetas))
sm_rewards = []
for i in range(num_trials):
    arm = Softmax.get_arm(sm_state)
    reward = react(env, arm)
    sm_rewards.append(reward)
    sm_state = update(sm_state, arm, reward)
    raise NotImplementedError("Keep track of rewards here")

# Calculate regrets
raise NotImplementedError("Calculate regrets")


NotImplementedError: Keep track of rewards here