# K-armed Bandit

In [None]:
import numpy as np
import pandas as pd
import gym
from scipy.stats import norm
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

# K-armed bandit environment class

In [None]:
class KArmedBandit(gym.Env):
    """K-armed bandit environment with normally distributed rewards.

    Every arm pays out a reward drawn from its own normal distribution.
    The arm means are drawn uniformly from ``[m_low, m_high]`` and the
    standard deviations uniformly from ``[s_low, s_high]``. These draws
    use a private random generator seeded with ``arm_seed``, so two
    environments built with the same arguments have identical arms and
    building an environment does not disturb the global NumPy generator.

    Args:
        n_arms: Number of arms, i.e. the size of the discrete action space.
        arm_seed: Seed for the generator that draws the arm parameters.
            Defaults to 42, the value that used to be hard-coded.
    """

    def __init__(self, n_arms, arm_seed=42):
        super().__init__()
        self.n_arms = n_arms
        self.arm_seed = arm_seed
        # Ranges from which the arm means and standard deviations are drawn.
        self.m_low, self.m_high = 0, 5
        self.s_low, self.s_high = 0.1, 0.2

        self.distributions, self.means, self.stds = self.get_distributions()
        self.action_space = gym.spaces.Discrete(n_arms)
        self.observation_space = gym.spaces.Discrete(1)  # there's only one state

    def env_start(self, seed):
        """Seed the reward noise and return a starting observation.

        Args:
            seed: Seed for the global NumPy generator, which the reward
                samplers in ``self.distributions`` draw from.

        Returns:
            A sample from ``self.observation_space``.
        """
        np.random.seed(seed)  # set random seed for reward generation process
        return self.observation_space.sample()  # select random sample for start

    def step(self, action):
        """Pull arm ``action`` and return ``(observation, reward, terminal, info)``.

        The observation is always ``None``, ``terminal`` is always ``False``
        and ``info`` is an empty dict; only the reward carries information.
        """
        reward = self.distributions[action]()
        observation, terminal, info = None, False, dict()
        return observation, reward, terminal, info

    def reset(self):
        """Rebuild the arm distributions from ``arm_seed``. Returns nothing."""
        self.distributions, self.means, self.stds = self.get_distributions()

    def render(self):
        """Plot the probability density of every arm's reward distribution."""
        # Pad the x-range by two of the widest standard deviations on each side.
        x = np.linspace(self.m_low - 2*self.s_high, self.m_high + 2*self.s_high, 500)
        fig, ax = plt.subplots(figsize=(10, 5))
        for m, s in zip(self.means, self.stds):
            ax.plot(x, norm.pdf(x, m, s), label="mean: {:.2f}, std: {:.2f}".format(m, s))
        ax.legend()

    def get_distributions(self):
        """Draw the arm parameters and build one reward sampler per arm.

        Returns:
            A tuple ``(distributions, means, stds)``. ``distributions`` is a
            list of zero-argument callables, one per arm, each returning a
            single reward; ``means`` and ``stds`` are arrays of length
            ``n_arms`` holding the parameters of those distributions.
        """
        # A private generator keeps the arms reproducible without reseeding
        # the global NumPy generator, which would otherwise rewind the
        # reward noise stream whenever an environment is created or reset.
        rng = np.random.RandomState(self.arm_seed)
        means = rng.uniform(low=self.m_low, high=self.m_high, size=self.n_arms)
        stds = rng.uniform(low=self.s_low, high=self.s_high, size=self.n_arms)
        # Default arguments bind each arm's own parameters to its lambda.
        # Rewards are drawn from the global generator seeded in env_start.
        distributions = [lambda mi=mi, si=si: np.random.normal(mi, si) for mi, si in zip(means, stds)]
        return distributions, means, stds

# Agent class

In [None]:
class Agent:
    """Epsilon-greedy agent for the K-armed bandit environment.

    The agent keeps one action-value estimate per arm and updates the
    estimate of the arm it pulled last with a constant step size. Create
    the agent, then call ``agent_init`` with the configuration before
    using any of the other methods.
    """

    def __init__(self):
        self.last_action = None
        # ``n_actions`` is the name every method uses. ``num_actions`` is
        # kept as a synchronised alias because it was the attribute
        # originally declared here.
        self.n_actions = None
        self.num_actions = None
        self.q_values = None
        self.step_size = None
        self.epsilon = None
        self.initial_value = 0.0

    def agent_init(self, agent_setup):
        """Configure the agent and reset its action-value estimates.

        Args:
            agent_setup: Mapping with the keys ``'n_actions'`` (number of
                arms), ``'initial_value'`` (starting estimate for every
                arm), ``'step_size'`` (learning rate of the update) and
                ``'epsilon'`` (probability of picking a random arm).
        """
        self.n_actions = self.num_actions = agent_setup['n_actions']
        self.initial_value = agent_setup["initial_value"]
        # dtype=float keeps the estimates floating point even when the
        # initial value is given as an int.
        self.q_values = np.full(self.n_actions, self.initial_value, dtype=float)
        self.step_size = agent_setup['step_size']
        self.epsilon = agent_setup['epsilon']

        self.last_action = 0

    @staticmethod
    def argmax(q_values):
        """Return the index of the largest value, breaking ties at random."""
        max_value = np.max(q_values)
        max_indices = np.where(q_values == max_value)[0]
        return np.random.choice(max_indices)

    def choose_action(self, observation):
        """Pick an arm epsilon-greedily; ``observation`` is not used."""
        if np.random.uniform(0, 1) < self.epsilon:
            # Explore: uniformly random arm.
            action = np.random.choice(self.n_actions)
        else:
            # Exploit: arm with the highest current estimate.
            action = self.argmax(self.q_values)
        return action

    def _update_last_action_value(self, reward):
        """Move the estimate of the last action towards ``reward``."""
        error = reward - self.q_values[self.last_action]
        self.q_values[self.last_action] += self.step_size * error

    def agent_start(self, observation):
        """Choose, remember and return the first action of a run."""
        self.last_action = self.choose_action(observation)
        return self.last_action

    def agent_step(self, reward, observation):
        """Learn from ``reward``, then choose and return the next action."""
        self._update_last_action_value(reward)
        self.last_action = self.choose_action(observation)
        return self.last_action

    def agent_end(self, reward):
        """Learn from the final ``reward`` without choosing a new action."""
        self._update_last_action_value(reward)

    def agent_cleanup(self):
        """Do nothing; placeholder hook."""
        pass

    def agent_message(self, message):
        """Do nothing; placeholder hook."""
        pass

# Run experiment function

In [None]:
def run_experiment(env_init, agent_init, n_episodes=20, steps_per_episode=2000):
    """Run independent episodes of a fresh agent on a fresh environment.

    Each episode builds a new ``KArmedBandit`` and a new ``Agent``, seeds
    the environment's reward noise with the episode index, and lets the
    agent pull exactly ``steps_per_episode`` arms. Every pull's reward is
    recorded, including the very first one.

    Args:
        env_init: Keyword arguments for the ``KArmedBandit`` constructor.
        agent_init: Configuration mapping passed to ``Agent.agent_init``.
        n_episodes: Number of independent episodes to run.
        steps_per_episode: Number of arm pulls per episode.

    Returns:
        A tuple ``(rewards, q_values)``. ``rewards`` is a DataFrame with
        one row per step and one column per episode; ``q_values`` is the
        agents' final action-value estimates averaged over the episodes.
    """
    all_rewards = []
    all_q_value_estimates = []
    for e in tqdm(range(n_episodes)):
        env = KArmedBandit(**env_init)
        observation = env.env_start(e)
        agent = Agent()
        agent.agent_init(agent_init)

        rewards = []
        action = agent.agent_start(observation)
        final_step = steps_per_episode - 1
        for step in range(steps_per_episode):
            observation, reward, terminal, info = env.step(action)
            rewards.append(reward)
            if step < final_step:
                # Learn from this reward and choose the next arm.
                action = agent.agent_step(reward, observation)
            else:
                # Last pull: learn from the reward, no further action needed.
                agent.agent_end(reward)
        all_rewards.append(rewards)
        all_q_value_estimates.append(agent.q_values)

    return pd.DataFrame(np.array(all_rewards).T), np.mean(np.array(all_q_value_estimates), axis=0)

# Experiment

Let's run several experiments with different hyperparameters and compare the results. The experiments are:

1. optimistic initial value (10), mostly exploiting (epsilon=0.01), slow learner (step_size=0.01)
2. optimistic initial value (10), more exploration (epsilon=0.1), fast learner (step_size=0.1)
3. no optimistic initial value (0), mostly exploiting (epsilon=0.01), fast learner (step_size=0.1)
4. no optimistic initial value (0), more exploration (epsilon=0.1), fast learner (step_size=0.1)

In [None]:
# Build a reference 4-armed environment and plot its reward distributions.
n_arms = 4
env_setup = dict(n_arms=n_arms)
env = KArmedBandit(**env_setup)
env.render()

In [None]:
# One agent configuration per experiment, in the order listed above.
agent_setups = [
    {'n_actions': n_arms, 'initial_value': 10, 'step_size': 0.01, 'epsilon': 0.01},
    {'n_actions': n_arms, 'initial_value': 10, 'step_size': 0.1, 'epsilon': 0.1},
    {'n_actions': n_arms, 'initial_value': 0, 'step_size': 0.1, 'epsilon': 0.01},
    {'n_actions': n_arms, 'initial_value': 0, 'step_size': 0.1, 'epsilon': 0.1},
]

results = []  # reward history of each experiment
q_value_estimates = []  # averaged final q-value estimates of each experiment
for agent_setup in agent_setups:
    history, estimates = run_experiment(env_setup, agent_setup)
    results.append(history)
    q_value_estimates.append(estimates)

# agentN holds the averaged q-value estimates of experiment N.
agent1, agent2, agent3, agent4 = q_value_estimates

# Visualize results

In [None]:
def aggregate_result(df_rewards, window=50):
    """Smooth the reward histories and summarise them across columns.

    Each column of ``df_rewards`` is smoothed with a rolling mean over
    ``window`` rows. The smoothed columns are then reduced row by row to
    their mean and standard deviation. The first ``window - 1`` rows of
    every returned series are NaN because the rolling window is not yet
    full there.

    Args:
        df_rewards: DataFrame of rewards with one column per run.
        window: Number of rows in the rolling mean. Defaults to 50.

    Returns:
        A tuple ``(average_reward, minus_spread, plus_spread)`` of Series:
        the row-wise mean of the smoothed rewards, and that mean minus and
        plus one row-wise standard deviation.
    """
    # Compute the rolling mean once; both statistics are derived from it.
    smoothed = df_rewards.rolling(window=window).mean()
    average_reward = smoothed.mean(axis=1)
    spread = smoothed.std(axis=1)
    plus_spread = average_reward + spread
    minus_spread = average_reward - spread
    return average_reward, minus_spread, plus_spread


def visualize_performances(*dfs):
    """Plot one smoothed average-reward curve per reward DataFrame.

    The curves are labelled "Agent 1", "Agent 2", ... in the order the
    DataFrames are passed. Only the average curve from
    ``aggregate_result`` is drawn; the spread band it also returns is not.
    """
    figure, axes = plt.subplots(figsize=(15, 7))

    agent_number = 0
    for rewards_frame in dfs:
        agent_number += 1
        mean_curve, _lower_band, _upper_band = aggregate_result(rewards_frame)
        axes.plot(mean_curve, label="Agent {}".format(agent_number))

    axes.legend(loc=4)
    axes.set_xlabel("Step", fontsize=14)
    axes.set_ylabel("Average Reward", fontsize=14)

In [None]:
# Plot the smoothed average-reward curve of every experiment, in the order they were run.
visualize_performances(*results)

In [None]:
def print_q_value_estimates(agent_estimates, env):
    """Print the environment's arm means next to the agent's estimates.

    Both sequences are rounded to three decimals, element by element,
    before printing.

    Args:
        agent_estimates: Iterable of estimated action values, one per arm.
        env: Object exposing the true arm means as ``env.means``.
    """
    def rounded(values):
        return [round(value, 3) for value in values]

    true_means = rounded(env.means)
    estimates = rounded(agent_estimates)
    print("Environment mean rewards:\t{}".format(true_means))
    print("Agent estimated q-values:\t{}".format(estimates))

In [None]:
# Experiment 1: mostly greedy (epsilon=0.01), slow learning (step_size=0.01),
# optimistic initial values (10). `agent1` holds the q-value estimates
# averaged over all runs of that experiment, not an Agent object.
print_q_value_estimates(agent1, env)

In [None]:
# Experiment 2: more exploration (epsilon=0.1), fast learning (step_size=0.1),
# optimistic initial values (10). `agent2` holds the q-value estimates
# averaged over all runs of that experiment, not an Agent object.
print_q_value_estimates(agent2, env)

In [None]:
# Experiment 3: mostly greedy (epsilon=0.01), fast learning (step_size=0.1),
# no optimistic initial values (0). `agent3` holds the q-value estimates
# averaged over all runs of that experiment, not an Agent object.
print_q_value_estimates(agent3, env)

In [None]:
# Experiment 4: more exploration (epsilon=0.1), fast learning (step_size=0.1),
# no optimistic initial values (0). `agent4` holds the q-value estimates
# averaged over all runs of that experiment, not an Agent object.
print_q_value_estimates(agent4, env)