In [None]:
import numpy as np
import pandas as pd
import gym
from scipy.stats import norm
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

In [None]:
class KArmedBandit(gym.Env):
    """Stationary k-armed bandit whose arms pay Gaussian rewards.

    Arm ``i`` pays rewards drawn from ``Normal(means[i], stds[i])``. The arm
    parameters are drawn uniformly from ``[m_low, m_high]`` (means) and
    ``[s_low, s_high]`` (standard deviations) with a fixed seed, so every
    instance with the same ``n_arms`` has the same arms.

    Parameters
    ----------
    n_arms : int
        Number of arms, i.e. the size of the discrete action space.
    """

    # Fixed seed used only for drawing the arm parameters.
    _ARM_SEED = 42

    def __init__(self, n_arms):
        self.n_arms = n_arms
        self.m_low, self.m_high = 0, 5      # range of the arm means
        self.s_low, self.s_high = 0.1, 0.3  # range of the arm standard deviations

        self.distributions, self.means, self.stds = self.get_distributions()
        self.action_space = gym.spaces.Discrete(n_arms)
        self.observation_space = gym.spaces.Discrete(1)  # there's only one state

    def env_start(self, seed):
        """Seed the global NumPy RNG used for rewards and return a start observation.

        Parameters
        ----------
        seed : int
            Seed passed to ``np.random.seed``; it governs the reward stream
            produced by subsequent ``step`` calls.

        Returns
        -------
        A sample from ``observation_space``.
        """
        np.random.seed(seed)  # set random seed for reward generation process
        return self.observation_space.sample()  # select random sample for start

    def step(self, action):
        """Pull arm ``action`` and return ``(observation, reward, terminal, info)``.

        The observation is always ``None``, ``terminal`` is always ``False``
        and ``info`` is an empty dict; only ``reward`` carries information.
        """
        reward = self.distributions[action]()
        observation, terminal, info = None, False, dict()
        return observation, reward, terminal, info

    def reset(self):
        """Re-draw the arm parameters (returns ``None``).

        Because the parameter seed is fixed, this restores the same arms.
        """
        self.distributions, self.means, self.stds = self.get_distributions()

    def render(self):
        """Plot the probability density of every arm's reward distribution."""
        x = np.linspace(self.m_low - 2*self.s_high, self.m_high + 2*self.s_high, 500)
        fig, ax = plt.subplots(figsize=(10, 5))
        for m, s in zip(self.means, self.stds):
            ax.plot(x, norm.pdf(x, m, s), label="mean: {:.2f}, std: {:.2f}".format(m, s))
        ax.set(xlabel="Reward", ylabel="Probability density",
               title="Reward distribution of each arm")
        ax.legend()

    def get_distributions(self):
        """Draw the arm parameters and build one reward sampler per arm.

        Returns
        -------
        distributions : list of callables
            ``distributions[i]()`` draws one reward for arm ``i`` from the
            global NumPy RNG.
        means, stds : numpy.ndarray
            Mean and standard deviation of each arm, each of length ``n_arms``.
        """
        # Use a private generator so that building or resetting the environment
        # does not reseed the global NumPy RNG, which drives reward sampling.
        # RandomState(42) yields the same values as seeding the global generator
        # with 42, so the arms are unchanged.
        rng = np.random.RandomState(self._ARM_SEED)
        means = rng.uniform(low=self.m_low, high=self.m_high, size=self.n_arms)
        stds = rng.uniform(low=self.s_low, high=self.s_high, size=self.n_arms)
        # Bind mi/si as default arguments so each lambda keeps its own arm's
        # parameters instead of the loop's final values.
        distributions = [lambda mi=mi, si=si: np.random.normal(mi, si) for mi, si in zip(means, stds)]
        return distributions, means, stds

In [None]:
class Agent:
    """Epsilon-greedy agent with constant step-size action-value estimates.

    The agent keeps one value estimate per action in ``q_values``. With
    probability ``epsilon`` it picks a uniformly random action, otherwise a
    greedy one (ties broken at random). After each reward it moves the
    estimate of the previously chosen action towards that reward by
    ``step_size``.
    """

    def __init__(self):
        self.last_action = None
        self.n_actions = None  # set by agent_init; name used by all other methods
        self.q_values = None
        self.step_size = None
        self.epsilon = None
        self.initial_value = 0.0

    @property
    def num_actions(self):
        """Backward-compatible alias for ``n_actions``."""
        return self.n_actions

    @num_actions.setter
    def num_actions(self, value):
        self.n_actions = value

    def agent_init(self, agent_setup):
        """Configure the agent from a settings dict.

        Parameters
        ----------
        agent_setup : dict
            Must contain the keys ``'n_actions'``, ``'initial_value'``,
            ``'step_size'`` and ``'epsilon'``.

        Raises
        ------
        KeyError
            If any of the required keys is missing.
        """
        self.n_actions = agent_setup['n_actions']
        self.initial_value = agent_setup["initial_value"]
        # Float dtype is required so the in-place updates below can store
        # fractional estimates even when initial_value is an int.
        self.q_values = np.full(self.n_actions, self.initial_value, dtype=float)
        self.step_size = agent_setup['step_size']
        self.epsilon = agent_setup['epsilon']

        self.last_action = 0

    @staticmethod
    def argmax(q_values):
        """Return the index of a maximal entry, breaking ties uniformly at random."""
        q_values = np.asarray(q_values)
        max_indices = np.where(q_values == np.max(q_values))[0]
        return np.random.choice(max_indices)

    def choose_action(self, observation):
        """Pick an action epsilon-greedily; ``observation`` is not used."""
        if np.random.uniform(0, 1) < self.epsilon:
            return np.random.choice(self.n_actions)  # explore
        return self.argmax(self.q_values)  # exploit

    def _update_last_action_value(self, reward):
        """Move the estimate of ``last_action`` towards ``reward`` by ``step_size``."""
        error = reward - self.q_values[self.last_action]
        self.q_values[self.last_action] += self.step_size * error

    def agent_start(self, observation):
        """Choose, remember and return the first action of a run."""
        self.last_action = self.choose_action(observation)
        return self.last_action

    def agent_step(self, reward, observation):
        """Learn from ``reward`` for the previous action, then choose the next one."""
        self._update_last_action_value(reward)
        self.last_action = self.choose_action(observation)
        return self.last_action

    def agent_end(self, reward):
        """Learn from the final ``reward`` without choosing another action."""
        self._update_last_action_value(reward)

    def agent_cleanup(self):
        """No-op hook."""
        pass

    def agent_message(self, message):
        """No-op hook; ``message`` is ignored."""
        pass

In [None]:
# Number of bandit arms; shared by the environment and the agent settings.
n_arms = 4

# Keyword arguments for the environment constructor.
env_setup = dict(n_arms=n_arms)
# Baseline agent settings: optimistic initial values, small step size, rare exploration.
agent_setup = dict(
    n_actions=n_arms,
    initial_value=10,
    step_size=0.01,
    epsilon=0.01,
)

In [None]:
def run_expriment(agent_init, n_episodes=20, steps_per_episode=2000, env_init=None):
    """Run independent agent/bandit episodes and collect the rewards.

    Each episode builds a fresh ``KArmedBandit`` and a fresh ``Agent``, seeds
    the environment with the episode index, and lets the agent act for
    ``steps_per_episode`` recorded steps.

    Parameters
    ----------
    agent_init : dict
        Settings passed to ``Agent.agent_init``.
    n_episodes : int
        Number of independent episodes (one DataFrame column each).
    steps_per_episode : int
        Number of recorded steps per episode (one DataFrame row each).
    env_init : dict, optional
        Keyword arguments for ``KArmedBandit``. Defaults to the module-level
        ``env_setup`` so existing calls behave as before.

    Returns
    -------
    rewards : pandas.DataFrame
        Recorded rewards with steps as rows and episodes as columns (an empty
        frame when ``n_episodes`` is 0).
    env : KArmedBandit or None
        Environment of the last episode, or ``None`` if no episode was run.

    Notes
    -----
    The reward of the very first action (the one chosen by ``agent_start``)
    is used for learning but is not recorded.
    """
    env_config = env_setup if env_init is None else env_init
    env = None  # stays None when n_episodes == 0 instead of being unbound at return
    all_rewards = []
    for episode in tqdm(range(n_episodes)):
        env = KArmedBandit(**env_config)
        observation = env.env_start(episode)  # episode index doubles as the seed
        agent = Agent()
        agent.agent_init(agent_init)

        action = agent.agent_start(observation)
        observation, reward, terminal, info = env.step(action)

        rewards = []
        for _step in range(steps_per_episode):
            action = agent.agent_step(reward, observation)
            observation, reward, terminal, info = env.step(action)
            rewards.append(reward)
        agent.agent_end(reward)
        all_rewards.append(rewards)
    # Transpose so that rows are steps and columns are episodes.
    return pd.DataFrame(np.array(all_rewards).T), env

In [None]:
# Compare three epsilon-greedy configurations on the same bandit:
#   1) optimistic initial values, small step size, rare exploration
#   2) optimistic initial values, larger step size, more exploration
#   3) zero initial values, larger step size, more exploration
# The returned environments get real names rather than `_`, because assigning
# `_` shadows IPython's last-output variable.
df_rewards1, env1 = run_expriment({'n_actions': n_arms, 'initial_value': 10, 'step_size': 0.01, 'epsilon': 0.01})
df_rewards2, env2 = run_expriment({'n_actions': n_arms, 'initial_value': 10, 'step_size': 0.1, 'epsilon': 0.1})
df_rewards3, env3 = run_expriment({'n_actions': n_arms, 'initial_value': 0, 'step_size': 0.1, 'epsilon': 0.1})
# Show the arm reward distributions of the environment from the last experiment.
env3.render()

In [None]:
def aggregate_result(df_rewards, window=50):
    """Smooth each column with a rolling mean and summarise across columns.

    Parameters
    ----------
    df_rewards : pandas.DataFrame
        Rewards to aggregate; each column is smoothed independently along
        the rows.
    window : int, default 50
        Rolling-window length in rows. The first ``window - 1`` rows of every
        returned series are NaN because the window is not yet full.

    Returns
    -------
    average_reward : pandas.Series
        Row-wise mean of the smoothed columns.
    minus_spread : pandas.Series
        ``average_reward`` minus the row-wise standard deviation of the
        smoothed columns.
    plus_spread : pandas.Series
        ``average_reward`` plus that standard deviation.
    """
    # Compute the rolling mean once; both statistics are derived from it.
    smoothed = df_rewards.rolling(window=window).mean()
    average_reward = smoothed.mean(axis=1)
    spread = smoothed.std(axis=1)
    return average_reward, average_reward - spread, average_reward + spread

In [None]:
def visualize_performances(*dfs):
    """Plot the aggregated reward curve of each DataFrame on one figure.

    Every frame is passed through ``aggregate_result``; its average curve is
    drawn as a line and the surrounding minus/plus spread as a translucent
    band. Frames are numbered from 1 in the legend, in the order given.

    Parameters
    ----------
    *dfs : pandas.DataFrame
        Reward frames accepted by ``aggregate_result``.
    """
    fig, ax = plt.subplots(figsize=(10, 6))

    for i, df in enumerate(dfs, 1):
        average_reward, minus_spread, plus_spread = aggregate_result(df)
        (line,) = ax.plot(average_reward)
        # Reuse the line's colour explicitly so each band always matches its curve.
        ax.fill_between(average_reward.index, minus_spread, plus_spread,
                        color=line.get_color(), alpha=0.5, label=i)
    ax.set(xlabel="Step", ylabel="Smoothed reward (mean +/- 1 std)",
           title="Reward per configuration")
    ax.legend(loc=4)

In [None]:
# Overlay the learning curves of the three experiments on a single figure.
visualize_performances(
    df_rewards1,
    df_rewards2,
    df_rewards3,
)