1. Implement a simulation of the k-armed bandit environment with a variable number of machines k, where pulling machine i yields a reward of 1 with a random probability p_i (and a reward of 0 otherwise). The probabilities should be different each time you instantiate the environment.

In [158]:
import numpy as np
import matplotlib.pyplot as plt

In [159]:
class Bandit:
    """k-armed Bernoulli bandit environment.

    Each arm ``i`` pays a reward of 1 with probability ``probs[i]`` and 0
    otherwise.

    Args:
        num_bandits: Number of arms (k).
        bandit_probs: Optional sequence of per-arm success probabilities,
            indexed by action. When omitted, a fresh set of random
            probabilities is drawn, so every instance created without
            explicit probabilities gets different ones.
    """

    def __init__(self, num_bandits, bandit_probs=None):
        self.k = num_bandits  # Number of arms
        if bandit_probs is None:
            # One uniform draw in [0, 1) per arm.
            bandit_probs = np.random.random(num_bandits)
        self.probs = bandit_probs

    def get_reward(self, action):
        """Pull arm ``action`` and return the reward (1 or 0)."""
        # Compare the raw uniform draw against the arm probability. Rounding
        # the draw first (as done previously) shifts the effective success
        # probability by about -0.005 and lets an arm with p == 1.0 return 0.
        return 1 if np.random.random() < self.probs[action] else 0


In [160]:
class Agent:
    """Epsilon-greedy agent with sample-average action-value estimates.

    Args:
        bandit: Environment object; only its ``k`` attribute (number of
            arms) is read.
        epsilon: Probability of exploring (choosing a uniformly random arm)
            instead of exploiting the current best estimate.
    """

    def __init__(self, bandit, epsilon):
        # epsilon controls the agent type: 0 is purely greedy, larger values
        # explore more often.
        self.epsilon = epsilon
        # Number of times each action was chosen. The builtin ``int`` is used
        # because the ``np.int`` alias was removed from NumPy.
        self.n_actions = np.zeros(bandit.k, dtype=int)
        # Estimated value (running mean reward) of each action.
        self.Q = np.zeros(bandit.k, dtype=float)

    def update_Q(self, action, reward):
        """Fold ``reward`` into the running mean estimate for ``action``.

        Incremental sample-average update, with n the number of times the
        action has been taken including this one:
            Q(a) <- Q(a) + (1 / n) * (reward - Q(a))
        """
        self.n_actions[action] += 1
        step = 1 / self.n_actions[action]
        self.Q[action] += step * (reward - self.Q[action])

    def choose_action(self, bandit, force_explore=False):
        """Return the index of the arm to pull next.

        With probability ``epsilon`` (or always when ``force_explore`` is
        true) a uniformly random arm is returned; otherwise one of the arms
        with the highest current estimate, with ties broken at random.
        """
        # Use the raw uniform draw: rounding it to two decimals would make
        # the effective exploration rate slightly smaller than epsilon.
        if force_explore or np.random.random() < self.epsilon:
            return np.random.randint(bandit.k)
        best_actions = np.flatnonzero(self.Q == self.Q.max())
        return np.random.choice(best_actions)

In [161]:
def experiment(agent, bandit, N_episodes):
    """Run ``agent`` against ``bandit`` for ``N_episodes`` pulls.

    On every step the agent picks an action, the bandit returns the reward
    for it, and the agent updates its estimates with that (action, reward)
    pair.

    Returns:
        A pair of arrays ``(actions, rewards)`` holding, in step order, the
        action chosen and the reward received.
    """
    trace = []
    for _ in range(N_episodes):
        chosen = agent.choose_action(bandit)
        payoff = bandit.get_reward(chosen)
        agent.update_Q(chosen, payoff)
        trace.append((chosen, payoff))

    actions = np.array([step[0] for step in trace])
    rewards = np.array([step[1] for step in trace])
    return actions, rewards

In [164]:
# ---------------------
#  k-bandit simulation
# ---------------------
# Runs `n_experiments` independent epsilon-greedy agents against the same set
# of arm probabilities and aggregates, per episode, the rewards obtained and
# the arms chosen.

num_bandits = 10     # number of arms
epsilon = 0.1        # exploration probability of the agent
n_episodes = 100     # pulls per experiment
n_experiments = 20   # independent runs to aggregate

print(' Running bandits experiment with {} bandits and agent with epsilon of {}'.format(num_bandits, epsilon))

# Holds the per-episode reward SUM while the experiments run; it is divided
# by the number of experiments after the loop to become the average.
reward_history_avg = np.zeros(n_episodes)
# action_history_sum[t, a] counts how many experiments chose arm a at episode t.
action_history_sum = np.zeros((n_episodes, num_bandits))

# One success probability per arm, rounded to two decimals. Converted to plain
# Python floats so the printout below stays readable on every NumPy version.
bandit_probs = np.round(np.random.rand(num_bandits), 2).tolist()

best_bandit = np.argmax(bandit_probs)
print(' Probabilities calculated', bandit_probs)
print(' Best bandit is ', best_bandit+1)

episode_indices = np.arange(n_episodes)

for ex in range(n_experiments):
    print(' Experiment: ', ex)
    bandit = Bandit(num_bandits, bandit_probs)  # Same probabilities every run
    agent = Agent(bandit, epsilon)  # Fresh agent: estimates start from zero

    action_history, reward_history = experiment(agent, bandit, n_episodes)

    # Accumulate this experiment's rewards (running sum at this point).
    reward_history_avg += reward_history
    print(' Reward history avg: ', reward_history_avg)

    # Count the arm chosen at each episode. Every row index occurs exactly
    # once, so a single fancy-indexed increment is equivalent to the loop.
    action_history_sum[episode_indices, action_history] += 1

# Turn the accumulated sum into the per-episode average. True division by the
# int is already floating point; the removed `np.float` alias is not needed.
reward_history_avg /= n_experiments

print(' reward history avg = {}'.format(reward_history_avg))


print(' action history avg = {}'.format(action_history_sum))
# #------------------------------
# # Plot reward history
# #------------------------------
# plt.figure(figsize=(18, 12))
# plt.plot(reward_history_avg)
# plt.xlabel('Episode number')
# plt.ylabel('Rewards collected'.format(n_experiments))
# plt.title('Bandit reward history averaged over {} experiments(epsilon = {})'.format(n_experiments, epsilon))
# ax = plt.gca()
# ax.set_xscale('log', nonposx='clip')
# plt.xlim([1, n_episodes])
# plt.show()

# # =========================
# # Plot action history results
# # =========================
# plt.figure(figsize=(18, 12))
# for i in range(num_bandits):
#     action_history_sum_plot = 100 * action_history_sum[:,i] / n_experiments
#     plt.plot(list(np.array(range(len(action_history_sum_plot)))+1),
#                  action_history_sum_plot,
#                  linewidth=5.0,
#                  label="Bandit #{}".format(i+1))
# plt.title("Bandit action history averaged over {} experiments (epsilon = {})".format(n_experiments, epsilon), fontsize=26)
# plt.xlabel("Episode Number", fontsize=26)
# plt.ylabel("Bandit Action Choices (%)", fontsize=26)
# leg = plt.legend(loc='upper left', shadow=True, fontsize=26)
# ax = plt.gca()
# ax.set_xscale("log", nonposx='clip')
# plt.xlim([1, n_episodes])
# plt.ylim([0, 100])
# plt.xticks(fontsize=24)
# plt.yticks(fontsize=24)
# for legobj in leg.legendHandles:
#     legobj.set_linewidth(16.0)
# plt.show()


 Running bandits experiment with 10 bandits and agent with epsilon of 0.1
 Probabilities calculated [0.98, 0.38, 0.75, 0.37, 0.85, 0.58, 0.75, 0.19, 0.39, 0.59]
 Best bandit is  1
 Experiment:  0
 Reward history avg:  [1. 0. 0. 1. 0. 1. 1. 0. 0. 1. 0. 1. 1. 1. 0. 0. 1. 0. 1. 0. 1. 1. 0. 0.
 0. 1. 1. 0. 1. 1. 1. 1. 0. 0. 1. 1. 0. 1. 0. 1. 1. 1. 1. 1. 1. 1. 0. 1.
 1. 1. 1. 1. 1. 0. 0. 1. 1. 0. 0. 1. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 0. 1. 0. 0. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0.
 1. 1. 1. 1.]
 Experiment:  1
 Reward history avg:  [1. 0. 1. 2. 1. 1. 2. 1. 1. 1. 1. 2. 1. 2. 0. 1. 2. 1. 1. 1. 1. 2. 0. 1.
 1. 1. 2. 1. 2. 2. 2. 2. 1. 1. 2. 2. 1. 2. 1. 2. 2. 2. 2. 2. 2. 2. 1. 2.
 2. 2. 2. 1. 2. 1. 1. 2. 1. 1. 1. 2. 0. 1. 1. 1. 2. 2. 2. 2. 2. 2. 2. 2.
 2. 2. 2. 2. 1. 1. 2. 1. 1. 2. 2. 2. 2. 0. 2. 2. 2. 2. 2. 2. 2. 2. 2. 1.
 2. 2. 2. 2.]
 Experiment:  2
 Reward history avg:  [1. 0. 2. 3. 2. 2. 3. 2. 2. 2. 2. 3. 2. 3. 1. 2. 2. 2. 2. 2. 2. 3. 1. 2.
 2. 2. 3. 2. 3. 3. 3.

In [165]:
agent.Q

array([0.97333333, 0.        , 0.        , 0.        , 0.85714286,
       0.5       , 0.33333333, 0.        , 0.        , 0.5       ])