In [2]:
import numpy as np
import matplotlib.pyplot as plt
import random

In [156]:

class multiArmBandits:
    """Epsilon-greedy agent playing an N-armed Bernoulli bandit.

    Every arm pays a reward of 1 with its own fixed probability (drawn at
    construction time) and 0 otherwise. The agent keeps a running
    sample-average estimate ``Q`` of each arm's payoff and, on each step,
    either explores a uniformly random arm or exploits the arm with the
    highest current estimate.

    Attributes:
        N: number of arms.
        episodes: number of pulls performed by ``main``.
        epsilon: probability of taking an exploratory (random) action.
        bandits_probs: list of per-arm success probabilities.
        k: integer array, how many times each arm has been pulled.
        Q: float array, current estimated value of each arm.
    """

    def __init__(self, N, episodes, epsilon=0.1):
        """Create the bandit problem and a zero-initialised agent.

        Args:
            N: total number of bandits (arms).
            episodes: number of action/reward/update steps run by ``main``.
            epsilon: exploration rate; the default 0.1 means roughly 10%
                exploration and 90% exploitation.
        """
        self.N = N  # total bandits
        self.episodes = episodes
        self.epsilon = epsilon  # epsilon greedy: explore with this probability
        # Reuse the generator method rather than duplicating its body here.
        self.bandits_probs = self.generate_bandits_prob()
        # Builtin int/float are used as dtypes: the np.int / np.float aliases
        # were removed from NumPy and raise AttributeError on current releases.
        self.k = np.zeros(self.N, dtype=int)  # number of times action was chosen
        self.Q = np.zeros(self.N, dtype=float)  # estimated value

    def generate_bandits_prob(self):
        """Return a fresh list of ``N`` random success probabilities.

        Each probability is a uniform draw from ``random.random()`` rounded
        to two decimal places.
        """
        bandits = [round(random.random(), 2) for _ in range(self.N)]
        return bandits

    def get_reward(self, action):
        """Pull arm ``action`` and return its reward.

        Returns:
            1 if a uniform draw falls below the arm's success probability,
            otherwise 0.
        """
        rand = random.random()
        reward = 1 if (rand < self.bandits_probs[action]) else 0
        return reward

    def choose_action(self, force_explore=False):
        """Select an arm using the epsilon-greedy rule.

        Args:
            force_explore: when True, always return a uniformly random arm
                regardless of ``epsilon``.

        Returns:
            Index of the chosen arm in ``[0, N - 1]``.
        """
        rand = random.random()
        # With probability epsilon (or when forced) we explore.
        if (rand < self.epsilon) or force_explore:
            # random.randint is inclusive on both ends, hence N - 1.
            action_explore = random.randint(0, self.N - 1)
            return action_explore
        else:
            # Greedy action: highest estimated value, ties broken uniformly
            # at random so that early all-zero estimates do not favour arm 0.
            action_greedy = np.random.choice(np.flatnonzero(self.Q == self.Q.max()))
            return action_greedy

    def update_Q(self, action, reward):
        """Fold ``reward`` into the running average estimate for ``action``.

        Uses the incremental mean update ``Q += (reward - Q) / k`` after
        incrementing the pull count ``k`` for that arm.
        """
        self.k[action] += 1
        self.Q[action] += ((reward - self.Q[action]) / self.k[action])

    def main(self):
        """Run ``episodes`` steps of the bandit loop and print a summary.

        Each step:
            1. Choose action based on epsilon greedy
            2. Get reward based on that action
            3. Update action value estimate

        The final estimates, pull counts, and the full action/reward traces
        are printed; nothing is returned.
        """
        track_actions = []
        track_rewards = []

        for _ in range(self.episodes):
            action = self.choose_action()
            reward = self.get_reward(action)
            self.update_Q(action, reward)
            track_actions.append(action)
            track_rewards.append(reward)

        # Single-argument print(...) calls produce identical text on
        # Python 2 and Python 3 (label, two spaces, value).
        print("update Q")
        print(self.Q)
        print("action update  {}".format(self.k))
        print("track actions  {}".format(track_actions))
        print("track rewards  {}".format(track_rewards))
            

# Experiment configuration: number of arms and number of pulls.
N = 10
episodes = 100
# Pass N (defined above); the previous argument `k` was never defined in this
# notebook, so the cell raised NameError after a kernel restart.
k_armBandits = multiArmBandits(N, episodes)
k_armBandits.main()

update Q
[0.5        0.         0.         0.94285714 0.         0.
 0.6        0.         0.         0.25      ]
action update  [ 2  2  2 70  1  1 15  2  1  4]
track actions  [7, 4, 9, 6, 6, 6, 6, 6, 9, 6, 6, 6, 6, 6, 6, 6, 6, 6, 3, 3, 3, 9, 3, 3, 3, 5, 3, 7, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 3, 3, 3, 3, 3, 2, 3, 3, 3, 9, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 6, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 1, 3, 3, 3, 3, 3, 8, 3, 3, 3, 3, 1, 3, 3, 3]
track rewards  [0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1]
