In [None]:
# coding: utf-8
# NOTE(review): the next two statements stop execution in the interactive
# debugger every time this code is run. This looks like a leftover debugging
# aid -- confirm with the author before relying on unattended runs.
import pdb
pdb.set_trace()

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from tqdm import trange

# Select the non-interactive Agg backend; the figures below are only written
# to files with savefig and never shown on screen.
# NOTE(review): this call comes after pyplot has already been imported; older
# matplotlib releases expected the backend to be chosen before that import --
# TODO confirm against the matplotlib version in use.
matplotlib.use('Agg')


class Bandit:
    """k-armed bandit together with an epsilon-greedy action-value learner.

    Constructor arguments:
        k_arm: number of arms.
        epsilon: probability of picking a uniformly random arm in ``act``.
        initial: starting value of every action-value estimate.
        step_size: constant step size, used when ``sample_averages`` is False.
        sample_averages: if True, estimates are updated with a 1/n step
            (incremental sample average) instead of ``step_size``.
        true_reward: offset added to every arm's true value drawn in ``reset``.

    ``reset`` must be called before ``act`` or ``step``: it is the method that
    creates ``q_true``, ``q_estimation``, ``action_count`` and ``best_action``.
    """

    def __init__(self, k_arm=10, epsilon=0., initial=0., step_size=0.1, sample_averages=False,
                 true_reward=0.):
        # Problem definition.
        self.k = k_arm
        self.indices = np.arange(self.k)
        self.true_reward = true_reward
        # Learner configuration.
        self.epsilon = epsilon
        self.initial = initial
        self.step_size = step_size
        self.sample_averages = sample_averages
        # Number of steps taken since the last reset.
        self.time = 0

    def reset(self):
        """Draw a fresh set of true arm values and clear all learned state."""
        arm_count = self.k

        # True value of each arm: standard normal sample shifted by true_reward.
        self.q_true = np.random.randn(arm_count) + self.true_reward
        self.best_action = np.argmax(self.q_true)

        # Current estimate and pull count for each arm.
        self.q_estimation = np.zeros(arm_count) + self.initial
        self.action_count = np.zeros(arm_count)

        self.time = 0

    def act(self):
        """Return the index of the arm to pull next (epsilon-greedy)."""
        explore = np.random.rand() < self.epsilon
        if explore:
            return np.random.choice(self.indices)

        # When two or more arms share the highest estimate, pick one of them
        # uniformly at random.
        top_estimate = np.max(self.q_estimation)
        greedy_arms = np.flatnonzero(self.q_estimation == top_estimate)
        return np.random.choice(greedy_arms)

    def step(self, action, nonstationary):
        """Pull ``action``, update its estimate and return the sampled reward.

        The reward is the arm's true value plus zero-mean Gaussian noise whose
        standard deviation is ``nonstationary``.
        """
        noise = np.random.normal(0, nonstationary)
        reward = self.q_true[action] + noise

        self.time += 1
        self.action_count[action] += 1

        error = reward - self.q_estimation[action]
        if not self.sample_averages:
            # Constant step-size update.
            self.q_estimation[action] += self.step_size * error
        else:
            # Incremental sample-average update.
            self.q_estimation[action] += error / self.action_count[action]
        return reward


def simulate(runs, time, bandits, nonstationary):
    """Run every bandit for ``runs`` independent runs of ``time`` steps each.

    Each run starts with ``bandit.reset()``; ``nonstationary`` is forwarded
    unchanged to ``bandit.step`` on every step.

    Returns a pair ``(mean_best_action_counts, mean_rewards)``, each of shape
    ``(len(bandits), time)``: the per-step average over runs of the
    "best action was chosen" indicator and of the reward received.
    """
    shape = (len(bandits), runs, time)
    rewards = np.zeros(shape)
    best_action_counts = np.zeros(shape)

    for bandit_idx, bandit in enumerate(bandits):
        for run_idx in trange(runs):
            bandit.reset()
            for step_idx in range(time):
                cell = (bandit_idx, run_idx, step_idx)
                chosen = bandit.act()
                rewards[cell] = bandit.step(chosen, nonstationary)
                # Stored as 1.0 when the best arm was picked, 0.0 otherwise.
                best_action_counts[cell] = chosen == bandit.best_action

    # Average across the runs axis.
    return best_action_counts.mean(axis=1), rewards.mean(axis=1)
    
    
def figure_2_3(runs=2000, time=1000):
    """Compare optimistic-greedy and epsilon-greedy bandits and save two plots.

    Two constant-step-size bandits are simulated: one greedy with initial
    estimates of 5, one epsilon-greedy (epsilon=0.1) with initial estimates
    of 0. The average reward per step is written to
    ``../images/figure_2_11.png`` and the fraction of runs choosing the best
    action per step to ``../images/figure_2_12.png``.

    Args:
        runs: number of independent runs averaged per bandit.
        time: number of steps per run; also shown in the plot titles.
    """
    bandits = [
        Bandit(epsilon=0., sample_averages=False, initial=5),
        Bandit(epsilon=0.1, sample_averages=False, initial=0),
    ]
    # One legend label per bandit, shared by both figures.
    labels = ['Q1=5,epsilon=0', 'Q1=0,epsilon=0.1']

    # Despite its name, Bandit.step uses this value as the standard deviation
    # of the Gaussian noise added to each reward.
    nonstationary = 1
    best_action_counts, average_rewards = simulate(runs, time, bandits, nonstationary)

    # Reflect the actual number of steps instead of a hard-coded 1000.
    title = 'time={}'.format(time)

    figures = (
        (average_rewards, 'Average reward', '../images/figure_2_11.png'),
        (best_action_counts, '% optimal action', '../images/figure_2_12.png'),
    )
    for curves, y_label, path in figures:
        for curve, label in zip(curves, labels):
            plt.plot(curve, label=label)
        plt.xlabel('Steps')
        plt.ylabel(y_label)
        plt.legend()
        plt.title(title)

        plt.savefig(path)
        plt.close()
    
    
# Script entry point: run the experiment with its default settings
# (2000 runs of 1000 steps) and write both figures.
if __name__ == '__main__':
    figure_2_3()

--Return--
> <ipython-input-3-0ed7cf725108>(3)<module>()->None
-> pdb.set_trace()
(Pdb) c


100%|█████████▉| 1994/2000 [02:42<00:00, 12.80it/s]


Program interrupted. (Use 'cont' to resume).
> /home/theresa/anaconda2/envs/python3/lib/python3.6/site-packages/numpy/core/fromnumeric.py(73)_wrapreduction()
-> if type(obj) is not mu.ndarray:
