In [6]:
# coding: utf-8
# NOTE(review): leftover debugging breakpoint. It stops in the interactive pdb
# prompt as soon as this module is executed, before any other import runs, and
# execution only continues after the user enters `c`. Remove both lines once
# debugging is finished so the script can run unattended.
import pdb
pdb.set_trace()

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from tqdm import trange

# Select the 'Agg' backend; every figure in this file is written with
# plt.savefig and none is shown interactively.
# NOTE(review): this runs after matplotlib.pyplot has already been imported --
# confirm the installed matplotlib version accepts a backend switch at this point.
matplotlib.use('Agg')


class Bandit:
    # k-armed bandit problem bundled with an epsilon-greedy agent.
    #
    # @k_arm: # of arms
    # @epsilon: probability that act() picks a uniformly random arm
    # @initial: starting value of every action-value estimate
    # @step_size: constant step size used by the two non-sample-average updates
    # @sample_averages: if True, estimates are incremental sample means
    #                   (checked first, so it wins over @bias)
    # @true_reward: offset added to the standard-normal draw of each arm's true value
    # @bias: if True (and @sample_averages is False), the step size is rescaled
    #        by a per-arm trace `o`; see step()
    #
    # reset() must be called before act()/step(): it creates q_true,
    # q_estimation, action_count, best_action, o and beta.
    def __init__(self, k_arm=10, epsilon=0., initial=0., step_size=0.1, sample_averages=False,
                 true_reward=0., bias=False):
        # problem definition
        self.k = k_arm
        self.indices = np.arange(self.k)
        self.true_reward = true_reward

        # agent configuration
        self.epsilon = epsilon
        self.initial = initial
        self.step_size = step_size
        self.sample_averages = sample_averages
        self.bias = bias

        # number of step() calls since the last reset
        self.time = 0

    # draw a fresh problem instance and clear everything the agent has learned
    def reset(self):
        # true value of each arm: standard normal shifted by true_reward
        # (this is the only random draw made here)
        self.q_true = np.random.randn(self.k) + self.true_reward
        self.best_action = np.argmax(self.q_true)

        # agent state: value estimates and per-arm pull counts
        self.q_estimation = np.zeros(self.k) + self.initial
        self.action_count = np.zeros(self.k)

        # per-arm trace and effective step size used by the `bias` update
        self.o = np.zeros(self.k)
        self.beta = np.zeros(self.k)

        self.time = 0

    # epsilon-greedy action selection
    def act(self):
        exploring = np.random.rand() < self.epsilon
        if exploring:
            return np.random.choice(self.indices)

        # greedy choice; ties between equally valued arms are broken at random
        top_value = np.max(self.q_estimation)
        candidates = np.flatnonzero(self.q_estimation == top_value)
        return np.random.choice(candidates)

    # pull arm `action`, update its estimate, and return the observed reward
    # @nonstationary: standard deviation of the zero-mean normal noise that is
    #                 added to the arm's true value to form the reward
    def step(self, action, nonstationary):
        noise = np.random.normal(0, nonstationary)
        reward = self.q_true[action] + noise

        self.time += 1
        self.action_count[action] += 1

        # prediction error with respect to the estimate before this update
        error = reward - self.q_estimation[action]

        if self.sample_averages:
            # incremental sample mean: step size 1 / (pulls of this arm)
            self.q_estimation[action] += error / self.action_count[action]
        elif self.bias:
            # o <- o + step_size * (1 - o); effective step size beta = step_size / o
            # NOTE(review): this looks like the "unbiased constant-step-size
            # trick" from Sutton & Barto, Exercise 2.7 -- confirm.
            self.o[action] += self.step_size * (1 - self.o[action])
            self.beta[action] = self.step_size / self.o[action]
            self.q_estimation[action] += self.beta[action] * error
        else:
            # plain constant step size (exponential recency-weighted average)
            self.q_estimation[action] += self.step_size * error

        return reward


def simulate(runs, time, bandits, nonstationary):
    """Run every bandit for `runs` independent episodes of `time` steps each.

    Each episode starts with `bandit.reset()`; every step calls `bandit.act()`
    followed by `bandit.step(action, nonstationary)`.

    Returns a pair `(mean_best_action_counts, mean_rewards)`. Both arrays have
    shape `(len(bandits), time)` and are averages over the runs: the first is
    the fraction of runs in which the chosen action equalled
    `bandit.best_action` at that step, the second is the mean reward.
    """
    shape = (len(bandits), runs, time)
    rewards = np.zeros(shape)
    best_action_counts = np.zeros(shape)

    for bandit_idx, bandit in enumerate(bandits):
        for run in trange(runs):
            bandit.reset()
            for t in range(time):
                action = bandit.act()
                rewards[bandit_idx, run, t] = bandit.step(action, nonstationary)
                # stored as 1.0 when the best arm was pulled, 0.0 otherwise
                best_action_counts[bandit_idx, run, t] = action == bandit.best_action

    # average over the `runs` axis
    return best_action_counts.mean(axis=1), rewards.mean(axis=1)

# Legend labels shared by the comparison figures below.
_LABEL_SAMPLE_AVERAGE = 'sample_average'
_LABEL_BIAS = 'bias exponential recency-weighted average'
_LABEL_CONSTANT = 'exponential recency-weighted average'


def _save_curves(curves, labels, ylabel, title, path):
    """Plot one curve per label against the step index and save the figure to `path`."""
    for curve, label in zip(curves, labels):
        plt.plot(curve, label=label)
    plt.xlabel('Steps')
    plt.ylabel(ylabel)
    plt.legend()
    plt.title(title)

    plt.savefig(path)
    plt.close()


def _compare_and_plot(bandits, labels, title, first_index, runs, time, nonstationary):
    """Simulate `bandits` and write the reward and optimal-action figures.

    The average-reward plot is saved as ../images/figure_2_7_<first_index>.png
    and the optimal-action plot as ../images/figure_2_7_<first_index + 1>.png.
    `labels` holds one legend entry per bandit, in the same order as `bandits`.
    """
    import os  # local import: keeps this block independent of the file's import header

    reward_path = '../images/figure_2_7_%d.png' % first_index
    optimal_path = '../images/figure_2_7_%d.png' % (first_index + 1)

    # Create the output directory before the (slow) simulation so that a missing
    # directory cannot make savefig fail after all the work has been done.
    os.makedirs(os.path.dirname(reward_path), exist_ok=True)

    best_action_counts, average_rewards = simulate(runs, time, bandits, nonstationary)

    _save_curves(average_rewards, labels, 'Average reward', title, reward_path)
    _save_curves(best_action_counts, labels, '% optimal action', title, optimal_path)


def figure_2_1(runs=2000, time=800):
    """Greedy (epsilon=0), initial=5: `bias` update vs. constant step size; figures 1-2."""
    bandits = [
        Bandit(epsilon=0., bias=True, initial=5),
        Bandit(epsilon=0., bias=False, initial=5),
    ]
    _compare_and_plot(bandits, (_LABEL_BIAS, _LABEL_CONSTANT), 'epsilon=0', 1,
                      runs, time, nonstationary=1)


def figure_2_2(runs=2000, time=800):
    """epsilon=0.1, initial=5: `bias` update vs. constant step size; figures 3-4."""
    bandits = [
        Bandit(epsilon=0.1, bias=True, initial=5),
        Bandit(epsilon=0.1, bias=False, initial=5),
    ]
    _compare_and_plot(bandits, (_LABEL_BIAS, _LABEL_CONSTANT), 'epsilon=0.1', 3,
                      runs, time, nonstationary=1)


def figure_2_3(runs=2000, time=800):
    """Constant step size, initial=5: epsilon=0.1 vs. epsilon=0; figures 5-6."""
    bandits = [
        Bandit(epsilon=0.1, bias=False, initial=5),
        Bandit(epsilon=0., bias=False, initial=5),
    ]
    _compare_and_plot(bandits, ('epsilon=0.1', 'epsilon=0'),
                      'exponential recency-weighted average', 5,
                      runs, time, nonstationary=1)


def figure_2_4(runs=2000, time=50):
    """Greedy (epsilon=0), initial=50: `bias` update vs. constant step size; figures 7-8."""
    bandits = [
        Bandit(epsilon=0., bias=True, initial=50),
        Bandit(epsilon=0., bias=False, initial=50),
    ]
    _compare_and_plot(bandits, (_LABEL_BIAS, _LABEL_CONSTANT), 'initial=50', 7,
                      runs, time, nonstationary=1)


def figure_2_5(runs=2000, time=300):
    """Greedy (epsilon=0), initial=0, noise std 2: all three update rules; figures 9-10."""
    bandits = [
        Bandit(epsilon=0., sample_averages=True, initial=0),
        Bandit(epsilon=0., bias=True, initial=0),
        Bandit(epsilon=0., bias=False, initial=0),
    ]
    _compare_and_plot(bandits, (_LABEL_SAMPLE_AVERAGE, _LABEL_BIAS, _LABEL_CONSTANT),
                      'nonstationary=2, epsilon=0', 9,
                      runs, time, nonstationary=2)


def figure_2_6(runs=2000, time=1000):
    """epsilon=0.1, initial=0, noise std 2: all three update rules; figures 11-12."""
    bandits = [
        Bandit(epsilon=0.1, sample_averages=True, initial=0),
        Bandit(epsilon=0.1, bias=True, initial=0),
        Bandit(epsilon=0.1, bias=False, initial=0),
    ]
    _compare_and_plot(bandits, (_LABEL_SAMPLE_AVERAGE, _LABEL_BIAS, _LABEL_CONSTANT),
                      'nonstationary=2, epsilon=0.1', 11,
                      runs, time, nonstationary=2)
    

    
    
if __name__ == '__main__':
    # Generate every figure, in order (figure_2_7_1.png ... figure_2_7_12.png).
    for make_figure in (figure_2_1, figure_2_2, figure_2_3,
                        figure_2_4, figure_2_5, figure_2_6):
        make_figure()

--Return--
> <ipython-input-6-638e69289976>(3)<module>()->None
-> pdb.set_trace()
(Pdb) c


100%|██████████| 2000/2000 [00:27<00:00, 67.39it/s]
100%|██████████| 2000/2000 [00:25<00:00, 78.23it/s]
100%|██████████| 2000/2000 [00:28<00:00, 69.24it/s]
100%|██████████| 2000/2000 [00:24<00:00, 83.30it/s]
100%|██████████| 2000/2000 [00:24<00:00, 79.28it/s]
100%|██████████| 2000/2000 [00:26<00:00, 76.90it/s]
100%|██████████| 2000/2000 [00:01<00:00, 1059.05it/s]
100%|██████████| 2000/2000 [00:01<00:00, 1220.43it/s]
100%|██████████| 2000/2000 [00:09<00:00, 202.47it/s]
100%|██████████| 2000/2000 [00:10<00:00, 184.60it/s]
100%|██████████| 2000/2000 [00:09<00:00, 205.14it/s]
100%|██████████| 2000/2000 [00:31<00:00, 62.88it/s]
100%|██████████| 2000/2000 [00:34<00:00, 57.98it/s]
100%|██████████| 2000/2000 [00:29<00:00, 67.99it/s]
