In [1]:
import numpy as np
from matplotlib import pyplot as plt

In [2]:
class BernoulliBandit:
    """Multi-armed bandit whose arms pay a Bernoulli (0 or 1) reward.

    Args:
        K: Number of arms (levers) of the machine.

    Attributes:
        probs: Array of length ``K`` with the win probability of each arm,
            sampled with ``np.random.uniform`` from NumPy's global RNG.
        best_idx: Index of the arm with the largest win probability.
        best_prob: The largest win probability, i.e. ``probs[best_idx]``.
        K: Number of arms.
    """

    def __init__(self, K):
        # One win probability per arm, each a random number between 0 and 1.
        win_probs = np.random.uniform(size=K)
        top_arm = np.argmax(win_probs)

        self.K = K
        self.probs = win_probs
        self.best_idx = top_arm
        self.best_prob = win_probs[top_arm]

    def step(self, k):
        """Pull arm ``k`` once.

        Draws one number from NumPy's global RNG and compares it with the
        arm's win probability.

        Args:
            k: Index of the arm to pull.

        Returns:
            1 if the pull wins, 0 otherwise.
        """
        won = np.random.rand() < self.probs[k]
        return 1 if won else 0


# Build the 10-armed bandit used by the cells below and report its best arm.
np.random.seed(1)  # fix the global NumPy seed so the experiment is reproducible
K = num = 10  # both names are bound to the number of arms
bandit_10_arm = BernoulliBandit(K)
# Printed messages (in Chinese): the number of arms, then the index and the
# win probability (4 decimals) of the best arm.
print("随机生成了一个%d臂伯努利老虎机" % K)
print("获奖概率最大的拉杆为%d号,其获奖概率为%.4f" %
      (bandit_10_arm.best_idx, bandit_10_arm.best_prob))

随机生成了一个10臂伯努利老虎机
获奖概率最大的拉杆为1号,其获奖概率为0.7203


模拟估计期望奖励的过程，使用增量式更新。单个拉杆 $a$ 的期望奖励，用此前在该拉杆上获得的全部 $N(a)$ 次奖励 $r_a^k$ 的平均值来估计，即

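$$
\hat{Q}(a) = \frac{1}{N(a)} \sum_{k = 1}^{N(a)} r_a^k
$$

其中 $N(a)$ 为拉杆 $a$ 已被拉动的次数。增量式更新等价地写为 $\hat{Q}_k(a) = \hat{Q}_{k-1}(a) + \frac{1}{k}\left(r_a^k - \hat{Q}_{k-1}(a)\right)$，因此无需保存全部历史奖励。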

In [6]:
class Estimator:
    """Estimate each arm's expected reward by pulling arms uniformly at random.

    The estimate of an arm is the running mean of the rewards observed on it,
    maintained incrementally so no reward history has to be stored.

    Attributes:
        num: Number of arms, read from ``machine.K``.
        agent: The bandit being sampled (the ``machine`` passed in).
        counts: int32 array with the number of pulls of each arm.
        rewards: float64 array with the current mean-reward estimate per arm.
    """

    def __init__(self, machine: BernoulliBandit):
        arm_total = machine.K
        self.agent = machine
        self.num = arm_total
        # Both statistics start at zero: no pulls yet, estimate 0 for every arm.
        self.counts = np.zeros(arm_total, dtype=np.int32)
        self.rewards = np.zeros(arm_total, dtype=np.float64)

    def run(self, round: int):
        """Pull ``round`` uniformly random arms and update the estimates.

        State accumulates across calls: calling ``run`` again adds more
        samples on top of the existing counts and estimates.

        Args:
            round: Number of pulls to perform in this call.
        """
        # The whole arm schedule is drawn in one call before any arm is pulled.
        for arm in np.random.randint(0, self.num, size=round):
            reward = self.agent.step(arm)
            self.counts[arm] += 1
            pulls = self.counts[arm]
            running_mean = self.rewards[arm]
            # Incremental mean: Q_n = Q_{n-1} + (r_n - Q_{n-1}) / n
            self.rewards[arm] = running_mean + (reward - running_mean) / pulls

    def get_estimation(self):
        """Return the per-arm estimates (the internal array itself, not a copy)."""
        return self.rewards


# Fresh estimator bound to the 10-armed bandit; its pull counts and reward
# estimates all start at zero.
estimator = Estimator(bandit_10_arm)

In [10]:
# Pull 10000 uniformly random arms and fold each reward into the per-arm
# running mean. NOTE: `estimator` keeps its state between calls, so re-running
# this cell adds another 10000 samples on top of the existing ones.
estimator.run(10000)
# Show arrays with 3 decimals and no scientific notation (this changes NumPy's
# global print options).
np.set_printoptions(precision=3, suppress=True)
print("Estimation:", estimator.get_estimation())
# True win probabilities of the bandit, for comparison with the estimates.
print("Ground truth:", bandit_10_arm.probs)

Estimation: [0.432 0.713 0.    0.301 0.16  0.093 0.193 0.343 0.408 0.515]
Ground truth: [0.417 0.72  0.    0.302 0.147 0.092 0.186 0.346 0.397 0.539]


懊悔（regret）：拉动拉杆 $a$ 时，最优拉杆的期望奖励与所选拉杆 $a$ 的期望奖励之差，即 $R(a) = Q^* - Q(a)$，其中 $Q^* = \max_{a'} Q(a')$；
累计懊悔（cumulative regret）：一段交互过程中每次行动的懊悔之和，即 $\sigma_R = \sum_{t=1}^{T} R(a_t)$。

In [12]:
class Solver:
    """Baseline bandit solver that tracks regret while choosing arms at random.

    Attributes:
        bandit: The bandit being played.
        num: Number of arms, read from ``bandit.K``.
        counts: int32 array with the number of times each arm was chosen.
        regret: Cumulative regret over all actions taken so far.
        actions: Chosen arm of every step, in order.
        regrets: Regret of every individual step, in order (per-step values,
            not the running total).
    """

    def __init__(self, bandit: BernoulliBandit):
        arm_total = bandit.K
        self.bandit = bandit
        self.num = arm_total
        self.counts = np.zeros(arm_total, dtype=np.int32)
        self.actions = []
        self.regrets = []
        self.regret = 0.0

    def run_one_step(self):
        """Choose the arm to pull for one step.

        The bandit setting has no state, so no input is needed. This
        implementation picks an arm uniformly at random.

        Returns:
            Index of the chosen arm.
        """
        return np.random.randint(0, self.num)

    def update_regret(self, action: int):
        """Record the regret of having pulled ``action``.

        The regret is the gap between the best arm's win probability and the
        win probability of the chosen arm.

        Args:
            action: Index of the arm that was pulled.
        """
        gap = self.bandit.best_prob - self.bandit.probs[action]
        self.regrets.append(gap)
        self.regret += gap

    def run(self, num_steps: int):
        """Play ``num_steps`` steps, updating counts, regret and action history.

        Args:
            num_steps: Number of actions to take.
        """
        for _step in range(num_steps):
            chosen = self.run_one_step()
            self.counts[chosen] += 1
            self.update_regret(chosen)
            self.actions.append(chosen)

# Fresh solver bound to the 10-armed bandit; regret, counts and histories
# all start empty.
solver = Solver(bandit_10_arm)

In [13]:
# Take 10000 actions with the solver, then display its cumulative regret
# (the bare last expression is rendered as the cell's output).
# NOTE: `solver` keeps its state between calls, so re-running this cell keeps
# adding to the same regret total.
solver.run(10000)
solver.regret