In [None]:
import matplotlib.pyplot as plt
import numpy as np

class Bandit:
  """One arm of a multi-armed bandit with a Bernoulli payout.

  The arm keeps a running-mean estimate of its win rate. The estimate starts
  at a deliberately over-optimistic value so that a greedy selector is pushed
  to try every arm before settling on one.

  Attributes:
    p: the true winning rate used by `pull`.
    p_estimate: current running-mean estimate of the winning rate.
    N: number of samples folded into `p_estimate`, counting the optimistic
      initial value as one pseudo-sample (so it starts at 1, not 0).
  """

  def __init__(self, p, initial_estimate=5.):
    """Create an arm.

    Args:
      p: the true winning rate of this arm.
      initial_estimate: optimistic starting value for the estimate. The
        default of 5 is far above any achievable win rate (at most 1).
    """
    self.p = p # the winning rate
    self.p_estimate = initial_estimate # optimistic initial estimate of the winning rate
    self.N = 1. # starts at 1 because the initial estimate is treated as one sample

  def pull(self):
    """Play the arm once; return a truthy value on a win, falsy on a loss."""
    # draw a uniform random number in [0, 1) and compare it to the winning rate
    return np.random.random() < self.p

  def update(self, x):
    """Fold the observed reward `x` into the running-mean estimate."""
    # increment the number of samples collected
    self.N += 1.
    # new mean = (sum of the previous N - 1 samples + new sample) / N
    self.p_estimate = ((self.N - 1) * self.p_estimate + x) / self.N

def run_experiment(bandits_probs_list, N):
  """Run a greedy bandit simulation driven by optimistic initial values.

  On every trial the arm with the highest current estimate is pulled and its
  estimate is updated with the observed reward. Afterwards the per-arm
  estimates, the total reward, the overall win rate and the number of pulls
  per arm are printed, and the cumulative win rate is plotted against the
  best true win rate.

  Args:
    bandits_probs_list: true win rate of each arm.
    N: number of trials to simulate.
  """
  # create a list of bandit objects according to their probabilities of win rate
  bandits = [Bandit(p) for p in bandits_probs_list]
  # reward obtained on each trial
  rewards = np.zeros(N)
  # real pulls per arm; counted here because Bandit.N also includes the
  # pseudo-sample for the optimistic initial estimate and would be off by one
  pull_counts = [0] * len(bandits)

  for i in range(N):
    # greedy choice: the optimistic initial values make untried arms look best
    j = np.argmax([b.p_estimate for b in bandits])
    # pull the arm for the selected bandit
    x = bandits[j].pull()
    # record the reward and the pull
    rewards[i] = x
    pull_counts[j] += 1
    # update the selected bandit's estimate with the observed reward
    bandits[j].update(x)

  # print mean estimates for each bandit
  for i, b in enumerate(bandits):
    print(f'bandit{i + 1} estimate win-rate: {round(b.p_estimate, 3)} | true win_rate: {b.p}')

  # print total reward
  print()
  print('total reward:', rewards.sum())
  print('overall win-rate:', rewards.sum() / N)
  print('number of times selected each bandit:', pull_counts)

  # plot the running win rate against the best achievable win rate
  cumulative_rewards = np.cumsum(rewards)
  win_rates = cumulative_rewards / (np.arange(N) + 1)
  plt.ylim([0, 1])
  plt.plot(win_rates)
  plt.plot(np.ones(N) * np.max(bandits_probs_list))
  plt.title('cumulative win-rate over time')
  plt.xlabel('number of trials')
  plt.ylabel('win-rate')
  plt.show()

if __name__ == '__main__':
  # simulate a multi-armed bandit problem with 4 machines whose true
  # win-rates are 0, 0.25, 0.5 and 0.75, over 10000 trials
  true_win_rates = [0, 0.25, 0.5, 0.75]
  num_trials = 10000
  run_experiment(true_win_rates, num_trials)