In [1]:
import numpy as np
import random

class ThompsonSampling:
    def __init__(self, n_arms):
        self.n = n_arms
        self.alpha = np.ones(n_arms)
        self.beta = np.ones(n_arms)
        self.sampled_means = np.zeros(n_arms)

    def select_arm(self):
        sampled_theta = [np.random.beta(self.alpha[i], self.beta[i]) for i in range(self.n)]
        return np.argmax(sampled_theta)

    def update(self, arm, reward):
        if reward == 1:
            self.alpha[arm] += 1
        else:
            self.beta[arm] += 1

        self.sampled_means[arm] = np.random.beta(self.alpha[arm], self.beta[arm])

    def get_total_reward(self):
        return np.sum(self.alpha) / (np.sum(self.alpha) + np.sum(self.beta))


# Example usage
# Initialize Thompson Sampling with the number of arms
num_arms = 5
thompson_sampling = ThompsonSampling(num_arms)

# Simulate some rounds
num_rounds = 1000
for _ in range(num_rounds):
    chosen_arm = thompson_sampling.select_arm()

    # Simulating reward, assuming some random generation process
    if random.random() < 0.7:  # Assuming a success rate of 0.7
        reward = 1
    else:
        reward = 0

    thompson_sampling.update(chosen_arm, reward)

# Get the arm with the highest sampled mean
best_arm = np.argmax(thompson_sampling.sampled_means)

print(f"The best arm is {best_arm} with a success rate of {thompson_sampling.sampled_means[best_arm]}")
print(f"The total reward: {thompson_sampling.get_total_reward()}")


The best arm is 0 with a success rate of 0.7180898394460574
The total reward: 0.6782178217821783
