In [101]:
import numpy as np

class KArmedBanditRecommender:
    """Epsilon-greedy agent for a k-armed bandit with Bernoulli rewards.

    Each arm stands for one recommendable item. The agent keeps a running
    sample-mean estimate of every arm's reward and, on each round, either
    explores (uniformly random arm) with probability ``epsilon`` or exploits
    (arm with the highest current estimate).

    Attributes:
        num_arms: Number of arms (``k``).
        epsilon: Probability of exploring on any given round.
        arm_pulls: Integer array of length ``k``; how often each arm was pulled.
        estimated_values: Float array of length ``k``; running mean reward
            observed for each arm (0.0 until the arm is first pulled).
    """

    def __init__(self, k, epsilon=0.1, seed=None):
        """Create an agent with all pull counts and estimates set to zero.

        Args:
            k: Number of arms; must be at least 1.
            epsilon: Exploration probability in ``[0, 1]``.
            seed: Optional seed (anything ``np.random.default_rng`` accepts).
                When given, the agent draws from its own private generator so
                runs are reproducible. When ``None`` (default) the agent uses
                NumPy's global ``np.random`` state, exactly as before, so an
                external ``np.random.seed(...)`` still controls it.

        Raises:
            ValueError: If ``k < 1`` or ``epsilon`` is outside ``[0, 1]``.
        """
        if k < 1:
            raise ValueError(f"k must be a positive integer, got {k!r}")
        # Written as a negated chain so that NaN is rejected as well.
        if not 0.0 <= epsilon <= 1.0:
            raise ValueError(f"epsilon must lie in [0, 1], got {epsilon!r}")

        self.num_arms = k
        self.epsilon = epsilon
        self.arm_pulls = np.zeros(k, dtype=int)  # Pull counts are integers
        self.estimated_values = np.zeros(k)  # Estimated reward values for each arm
        # Both the `np.random` module and a Generator expose random(),
        # choice() and binomial(), so either can serve as the random source.
        self._rng = np.random if seed is None else np.random.default_rng(seed)

    def select_arm(self):
        """Pick the arm to pull next using the epsilon-greedy rule.

        Returns:
            int: Index of the selected arm in ``range(num_arms)``.
        """
        # Exploration: choose a uniformly random arm.
        if self._rng.random() < self.epsilon:
            return int(self._rng.choice(self.num_arms))

        # Exploitation: choose the arm with the highest estimated reward.
        best_arms = np.flatnonzero(
            self.estimated_values == self.estimated_values.max()
        )
        if best_arms.size <= 1:
            # Unique maximum (or NaN estimates, where nothing compares equal):
            # plain argmax is exact and consumes no extra random draw.
            return int(np.argmax(self.estimated_values))
        # argmax alone always returns the lowest tied index, which would pin
        # the agent to arm 0 while all estimates are still equal; break ties
        # uniformly at random instead.
        return int(self._rng.choice(best_arms))

    def update_estimate(self, chosen_arm, observed_reward):
        """Fold one observed reward into the running mean of ``chosen_arm``.

        Args:
            chosen_arm: Index of the arm that was pulled.
            observed_reward: Reward received for that pull.
        """
        self.arm_pulls[chosen_arm] += 1
        n = self.arm_pulls[chosen_arm]

        current_value = self.estimated_values[chosen_arm]
        # Incremental mean: new = old + (reward - old) / n
        self.estimated_values[chosen_arm] += (observed_reward - current_value) / n

    def run_simulation(self, n_rounds, true_probabilities):
        """Play ``n_rounds`` rounds against Bernoulli arms and learn from them.

        Args:
            n_rounds: Number of rounds to simulate.
            true_probabilities: Sequence giving, for each arm index, the
                probability that pulling the arm yields reward 1.

        Returns:
            list[int]: The 0/1 reward obtained in each round, in order.

        Raises:
            ValueError: If ``true_probabilities`` has fewer entries than the
                agent has arms.
        """
        if len(true_probabilities) < self.num_arms:
            raise ValueError(
                f"true_probabilities has {len(true_probabilities)} entries "
                f"but the bandit has {self.num_arms} arms"
            )

        rewards_list = []

        for _ in range(n_rounds):
            # Choose an arm to pull: the item
            selected_arm = self.select_arm()

            # Draw a Bernoulli reward from the arm's true success probability
            reward = int(self._rng.binomial(1, true_probabilities[selected_arm]))
            rewards_list.append(reward)

            # Update estimated value for the selected item
            self.update_estimate(selected_arm, reward)

        return rewards_list

In [102]:
# Simulation settings: number of arms, exploration probability, total rounds.
k, epsilon, n_rounds = 5, 0.1, 1000

In [103]:
# Ground-truth Bernoulli success probability of every arm, indexed by arm number.
true_probabilities = [
    0.1,  # arm 0
    0.3,  # arm 1
    0.5,  # arm 2
    0.8,  # arm 3 (highest)
    0.6,  # arm 4
]

In [104]:
# Seed NumPy's global RNG, which the recommender draws from, so that a
# Restart & Run All reproduces the same simulation (and the same average).
np.random.seed(42)

bandit_recommender = KArmedBanditRecommender(k, epsilon)
rewards = bandit_recommender.run_simulation(n_rounds, true_probabilities)

# Sanity check: exactly one reward is recorded per simulated round.
assert len(rewards) == n_rounds

In [105]:
# Mean reward per round, i.e. the fraction of rounds that paid out 1.
average_reward = np.asarray(rewards).mean()
print("Average reward over", n_rounds, "rounds:", average_reward)

Average reward over 1000 rounds: 0.762
