# k-armed bandit-based recommendation system

In [1]:
import numpy as np

In [8]:
class RecommendationSystem:
    """Epsilon-greedy k-armed bandit in which every item is one arm.

    The recommender keeps a running sample-average estimate of each item's
    reward. On every step it either explores (uniformly random item, with
    probability ``epsilon``) or exploits (item with the highest estimate).

    Parameters
    ----------
    k : int
        Number of items (arms).
    epsilon : float
        Probability of choosing a uniformly random item instead of the
        greedy one.
    initial_reward : float
        Starting value of every reward estimate.

    Attributes
    ----------
    q_values : numpy.ndarray of float, shape (k,)
        Current reward estimate per item.
    action_counts : numpy.ndarray, shape (k,)
        Number of times each item has been selected and updated.
    """

    def __init__(self, k=10, epsilon=0.1, initial_reward=0.0):
        self.k = k
        self.epsilon = epsilon
        # Force a float dtype: with an integer ``initial_reward`` np.full
        # would build an integer array, and the incremental updates in
        # ``update_estimates`` would be silently truncated on assignment.
        self.q_values = np.full(k, initial_reward, dtype=float)
        self.action_counts = np.zeros(k)

    def select_item(self):
        """Choose the next item index using the epsilon-greedy rule.

        Returns
        -------
        int
            Index of the selected item, in ``[0, k)``.
        """
        if np.random.rand() < self.epsilon:
            # Explore: any item, uniformly at random.
            return np.random.randint(self.k)
        # Exploit: np.argmax returns the first maximal index, so ties are
        # resolved in favour of the lowest item index.
        return np.argmax(self.q_values)

    def update_estimates(self, action, reward):
        """Fold one observed reward into the estimate of ``action``.

        Parameters
        ----------
        action : int
            Index of the item that was recommended.
        reward : float
            Reward observed for that recommendation.
        """
        self.action_counts[action] += 1
        # Step size 1/n turns the incremental rule into the running mean
        # of all rewards observed for this item.
        alpha = 1 / self.action_counts[action]
        self.q_values[action] += alpha * (reward - self.q_values[action])

    def run(self, true_rewards, steps=1000):
        """Simulate ``steps`` recommendations against a synthetic environment.

        Each reward is drawn from a normal distribution centred on the
        selected item's entry in ``true_rewards`` with standard deviation 1.

        Parameters
        ----------
        true_rewards : sequence of float
            Mean reward per item; must contain at least ``k`` entries.
        steps : int
            Number of recommendations to simulate.

        Returns
        -------
        list of float
            The reward observed at each step, in order.

        Raises
        ------
        ValueError
            If ``true_rewards`` has fewer than ``k`` entries.
        """
        # Fail up front instead of hitting an IndexError at whichever step
        # first happens to select an item without a true reward.
        if len(true_rewards) < self.k:
            raise ValueError(
                f"true_rewards has {len(true_rewards)} entries but the "
                f"recommender has k={self.k} items"
            )

        rewards = []
        for _ in range(steps):
            action = self.select_item()
            reward = np.random.normal(true_rewards[action], 1)
            self.update_estimates(action, reward)
            rewards.append(reward)
        return rewards

In [9]:
# Seed NumPy's global generator so that Restart & Run All reproduces the same
# true rewards and, because the simulation draws from the same global stream,
# the same run.
SEED = 42
np.random.seed(SEED)

k = 15  # number of items (arms)
epsilon = 0.1  # probability of exploring a random item
true_rewards = np.random.normal(0, 1, k)  # true mean reward per item, drawn from N(0, 1)

In [10]:
# Build the epsilon-greedy recommender, then simulate 1000 recommendations
# against the synthetic true rewards.
recommender = RecommendationSystem(epsilon=epsilon, k=k)
rewards = recommender.run(true_rewards, 1000)

In [11]:
# Show the learned estimates next to the ground truth for comparison.
print(
    "Estimated rewards for items:",
    recommender.q_values,
)
print(
    "True rewards for items:",
    true_rewards,
)

Estimated rewards for items: [ 0.35282296  1.57206815  0.63284933  3.19346708 -0.39117545  0.96970638
  1.11953268  1.19600778  0.26623694  0.19550834 -0.87818413  0.07256211
  1.06697449  0.6053049   0.8131404 ]
True rewards for items: [-0.12298806  1.46944141  0.78836644  3.16393251  0.30763008  0.40905649
  1.12646632  0.99933533  0.37411844  0.07443958 -0.65374475  0.75409298
  1.76732288 -0.25753264  0.83635296]
