In [1]:
import numpy as np

In [2]:
class QLearningMovieRecommender:
    """Tabular, epsilon-greedy Q-learning recommender.

    The Q-table has one row per user and one column per movie, and starts
    at zero. Actions (movie indices) are chosen epsilon-greedily from a
    user's row, and rewards are folded back in with ``update_q_table``.

    Attributes:
        num_users: Number of rows in the Q-table.
        num_movies: Number of columns in the Q-table (available actions).
        learning_rate: Weight given to the new target in each update.
        discount_factor: Weight of the best current Q-value in the target.
        exploration_prob: Probability of picking a random movie instead of
            the current best one.
        q_table: ``(num_users, num_movies)`` float array of Q-values.
    """

    def __init__(self, num_users, num_movies, learning_rate=0.1, discount_factor=0.9, exploration_prob=0.2):
        """Store the hyper-parameters and allocate a zero-filled Q-table."""
        self.num_users = num_users
        self.num_movies = num_movies
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.exploration_prob = exploration_prob
        self.q_table = np.zeros((num_users, num_movies))

    @property
    def exploraion_prob(self):
        """Misspelled legacy alias of ``exploration_prob``.

        Kept so that code written against the original attribute name keeps
        working; new code should use ``exploration_prob``.
        """
        return self.exploration_prob

    @exploraion_prob.setter
    def exploraion_prob(self, value):
        self.exploration_prob = value

    def choose_action(self, user):
        """Pick a movie index for ``user`` with an epsilon-greedy policy.

        Draws from NumPy's global random state: one uniform sample to decide
        between exploring and exploiting, plus one integer sample when
        exploring.

        Returns:
            int: A movie index in ``[0, num_movies)``.
        """
        if np.random.rand() < self.exploration_prob:
            #   Explore: choose a random movie
            return int(np.random.randint(self.num_movies))
        #   Exploit: choose the movie with the highest Q-value
        #   (np.argmax returns the lowest index when several are tied)
        return int(np.argmax(self.q_table[user, :]))

    def update_q_table(self, user, action, reward):
        """Apply one Q-learning update to ``q_table[user, action]``.

        The bootstrap term is the maximum Q-value in the same user's row,
        read before the entry is overwritten.
        """
        old_value = self.q_table[user, action]
        best_value = np.max(self.q_table[user, :])
        target = reward + self.discount_factor * best_value
        #   Q-learning update rule: blend the old estimate with the new target
        self.q_table[user, action] = (1 - self.learning_rate) * old_value + self.learning_rate * target

    def recommend(self, user):
        """Return the index of the movie with the highest Q-value for ``user``.

        Purely greedy (no exploration); ties resolve to the lowest index.

        Returns:
            int: A movie index in ``[0, num_movies)``.
        """
        return int(np.argmax(self.q_table[user, :]))

In [37]:
#   Example usage: train on simulated feedback, then print each user's top movie
SEED = 42   # fixed so a fresh Restart & Run All prints the same recommendations
num_users = 3
num_movies = 5
num_interactions = 1000
#   The recommender samples from NumPy's global random state, so seed that state.
np.random.seed(SEED)
movie_recommender = QLearningMovieRecommender(num_users, num_movies)

#   Simulate interactions and learning
for _ in range(num_interactions):
    user = np.random.randint(num_users)
    action = movie_recommender.choose_action(user)
    #   Placeholder reward drawn uniformly from [0, 1). In a real-world scenario
    #   the reward would come from actual user feedback rather than a random number.
    reward = np.random.rand()
    movie_recommender.update_q_table(user, action, reward)

#   Make recommendations for each user
for user in range(num_users):
    recommended_movie = movie_recommender.recommend(user)
    print(f"User {user} - Recommended Movie: {recommended_movie}")

User 0 - Recommended Movie: 1
User 1 - Recommended Movie: 0
User 2 - Recommended Movie: 4
