# CineMatchPlus RL Agent

This notebook implements a Reinforcement Learning agent (Multi-Armed Bandit) to learn user preferences based on the feedback logged in `feedback.csv`.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import ast

# Load Feedback Data
# Two "no data yet" situations end up with the same empty DataFrame, so every
# later cell only has to test `df.empty`:
#   * feedback.csv does not exist            -> FileNotFoundError
#   * feedback.csv exists but has no content -> pandas.errors.EmptyDataError
try:
    df = pd.read_csv('feedback.csv')
    print("Data loaded successfully.")
    print(df.head())
except FileNotFoundError:
    print("feedback.csv not found. Please run the app and generate some feedback first.")
    df = pd.DataFrame()
except pd.errors.EmptyDataError:
    # Without this branch a zero-byte log file would abort the notebook here.
    print("feedback.csv is empty. Please run the app and generate some feedback first.")
    df = pd.DataFrame()

## Preprocessing
We need to parse each movie's genre IDs (stored as text in the CSV) and convert actions into numeric rewards: like = +1, dislike = -1.

In [None]:
def _parse_genre_ids(value):
    """Return the genre IDs held in one `genre_ids` cell as a list.

    Accepts the text form read from the CSV (for example "[28, 12]") as well as
    a value that is already a list/tuple/set, so re-running this cell does not
    wipe genres that were parsed on a previous run. Any other value, including
    text that is not a valid Python literal or does not describe a collection,
    yields an empty list.
    """
    if isinstance(value, (list, tuple, set)):
        return list(value)
    if not isinstance(value, str):
        return []
    try:
        parsed = ast.literal_eval(value)
    except (ValueError, TypeError, SyntaxError):
        return []
    return list(parsed) if isinstance(parsed, (list, tuple, set)) else []


# Defined up front so later cells can always test `interaction_df.empty`.
interaction_df = pd.DataFrame(columns=['genre_id', 'reward'])

if not df.empty:
    # Convert genre_ids string representation to list (safe to run repeatedly)
    df['genre_ids'] = df['genre_ids'].apply(_parse_genre_ids)

    # Map action to reward
    # Like = 1, Dislike = -1; any other action becomes NaN
    df['reward'] = df['action'].map({'like': 1, 'dislike': -1})

    # Rows without a usable reward must not reach the agent: a NaN reward would
    # turn the running mean of every genre it touches into NaN.
    rated = df.dropna(subset=['reward'])
    skipped = len(df) - len(rated)
    if skipped:
        print(f"Skipping {skipped} feedback row(s) with an unrecognised action.")

    # Flatten the data to have one row per genre-interaction
    # This is a simplification: we treat a movie like as a positive signal for ALL its genres
    genre_rewards = [
        {'genre_id': genre_id, 'reward': int(reward)}
        for genre_ids, reward in zip(rated['genre_ids'], rated['reward'])
        for genre_id in genre_ids
    ]

    # Explicit columns keep the frame well-formed even when nothing was flattened.
    interaction_df = pd.DataFrame(genre_rewards, columns=['genre_id', 'reward'])
    print(interaction_df.head())
else:
    print("No data to process.")

## Multi-Armed Bandit (Epsilon-Greedy)
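We treat each Genre as an "Arm". The agent tries to find the genre with the highest expected reward. Note that the agent learns offline here: the logged feedback is replayed through `update()` to estimate each genre's mean reward, and `select_arm()` is defined but not called in this notebook.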

In [None]:
class EpsilonGreedyAgent:
    """Epsilon-greedy bandit that keeps a running mean reward for every arm.

    Attributes:
        n_arms: Number of arms the agent chooses between.
        epsilon: Probability that `select_arm` picks a uniformly random arm.
        counts: Per-arm number of `update` calls received so far.
        values: Per-arm estimated value (mean of the rewards seen so far).
    """

    def __init__(self, n_arms, epsilon=0.1):
        """Create an agent with `n_arms` arms, all counts and estimates at zero."""
        self.n_arms = n_arms
        self.epsilon = epsilon
        self.counts = np.zeros(n_arms)
        self.values = np.zeros(n_arms)

    def select_arm(self):
        """Return the index of the arm to play next.

        With probability `epsilon` a random arm is drawn (explore); otherwise
        the arm with the highest current estimate is returned (exploit).
        """
        explore = np.random.random() < self.epsilon
        if explore:
            return np.random.randint(self.n_arms)
        return np.argmax(self.values)

    def update(self, arm, reward):
        """Record one observed `reward` for `arm` and refresh its mean estimate."""
        self.counts[arm] += 1
        pulls = self.counts[arm]
        previous = self.values[arm]
        # Incremental mean: weight the old estimate by (pulls - 1) / pulls and
        # the new reward by 1 / pulls, so no reward history has to be stored.
        self.values[arm] = ((pulls - 1) / pulls) * previous + (1 / pulls) * reward

# Identify unique genres (Arms)
if not df.empty and not interaction_df.empty:
    # A NaN reward (e.g. from an action that was not mapped to like/dislike)
    # would turn that arm's running mean into NaN for good, and np.argmax would
    # then report the NaN arm as the "best" genre. Train only on usable rows.
    training_df = interaction_df.dropna(subset=['genre_id', 'reward'])

    unique_genres = training_df['genre_id'].unique()
    genre_to_arm = {genre: i for i, genre in enumerate(unique_genres)}
    arm_to_genre = {i: genre for genre, i in genre_to_arm.items()}
    n_arms = len(unique_genres)

    agent = EpsilonGreedyAgent(n_arms, epsilon=0.1)

    # Simulate learning from history: replay every logged (genre, reward) pair
    # in file order. Iterating the two columns directly avoids building a
    # Series per row as iterrows() does.
    rewards_history = []
    for genre_id, reward in zip(training_df['genre_id'], training_df['reward']):
        agent.update(genre_to_arm[genre_id], reward)
        rewards_history.append(reward)

    print("Training complete.")

    # Show learned preferences
    print("\nLearned Genre Preferences (Estimated Value):")
    for i in range(n_arms):
        print(f"Genre ID {arm_to_genre[i]}: {agent.values[i]:.2f}")

    # np.argmax raises on an empty array, which happens when every row was dropped.
    if n_arms:
        best_arm = np.argmax(agent.values)
        print(f"\nBest Genre ID: {arm_to_genre[best_arm]}")
    else:
        print("\nNo usable rewards found, so there is no best genre yet.")
else:
    print("Not enough data to train agent.")

## Visualization

In [None]:
# Both charts need a trained agent, so skip plotting when there was no data.
if not (df.empty or interaction_df.empty):
    # Chart 1: one bar per arm, labelled with the genre ID it stands for.
    plt.figure(figsize=(10, 6))
    genre_labels = [str(arm_to_genre[arm]) for arm in range(n_arms)]
    plt.bar(genre_labels, agent.values)
    plt.title('Learned User Preferences by Genre')
    plt.ylabel('Estimated Value (Mean Reward)')
    plt.xlabel('Genre ID')
    plt.show()

    # Chart 2: mean of all rewards replayed so far, after each interaction.
    plt.figure(figsize=(10, 6))
    interactions_so_far = np.arange(len(rewards_history)) + 1
    cumulative_average = np.cumsum(rewards_history) / interactions_so_far
    plt.plot(cumulative_average)
    plt.title('Agent Performance Over Time')
    plt.ylabel('Cumulative Average Reward')
    plt.xlabel('Interactions')
    plt.show()