In [1]:
import pandas as pd
import numpy as np
from typing import List, Tuple, Dict
import math
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [2]:
class MovieRecommender:
    """Multi-armed-bandit simulator in which every movie is one arm.

    Ratings are read from a CSV file that must provide ``movieId`` and
    ``rating`` columns. A rating of 4 or higher counts as reward 1, anything
    else as reward 0. Pulling an arm draws a Bernoulli reward whose success
    probability is the movie's empirical mean reward in the data set.

    Attributes:
        df: The loaded ratings frame, extended with a binary ``reward`` column.
        movies: Unique movie ids in order of first appearance; position = arm.
        movie_stats: One row per movie with ``MovieID``, ``total_ratings`` and
            ``mean_reward`` columns.
        movie_to_arm / arm_to_movie: Mappings between movie ids and arm indices.
        n_arms: Number of arms (unique movies).
        counts / rewards: Per-arm pull counts and reward sums of the current run.
        chosen_arms / cumulative_rewards: Per-step history of the current run.
    """

    def __init__(self, data_path: str):
        """Load the ratings CSV at ``data_path`` and set up empty run statistics."""
        self.df = pd.read_csv(data_path)
        self.preprocess_data()

        self.n_arms = len(self.movies)
        self.reset_stats()

    def preprocess_data(self):
        """Derive binary rewards, per-movie statistics and the arm mappings."""
        # Convert ratings to binary rewards (4-5 stars = 1, 1-3 stars = 0)
        self.df['reward'] = (self.df['rating'] >= 4).astype(int)

        # Unique movies and their empirical reward statistics
        self.movies = self.df['movieId'].unique()
        self.movie_stats = self.df.groupby('movieId').agg({
            'reward': ['count', 'mean']
        }).reset_index()
        self.movie_stats.columns = ['MovieID', 'total_ratings', 'mean_reward']

        # Arm index <-> movie id mappings
        self.movie_to_arm = {movie: idx for idx, movie in enumerate(self.movies)}
        self.arm_to_movie = {idx: movie for movie, idx in self.movie_to_arm.items()}

        # Built once so that get_reward does not have to scan movie_stats on
        # every single pull of the simulation.
        self._mean_reward_by_movie = self._build_reward_lookup()

    def _build_reward_lookup(self) -> dict:
        """Return a ``{movie id: mean reward}`` dict derived from ``movie_stats``."""
        return dict(zip(self.movie_stats['MovieID'], self.movie_stats['mean_reward']))

    def reset_stats(self):
        """Clear per-arm counters and the per-step history before a new run."""
        self.counts = np.zeros(self.n_arms)
        self.rewards = np.zeros(self.n_arms)
        self.cumulative_rewards = []
        self.chosen_arms = []

    def get_reward(self, movie_id: int) -> int:
        """Draw a 0/1 reward for ``movie_id`` from its empirical mean reward.

        Uses NumPy's global random state, exactly one draw per call.

        Raises:
            IndexError: If ``movie_id`` has no entry in ``movie_stats`` (the
                exception type is kept for backward compatibility).
        """
        lookup = getattr(self, '_mean_reward_by_movie', None)
        if lookup is None:
            # Instance was set up without preprocess_data building the cache.
            lookup = self._mean_reward_by_movie = self._build_reward_lookup()
        try:
            success_probability = lookup[movie_id]
        except KeyError:
            raise IndexError(f"Unknown movie id: {movie_id!r}") from None
        return np.random.binomial(1, success_probability)

    def simulate_policy(self, policy_func, n_iterations: int, **policy_params) -> Tuple[List[int], List[float]]:
        """Run ``policy_func`` for ``n_iterations`` steps from freshly reset statistics.

        Args:
            policy_func: Callable returning the arm index to pull; it is called
                as ``policy_func(**policy_params)`` on every step.
            n_iterations: Number of pulls to simulate.
            **policy_params: Extra keyword arguments forwarded to the policy.

        Returns:
            ``(chosen_arms, cumulative_rewards)``, one entry per step. Progress
            is printed every 100 iterations.
        """
        self.reset_stats()

        print(f"\nRunning {policy_func.__name__} for {n_iterations} iterations:")
        cumulative_reward = 0

        for t in range(n_iterations):
            # Choose arm according to policy
            chosen_arm = policy_func(**policy_params)
            movie_id = self.arm_to_movie[chosen_arm]

            # Get reward
            reward = self.get_reward(movie_id)

            # Update statistics
            self.counts[chosen_arm] += 1
            self.rewards[chosen_arm] += reward
            cumulative_reward += reward

            self.chosen_arms.append(chosen_arm)
            self.cumulative_rewards.append(cumulative_reward)

            # Print iteration details
            if t % 100 == 0:
                print(f"Iteration {t}: Chose Movie {movie_id}, Reward: {reward}, "
                      f"Cumulative Reward: {cumulative_reward}")

        return self.chosen_arms, self.cumulative_rewards

    def random_policy(self) -> int:
        """Pick an arm uniformly at random."""
        return np.random.randint(self.n_arms)

    def greedy_policy(self) -> int:
        """Pick the arm with the highest observed mean reward.

        Arms that were never pulled are scored ``inf`` so each of them is tried
        once (lowest index first) before any exploitation happens.
        """
        pulled = self.counts > 0
        estimates = np.full(self.n_arms, np.inf)
        estimates[pulled] = self.rewards[pulled] / self.counts[pulled]
        return np.argmax(estimates)

    def epsilon_greedy_policy(self, epsilon: float) -> int:
        """Explore uniformly with probability ``epsilon``, otherwise act greedily."""
        if np.random.random() < epsilon:
            return self.random_policy()
        return self.greedy_policy()

    def ucb_policy(self) -> int:
        """Pick the arm maximising ``mean + sqrt(2 * ln(t) / pulls)``.

        ``t`` is the total number of pulls so far plus one. Arms that were
        never pulled are scored ``inf`` so they are tried first.
        """
        t = self.counts.sum() + 1
        pulled = self.counts > 0

        ucb_values = np.full(self.n_arms, np.inf)
        pulls = self.counts[pulled]
        # math.log keeps the scalar term identical to a per-arm computation.
        confidence_bound = np.sqrt(2 * math.log(t) / pulls)
        ucb_values[pulled] = self.rewards[pulled] / pulls + confidence_bound

        return np.argmax(ucb_values)

In [3]:

def evaluate_policies(recommender: "MovieRecommender", n_iterations: int,
                      seed: int = 42, epsilon_values=(0.1, 0.2, 0.5)) -> dict:
    """Simulate every policy on ``recommender`` and collect cumulative rewards.

    Policies run in a fixed order (Random, Greedy, one ε-greedy run per value
    in ``epsilon_values``, UCB) and share a single random stream that is seeded
    once at the start.

    Args:
        recommender: Object exposing ``simulate_policy`` plus the
            ``random_policy``, ``greedy_policy``, ``epsilon_greedy_policy`` and
            ``ucb_policy`` methods.
        n_iterations: Number of steps to simulate per policy.
        seed: Seed for NumPy's global random state. Pass ``None`` to leave the
            current random state untouched.
        epsilon_values: Exploration rates to evaluate for ε-greedy.

    Returns:
        Dict mapping a policy label to its list of cumulative rewards per step.
    """
    # Seed once so the whole comparison is reproducible
    if seed is not None:
        np.random.seed(seed)

    results = {}

    # Random policy
    _, random_rewards = recommender.simulate_policy(
        recommender.random_policy, n_iterations
    )
    results['Random'] = random_rewards

    # Greedy policy
    _, greedy_rewards = recommender.simulate_policy(
        recommender.greedy_policy, n_iterations
    )
    results['Greedy'] = greedy_rewards

    # Epsilon-greedy policies with different epsilon values
    for epsilon in epsilon_values:
        _, rewards = recommender.simulate_policy(
            recommender.epsilon_greedy_policy, n_iterations, epsilon=epsilon
        )
        results[f'ε-Greedy (ε={epsilon})'] = rewards

    # UCB policy
    _, ucb_rewards = recommender.simulate_policy(
        recommender.ucb_policy, n_iterations
    )
    results['UCB'] = ucb_rewards

    # Print final results
    print("\nFinal Results:")
    for policy, rewards in results.items():
        # A run with zero iterations has no last element; report 0 instead of failing.
        final_reward = rewards[-1] if len(rewards) else 0
        print(f"{policy} Final Reward: {final_reward}")

    return results

def plot_cumulative_rewards_plotly(results: dict, n_iterations: int) -> go.Figure:
    """Build an interactive line chart of cumulative reward per policy.

    Args:
        results: Mapping of policy label to its cumulative rewards per step.
        n_iterations: Number of simulated steps. Kept for backward
            compatibility; the x-axis of each trace is derived from the length
            of its own reward series so x and y can never disagree.

    Returns:
        The configured Plotly figure (not shown or saved here).
    """
    colors = {
        'Random': 'gray',
        'Greedy': 'blue',
        'UCB': 'red',
        'ε-Greedy (ε=0.1)': 'green',
        'ε-Greedy (ε=0.2)': 'purple',
        'ε-Greedy (ε=0.5)': 'orange'
    }

    fig = go.Figure()

    for policy, rewards in results.items():
        fig.add_trace(
            go.Scatter(
                x=list(range(len(rewards))),
                y=rewards,
                name=policy,
                # Labels without a fixed colour (e.g. another epsilon value)
                # get None, which leaves the colour to Plotly's default cycle.
                line=dict(color=colors.get(policy), width=2),
                hovertemplate="Iteration: %{x}<br>Reward: %{y:.0f}<extra></extra>"
            )
        )

    fig.update_layout(
        title={
            'text': 'Cumulative Rewards Comparison Across Different Policies',
            'y': 0.95,
            'x': 0.5,
            'xanchor': 'center',
            'yanchor': 'top',
            'font': dict(size=20)
        },
        xaxis_title={
            'text': 'Iteration',
            'font': dict(size=14)
        },
        yaxis_title={
            'text': 'Cumulative Reward',
            'font': dict(size=14)
        },
        hovermode='x unified',
        # Legend sits to the right of the plot area; the right margin makes room for it.
        legend=dict(
            yanchor="top",
            y=0.99,
            xanchor="left",
            x=1.05,
            font=dict(size=12)
        ),
        showlegend=True,
        margin=dict(r=150),
        plot_bgcolor='white',
        width=1000,
        height=600
    )

    # Same light-gray grid and zero line on both axes
    grid_style = dict(
        showgrid=True,
        gridwidth=1,
        gridcolor='lightgray',
        zeroline=True,
        zerolinewidth=1,
        zerolinecolor='lightgray'
    )
    fig.update_xaxes(**grid_style)
    fig.update_yaxes(tickformat=',.0f', **grid_style)

    return fig

def analyze_performance(results: dict) -> str:
    """Summarise the policy comparison as a human-readable report.

    Args:
        results: Mapping of policy label to its cumulative rewards per step.
            Must contain the keys ``'Random'``, ``'Greedy'`` and ``'UCB'``;
            labels containing ``'ε-Greedy'`` are treated as ε-greedy variants.
            An empty reward series counts as a final reward of 0.

    Returns:
        The report text. Improvements are relative to the random baseline and
        are shown as ``n/a`` when that baseline is 0 (the ratio is undefined).

    Raises:
        KeyError: If one of the required policy labels is missing.
    """
    # Final cumulative reward per policy; an empty run counts as 0
    final_rewards = {
        policy: (rewards[-1] if len(rewards) else 0)
        for policy, rewards in results.items()
    }
    best_policy = max(final_rewards.items(), key=lambda x: x[1])[0]

    # Improvement over random, pre-formatted so a zero baseline cannot
    # trigger a division by zero.
    random_reward = final_rewards['Random']

    def _improvement(reward) -> str:
        if random_reward == 0:
            return "n/a"
        return f"{((reward - random_reward) / random_reward) * 100:.1f}"

    improvements = {policy: _improvement(reward) for policy, reward in final_rewards.items()}

    # Analyze ε-greedy variants specifically
    epsilon_policies = {k: v for k, v in final_rewards.items() if 'ε-Greedy' in k}

    conclusion = f"""
        Performance Analysis of Movie Recommendation Policies:

        The {best_policy} policy emerged as the most effective strategy, achieving the highest cumulative reward of {final_rewards[best_policy]:.1f}.

        Key findings:
        1. The random policy (baseline) achieved a cumulative reward of {final_rewards['Random']:.1f}.
        2. The pure greedy policy achieved {final_rewards['Greedy']:.1f}, showing a {improvements['Greedy']}% improvement over random.
        3. Among the ε-greedy variants:"""

    # Add detailed ε-greedy comparison
    for policy in sorted(epsilon_policies.keys()):
        conclusion += f"\n   - {policy}: {final_rewards[policy]:.1f} ({improvements[policy]}% improvement over random)"

    if epsilon_policies:
        best_epsilon_policy = max(epsilon_policies.items(), key=lambda x: x[1])[0]
        # Label looks like 'ε-Greedy (ε=0.1)': take the text after '=' without the ')'
        best_epsilon = float(best_epsilon_policy.rsplit('=', 1)[-1].rstrip(')'))
        conclusion += f"\n   The best performing ε-greedy variant was with ε={best_epsilon}, suggesting this value provides the optimal exploration-exploitation trade-off for this dataset."

    conclusion += f"""
        4. The UCB policy achieved {final_rewards['UCB']:.1f} ({improvements['UCB']}% improvement over random).

        The results demonstrate that {best_policy} provides the best balance between exploration and exploitation for this specific recommendation task. The performance differences between policies highlight the importance of choosing an appropriate strategy based on the specific requirements of the recommendation system.

        For practical implementation at TrendMovie Inc., the {best_policy} would be recommended, as it demonstrated superior performance in maximizing cumulative user satisfaction while maintaining an appropriate balance between exploring new options and exploiting known preferences.
        """
    return conclusion

In [4]:

def main(data_path: str = "TrendMovie.csv", n_iterations: int = 1000,
         output_html: str = "policy_comparison_interactive.html"):
    """Run the full comparison: simulate, plot, save the plot and print the analysis.

    Args:
        data_path: Path of the ratings CSV to load (update with the actual path).
        n_iterations: Number of simulated steps per policy; the same value is
            used for the simulation and for the plot.
        output_html: File the interactive plot is written to.
    """
    # Initialize recommender with dataset
    recommender = MovieRecommender(data_path)

    # Run simulations and get results
    results = evaluate_policies(recommender, n_iterations=n_iterations)

    # Create and display interactive plot
    fig = plot_cumulative_rewards_plotly(results, n_iterations=n_iterations)
    fig.show()
    # Save the plot as an HTML file for interactivity
    fig.write_html(output_html)

    # Generate and print analysis
    conclusion = analyze_performance(results)
    print("\nPerformance Analysis:")
    print(conclusion)

# Run the whole pipeline only when this code is executed directly
# (__name__ is "__main__" then); merely importing it does not start a run.
if __name__ == "__main__":
    main()


Running random_policy for 1000 iterations:
Iteration 0: Chose Movie 4191, Reward: 1, Cumulative Reward: 1
Iteration 100: Chose Movie 100556, Reward: 1, Cumulative Reward: 48
Iteration 200: Chose Movie 3389, Reward: 0, Cumulative Reward: 81
Iteration 300: Chose Movie 6539, Reward: 1, Cumulative Reward: 116
Iteration 400: Chose Movie 6598, Reward: 0, Cumulative Reward: 146
Iteration 500: Chose Movie 2052, Reward: 0, Cumulative Reward: 180
Iteration 600: Chose Movie 8874, Reward: 1, Cumulative Reward: 213
Iteration 700: Chose Movie 3593, Reward: 0, Cumulative Reward: 265
Iteration 800: Chose Movie 1140, Reward: 1, Cumulative Reward: 304
Iteration 900: Chose Movie 91873, Reward: 0, Cumulative Reward: 343

Running greedy_policy for 1000 iterations:
Iteration 0: Chose Movie 1, Reward: 1, Cumulative Reward: 1
Iteration 100: Chose Movie 1573, Reward: 0, Cumulative Reward: 57
Iteration 200: Chose Movie 3052, Reward: 1, Cumulative Reward: 103
Iteration 300: Chose Movie 171, Reward: 1, Cumulativ


Performance Analysis:

        Performance Analysis of Movie Recommendation Policies:

        The ε-Greedy (ε=0.5) policy emerged as the most effective strategy, achieving the highest cumulative reward of 470.0.

        Key findings:
        1. The random policy (baseline) achieved a cumulative reward of 377.0.
        2. The pure greedy policy achieved 444.0, showing a 17.8% improvement over random.
        3. Among the ε-greedy variants:
   - ε-Greedy (ε=0.1): 429.0 (13.8% improvement over random)
   - ε-Greedy (ε=0.2): 417.0 (10.6% improvement over random)
   - ε-Greedy (ε=0.5): 470.0 (24.7% improvement over random)
   The best performing ε-greedy variant was with ε=0.5, suggesting this value provides the optimal exploration-exploitation trade-off for this dataset.
        4. The UCB policy achieved 438.0 (16.2% improvement over random).

        The results demonstrate that ε-Greedy (ε=0.5) provides the best balance between exploration and exploitation for this specific recommen

In [5]:
# A small epsilon makes the policy exploit the best-known movie most of the time and explore rarely;
# a larger epsilon shifts the balance toward exploration.