# 05 - Model Evaluation & Comparison

Notebook này đánh giá và so sánh các mô hình recommendation đã được huấn luyện (trained).

## Mục Tiêu
- Đánh giá 5 models: 3 Content-Based + 2 Collaborative Filtering
- Metrics: Precision@K, Recall@K, F1@K, NDCG@K (RMSE và MAE nằm trong kế hoạch nhưng chưa được tính trong notebook này)
- So sánh performance across models
- Identify best model cho từng use case
- Visualize kết quả

## Models được đánh giá
1. Content-Based TF-IDF
2. Content-Based Genre
3. Content-Based Combined
4. Collaborative Filtering Item-Based
5. Collaborative Filtering User-Based

## 1. Import Libraries

In [None]:
import sys
import os

# Add src to path
sys.path.append(os.path.abspath('../src'))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Import custom modules
from models.content_based import ContentBasedRecommender
from models.collaborative_filtering import CollaborativeFilteringRecommender
from evaluation.metrics import RecommendationMetrics

# Pandas display: show every column, limit printed rows to 20
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 20),
):
    pd.set_option(option_name, option_value)

# Plot defaults: seaborn grid theme and a wide default figure size
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (14, 6)})

# Confirm setup finished and record when this run started
print("Libraries imported successfully")
print(f"Timestamp: {datetime.now()}")

## 2. Load Data & Models

In [None]:
# Input locations: processed CSV tables and the directory of saved model files
data_dir, models_dir = '../data/processed', '../data/models'

movies_path = f'{data_dir}/movies_enriched.csv'
ratings_path = f'{data_dir}/ratings.csv'

# Read both tables into DataFrames (movies first, then ratings)
movies, ratings = (pd.read_csv(csv_path) for csv_path in (movies_path, ratings_path))

# Quick size report: row counts, then distinct users and distinct rated movies
print(f"Loaded {len(movies)} movies, {len(ratings)} ratings")
for caption, id_column in (("Users", 'userId'), ("Movies with ratings", 'movieId')):
    print(f"{caption}: {ratings[id_column].nunique()}")

In [None]:
print("=" * 70, "LOADING TRAINED MODELS", "=" * 70, sep="\n")

# Create the five recommender objects; load_model is called on each one below
cb_tfidf, cb_genre, cb_combined = (
    ContentBasedRecommender(verbose=False) for _ in range(3)
)
cf_item, cf_user = (
    CollaborativeFilteringRecommender(approach=approach_name, verbose=False)
    for approach_name in ('item', 'user')
)

# One row per model: progress message, target object, saved-model file, and the
# data file(s) passed to load_model after the model path.
load_plan = [
    ("Loading Content-Based TF-IDF...", cb_tfidf,
     f'{models_dir}/content_based_tfidf.pkl', (movies_path,)),
    ("Loading Content-Based Genre...", cb_genre,
     f'{models_dir}/content_based_genre.pkl', (movies_path,)),
    ("Loading Content-Based Combined...", cb_combined,
     f'{models_dir}/content_based_combined.pkl', (movies_path,)),
    ("Loading Collaborative Item-Based...", cf_item,
     f'{models_dir}/collaborative_item_based.pkl', (ratings_path, movies_path)),
    ("Loading Collaborative User-Based...", cf_user,
     f'{models_dir}/collaborative_user_based.pkl', (ratings_path, movies_path)),
]

# Load in the listed order, announcing each model before it is read
for announcement, recommender, model_file, data_files in load_plan:
    print(announcement)
    recommender.load_model(model_file, *data_files)

print("\nAll models loaded successfully!")

## 3. Prepare Test Data

In [None]:
print("=" * 70, "PREPARING TEST DATA", "=" * 70, sep="\n")

# Take the 100 movies with the most ratings; the first 20 of those are the test set
popular_movies = movies.nlargest(100, 'num_ratings')
test_movie_ids = popular_movies['movieId'].head(20).tolist()

print(f"\nTest movies: {len(test_movie_ids)}")
print("\nSample test movies:")

# Preview up to 10 of the selected movies (rows come out in `movies` order)
preview_columns = ['movieId', 'title_clean', 'genres', 'avg_rating', 'num_ratings']
is_test_movie = movies['movieId'].isin(test_movie_ids)
print(movies.loc[is_test_movie, preview_columns].head(10))

## 4. Content-Based Models Evaluation

In [None]:
# Evaluate the three content-based recommenders on the test movies.
#
# For each test movie the ground truth is every *other* movie with
# avg_rating >= 4.0, num_ratings >= 20 and at least one genre in common.
# Each model's top-k list is scored with precision/recall/F1/NDCG at k and the
# per-movie scores are averaged into `cb_summary` / `cb_df`.
print("=" * 70)
print("CONTENT-BASED MODELS EVALUATION")
print("=" * 70)

# Initialize metrics
metrics = RecommendationMetrics(verbose=False)

k = 10  # Top-10 recommendations

# Models under evaluation, keyed by the row label used in the result tables
cb_models = {'TF-IDF': cb_tfidf, 'Genre': cb_genre, 'Combined': cb_combined}

# Result key -> scoring method; each is called as fn(recommended, relevant, k)
cb_metric_fns = {
    'precision': metrics.precision_at_k,
    'recall': metrics.recall_at_k,
    'f1': metrics.f1_at_k,
    'ndcg': metrics.ndcg_at_k,
}

# Storage for results: one list of per-movie scores per model and metric
cb_results = {
    model_name: {metric_name: [] for metric_name in cb_metric_fns}
    for model_name in cb_models
}

print(f"\nEvaluating with K={k}...\n")

# The rating/popularity filter does not depend on the test movie, so apply it once
quality_pool = movies[(movies['avg_rating'] >= 4.0) & (movies['num_ratings'] >= 20)]

for movie_id in test_movie_ids:
    # Get movie's actual genre(s) as ground truth
    movie_info = movies[movies['movieId'] == movie_id].iloc[0]
    movie_genres = set(movie_info['genres'].split('|'))

    # Relevant = well-rated movies other than the query that share a genre with it
    candidates = quality_pool[quality_pool['movieId'] != movie_id]
    shares_genre = candidates['genres'].apply(
        lambda genre_str: not movie_genres.isdisjoint(genre_str.split('|'))
    ).astype(bool)  # keeps the mask boolean even when `candidates` is empty
    relevant_ids = candidates.loc[shares_genre, 'movieId'].tolist()

    if not relevant_ids:
        continue

    # Score all three models before recording anything: if one model fails for
    # this movie, none of them gets a sample, so every average is taken over
    # the same set of test movies.
    try:
        movie_scores = {}
        for model_name, model in cb_models.items():
            recommended = model.get_recommendations(movie_id, n=k)['movieId'].tolist()
            movie_scores[model_name] = {
                metric_name: score_fn(recommended, relevant_ids, k)
                for metric_name, score_fn in cb_metric_fns.items()
            }
    except Exception as exc:
        # Best-effort evaluation: report the failure instead of hiding it, then move on
        print(f"  Skipped movie {movie_id}: {type(exc).__name__}: {exc}")
        continue

    for model_name, scores in movie_scores.items():
        for metric_name, score in scores.items():
            cb_results[model_name][metric_name].append(score)

# Calculate averages (0 when a model has no samples).
# The labels are fixed at "@10" because later cells look these column names up.
summary_labels = {
    'precision': 'Precision@10',
    'recall': 'Recall@10',
    'f1': 'F1@10',
    'ndcg': 'NDCG@10',
}
cb_summary = {
    model_name: {
        label: np.mean(per_metric[key]) if per_metric[key] else 0
        for key, label in summary_labels.items()
    }
    for model_name, per_metric in cb_results.items()
}

# Display results: one row per model, one column per metric
cb_df = pd.DataFrame(cb_summary).T
print("\nContent-Based Models Performance:")
print(cb_df)

print(f"\nEvaluated on {len(cb_results['TF-IDF']['precision'])} test cases")

## 5. Collaborative Filtering Evaluation

In [None]:
# Evaluate the two collaborative-filtering recommenders.
#
# User-based: for the 20 users with the most ratings, the ground truth is the
# set of movies that user rated >= 4.0.
# Item-based: for each test movie known to the item model, the ground truth is
# every other movie with avg_rating >= 4.0, num_ratings >= 50 and a shared genre.
# Relies on `k` and `metrics` defined by the content-based evaluation cell.
print("=" * 70)
print("COLLABORATIVE FILTERING EVALUATION")
print("=" * 70)

# For CF, we'll evaluate based on user-movie pairs
# Take the 20 users with the most ratings
test_users = ratings['userId'].value_counts().head(20).index.tolist()

# Result key -> scoring method; each is called as fn(recommended, relevant, k)
cf_metric_fns = {
    'precision': metrics.precision_at_k,
    'recall': metrics.recall_at_k,
    'f1': metrics.f1_at_k,
    'ndcg': metrics.ndcg_at_k,
}

cf_results = {
    model_name: {metric_name: [] for metric_name in cf_metric_fns}
    for model_name in ('Item-Based', 'User-Based')
}


def _record_scores(bucket, recommended, relevant, top_k):
    """Score one test case with every metric and append the scores to `bucket`.

    All metrics are computed before any is appended, so an exception leaves the
    per-metric lists in `bucket` at equal length.
    """
    scores = {
        metric_name: score_fn(recommended, relevant, top_k)
        for metric_name, score_fn in cf_metric_fns.items()
    }
    for metric_name, score in scores.items():
        bucket[metric_name].append(score)


print(f"\nEvaluating {len(test_users)} users...\n")

# NOTE(review): the ground truth below comes from the same ratings table that was
# passed to the model at load time. If the recommender excludes movies the user
# has already rated, these scores would be zero by construction -- confirm, and
# consider a held-out split.
for user_id in test_users:
    # Get user's highly rated movies as ground truth
    user_ratings = ratings[ratings['userId'] == user_id]
    relevant_movies = user_ratings[user_ratings['rating'] >= 4.0]['movieId'].tolist()

    if len(relevant_movies) < 3:
        continue

    try:
        # User-based recommendations
        recs_user = cf_user.get_user_based_recommendations(user_id, n=k)
        recommended_user = recs_user['movieId'].tolist()
        _record_scores(cf_results['User-Based'], recommended_user, relevant_movies, k)
    except Exception as exc:
        # Best-effort evaluation: report the failure instead of hiding it
        print(f"  Skipped user {user_id}: {type(exc).__name__}: {exc}")

# The rating/popularity filter does not depend on the test movie, so apply it once.
# NOTE(review): the content-based cell uses num_ratings >= 20 for its ground truth;
# with 50 here the Item-Based row is not scored against the same relevant set --
# confirm this difference is intended.
cf_quality_pool = movies[(movies['avg_rating'] >= 4.0) & (movies['num_ratings'] >= 50)]

# For item-based, test with popular movies
for movie_id in test_movie_ids:
    # Only movies present in the item model's index can be queried
    if movie_id not in cf_item.movie_id_to_idx:
        continue

    # Ground truth: well-rated movies other than the query that share a genre with it
    movie_info = movies[movies['movieId'] == movie_id].iloc[0]
    movie_genres = set(movie_info['genres'].split('|'))

    candidates = cf_quality_pool[cf_quality_pool['movieId'] != movie_id]
    shares_genre = candidates['genres'].apply(
        lambda genre_str: not movie_genres.isdisjoint(genre_str.split('|'))
    ).astype(bool)  # keeps the mask boolean even when `candidates` is empty
    relevant_ids = candidates.loc[shares_genre, 'movieId'].tolist()

    if not relevant_ids:
        continue

    try:
        recs_item = cf_item.get_item_based_recommendations(movie_id, n=k)
        recommended_item = recs_item['movieId'].tolist()
        _record_scores(cf_results['Item-Based'], recommended_item, relevant_ids, k)
    except Exception as exc:
        print(f"  Skipped movie {movie_id}: {type(exc).__name__}: {exc}")
        continue

# Calculate averages (0 when a model has no samples).
# The labels are fixed at "@10" because later cells look these column names up.
cf_summary_labels = {
    'precision': 'Precision@10',
    'recall': 'Recall@10',
    'f1': 'F1@10',
    'ndcg': 'NDCG@10',
}
cf_summary = {
    model_name: {
        label: np.mean(per_metric[key]) if per_metric[key] else 0
        for key, label in cf_summary_labels.items()
    }
    for model_name, per_metric in cf_results.items()
}

# Display results: one row per model, one column per metric
cf_df = pd.DataFrame(cf_summary).T
print("\nCollaborative Filtering Performance:")
print(cf_df)

print(f"\nItem-Based evaluated on {len(cf_results['Item-Based']['precision'])} test cases")
print(f"User-Based evaluated on {len(cf_results['User-Based']['precision'])} test cases")

## 6. Combined Results & Comparison

In [None]:
print("=" * 70, "OVERALL MODEL COMPARISON", "=" * 70, sep="\n")

# Merge the two summaries (content-based rows first) into a models x metrics table
all_results = dict(cb_summary)
all_results.update(cf_summary)
all_df = pd.DataFrame(all_results).transpose()

print("\nAll Models Performance:")
print(all_df)
print("\n")

# For each metric column, find the model (row label) with the highest value
metric_columns = ('Precision@10', 'Recall@10', 'F1@10', 'NDCG@10')
best_precision, best_recall, best_f1, best_ndcg = (
    all_df[column].idxmax() for column in metric_columns
)

print("Best Models:")
winners = (best_precision, best_recall, best_f1, best_ndcg)
for metric_label, winner in zip(metric_columns, winners):
    print(f"  {metric_label}: {winner} ({all_df.loc[winner, metric_label]:.4f})")

## 7. Visualization

In [None]:
# Draw a 2x2 grid of horizontal bar charts, one panel per metric, comparing all
# models in `all_df`, then save the figure under ../reports and display it.

# savefig does not create missing folders, so make sure the target exists first
reports_dir = '../reports'
os.makedirs(reports_dir, exist_ok=True)

fig, axes = plt.subplots(2, 2, figsize=(16, 10))

metrics_to_plot = ['Precision@10', 'Recall@10', 'F1@10', 'NDCG@10']
colors = ['steelblue', 'coral', 'lightgreen', 'purple']

# Model labels and bar positions are the same on every panel
models = all_df.index.tolist()
positions = range(len(models))

# axes.flat walks the grid row by row: top-left, top-right, bottom-left, bottom-right
for ax, metric, color in zip(axes.flat, metrics_to_plot, colors):
    values = all_df[metric].values

    ax.barh(positions, values, color=color, edgecolor='black')
    ax.set_yticks(positions)
    ax.set_yticklabels(models)
    ax.set_xlabel(metric)
    ax.set_title(f'{metric} Comparison')
    ax.grid(True, alpha=0.3, axis='x')
    ax.invert_yaxis()  # first model at the top

    # Add value labels slightly past the end of each bar (1% of the longest bar)
    label_offset = max(values) * 0.01
    for position, value in zip(positions, values):
        ax.text(value + label_offset, position, f'{value:.3f}', va='center')

plt.tight_layout()
plt.savefig(f'{reports_dir}/model_comparison.png', dpi=150, bbox_inches='tight')
plt.show()

print("\nVisualization saved to reports/model_comparison.png")

In [None]:
# Radar chart for overall comparison: one closed polygon per model over the four
# metric axes, saved under ../reports and then displayed.
from math import pi

# savefig does not create missing folders, so make sure the target exists first
reports_dir = '../reports'
os.makedirs(reports_dir, exist_ok=True)

fig = plt.figure(figsize=(12, 12))
ax = fig.add_subplot(111, projection='polar')

categories = ['Precision@10', 'Recall@10', 'F1@10', 'NDCG@10']
N = len(categories)

# Angle for each metric, evenly spaced around the circle; repeating the first
# angle at the end closes the polygon
angles = [n / float(N) * 2 * pi for n in range(N)]
angles += angles[:1]

# Plot each model. Colors are reused cyclically so that a model beyond the fifth
# is still drawn instead of being dropped.
colors_radar = ['b', 'g', 'r', 'c', 'm']
for idx, model_name in enumerate(all_df.index):
    color = colors_radar[idx % len(colors_radar)]

    values = all_df.loc[model_name, categories].values.tolist()
    values += values[:1]  # close the polygon to match `angles`

    ax.plot(angles, values, 'o-', linewidth=2, label=model_name, color=color)
    ax.fill(angles, values, alpha=0.15, color=color)

ax.set_xticks(angles[:-1])
ax.set_xticklabels(categories)
ax.set_ylim(0, 1)
ax.set_title('Model Performance Radar Chart', size=16, pad=20)
ax.legend(loc='upper right', bbox_to_anchor=(1.3, 1.1))
ax.grid(True)

plt.tight_layout()
plt.savefig(f'{reports_dir}/model_radar_chart.png', dpi=150, bbox_inches='tight')
plt.show()

print("\nRadar chart saved to reports/model_radar_chart.png")

## 8. Summary & Insights

In [None]:
print("=" * 70, "EVALUATION SUMMARY & INSIGHTS", "=" * 70, sep="\n")

# Row-wise mean over all metric columns gives one overall score per model
overall_score = all_df.mean(axis=1)
content_based_rows = ['TF-IDF', 'Genre', 'Combined']
collaborative_rows = ['Item-Based', 'User-Based']

print("\n1. PERFORMANCE OVERVIEW:")
print(f"   Best overall model: {overall_score.idxmax()}")
print(f"   Average score: {overall_score.max():.4f}")

# Best content-based model by overall score, followed by fixed commentary
print("\n2. CONTENT-BASED INSIGHTS:")
print(f"   Top CB model: {overall_score.loc[content_based_rows].idxmax()}")
for note in (
    "   - TF-IDF: Good for text similarity",
    "   - Genre: Best for genre-based matching",
    "   - Combined: Balanced approach",
):
    print(note)

# Best collaborative model by overall score, followed by fixed commentary
print("\n3. COLLABORATIVE FILTERING INSIGHTS:")
print(f"   Top CF model: {overall_score.loc[collaborative_rows].idxmax()}")
for note in (
    "   - Item-Based: Better for item-to-item similarity",
    "   - User-Based: Better for personalization",
):
    print(note)

print("\n4. RECOMMENDATIONS:")
for note in (
    "   - For new items: Use Content-Based (no cold-start)",
    "   - For established items: Use Collaborative Filtering (better quality)",
    "   - For hybrid approach: Combine best of both",
):
    print(note)

print("\n" + "=" * 70)
print("MODEL EVALUATION COMPLETED SUCCESSFULLY!")
print("=" * 70)

## 9. Save Evaluation Results

In [None]:
# Persist the evaluation: the full metrics table as CSV and a short text summary.

# Neither to_csv nor open() creates missing folders, so make sure the target exists
reports_dir = '../reports'
os.makedirs(reports_dir, exist_ok=True)

# Save results to CSV (row labels are the model names)
all_df.to_csv(f'{reports_dir}/model_evaluation_results.csv')
print("Evaluation results saved to reports/model_evaluation_results.csv")

# Save summary: per-metric winners, the model with the highest mean score across
# all metrics, and the full table
summary_text = f"""
MODEL EVALUATION SUMMARY

Best Models:
- Precision@10: {best_precision} ({all_df.loc[best_precision, 'Precision@10']:.4f})
- Recall@10: {best_recall} ({all_df.loc[best_recall, 'Recall@10']:.4f})
- F1@10: {best_f1} ({all_df.loc[best_f1, 'F1@10']:.4f})
- NDCG@10: {best_ndcg} ({all_df.loc[best_ndcg, 'NDCG@10']:.4f})

Overall Best Model: {all_df.mean(axis=1).idxmax()}

Full Results:
{all_df.to_string()}
"""

# Explicit encoding so the file content does not depend on the machine's locale
with open(f'{reports_dir}/evaluation_summary.txt', 'w', encoding='utf-8') as f:
    f.write(summary_text)

print("Summary saved to reports/evaluation_summary.txt")
print("\nEvaluation complete! Check reports/ folder for detailed results.")