# 04 - Model Building: Recommendation Systems

Notebook này xây dựng và train các mô hình recommendation.

## Mục Tiêu
- Build Content-Based Filtering models
- Build Collaborative Filtering models (Item-based & User-based)
- Compare different approaches
- Save trained models

## Models
1. **Content-Based**: Sử dụng features của phim (genres, TF-IDF, etc.)
2. **Collaborative Filtering (Item-based)**: Dựa trên similarity giữa các phim qua ratings
3. **Collaborative Filtering (User-based)**: Dựa trên similarity giữa các users

## 1. Import Libraries

In [None]:
# Notebook bootstrap: extend the import path, load third-party and
# project-local modules, then set pandas / plotting defaults.
import sys
import os

# Add src to path so the project-local `models` package imported below can be
# found. '../src' is relative, so it is resolved against the current working
# directory at the time this cell runs.
sys.path.append(os.path.abspath('../src'))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
# Suppresses every warning for the rest of the session (no category filter).
warnings.filterwarnings('ignore')

# Import custom modules (these rely on the sys.path entry added above)
from models.content_based import ContentBasedRecommender
from models.collaborative_filtering import CollaborativeFilteringRecommender

# Set display options: no column limit, at most 20 rows when printing frames
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 20)

# Set plot style and the default figure size used by later plots
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (14, 6)

# Confirmation plus a timestamp so a saved notebook shows when it was run
print("Libraries imported successfully")
print(f"Timestamp: {datetime.now()}")

## 2. Load Data

In [None]:
# Where the processed inputs live and where trained models will be written.
# Both paths are relative to the notebook's working directory.
data_dir = '../data/processed'
models_dir = '../data/models'

# Input artefacts consumed by the recommenders in the cells below.
movies_path = f'{data_dir}/movies_enriched.csv'
ratings_path = f'{data_dir}/ratings.csv'
tfidf_path = f'{data_dir}/tfidf_matrix.pkl'

# Ensure the output directory exists before any model gets saved.
os.makedirs(models_dir, exist_ok=True)

# Read both CSVs here only to inspect their sizes; the recommenders load
# their own data from the paths above.
movies = pd.read_csv(movies_path)
ratings = pd.read_csv(ratings_path)

print(
    f"Loaded {len(movies)} movies, {len(ratings)} ratings",
    f"Movies shape: {movies.shape}",
    f"Ratings shape: {ratings.shape}",
    sep="\n",
)

## 3. Content-Based Filtering

### 3.1 Model 1: TF-IDF Based

In [None]:
# Train and persist the content-based recommender that uses the TF-IDF
# matrix loaded from disk.
print("=" * 70, "CONTENT-BASED MODEL 1: TF-IDF Features", "=" * 70, sep="\n")

# Create the recommender and give it the movie table plus the TF-IDF pickle.
cb_tfidf = ContentBasedRecommender(verbose=True)
cb_tfidf.load_data(movies_path, tfidf_path)

# No feature-building call here: the TF-IDF matrix was loaded above, so the
# similarity is computed straight away.
cb_tfidf.compute_similarity()

# Persist the trained model next to the other model artefacts.
tfidf_model_path = f'{models_dir}/content_based_tfidf.pkl'
cb_tfidf.save_model(tfidf_model_path)

print("\nContent-Based (TF-IDF) model trained successfully!")

### 3.2 Model 2: Genre-Based

In [None]:
# Train and persist the content-based recommender built on genre features.
print("=" * 70, "CONTENT-BASED MODEL 2: Genre Features", "=" * 70, sep="\n")

# Create the recommender; only the movie table is supplied for this variant.
cb_genre = ContentBasedRecommender(verbose=True)
cb_genre.load_data(movies_path)

# Build the genre features first, then compute the similarity.
cb_genre.build_genre_features()
cb_genre.compute_similarity()

# Persist the trained model next to the other model artefacts.
genre_model_path = f'{models_dir}/content_based_genre.pkl'
cb_genre.save_model(genre_model_path)

print("\nContent-Based (Genre) model trained successfully!")

### 3.3 Model 3: Combined Features

In [None]:
# Train and persist the content-based recommender that mixes genre and
# numeric movie features.
print("=" * 70, "CONTENT-BASED MODEL 3: Combined Features", "=" * 70, sep="\n")

# Create the recommender; only the movie table is supplied for this variant.
cb_combined = ContentBasedRecommender(verbose=True)
cb_combined.load_data(movies_path)

# Feature recipe: genres weighted twice as heavily as the numeric columns.
combined_feature_options = {
    'numeric_cols': ['year', 'avg_rating', 'popularity', 'genres_count', 'movie_age'],
    'genre_weight': 2.0,
    'numeric_weight': 1.0,
}
cb_combined.build_combined_features(**combined_feature_options)

# Similarity is computed after the combined features have been built.
cb_combined.compute_similarity()

# Persist the trained model next to the other model artefacts.
combined_model_path = f'{models_dir}/content_based_combined.pkl'
cb_combined.save_model(combined_model_path)

print("\nContent-Based (Combined) model trained successfully!")

### 3.4 Test Content-Based Models

In [None]:
def _show_content_recs(heading, recommender, title, n=5):
    """Print a banner and the top-``n`` title-based recommendations.

    Args:
        heading: Text shown between the two separator lines.
        recommender: Object exposing ``get_recommendations_by_title``.
        title: Movie title to look up.
        n: Number of recommendations to request.

    Returns:
        Whatever ``get_recommendations_by_title`` returned, so the caller can
        keep it for later inspection.
    """
    print("\n" + "=" * 70)
    print(heading)
    print("=" * 70)
    recs = recommender.get_recommendations_by_title(title, n=n)
    print(recs[['title', 'year', 'genres', 'similarity_score']])
    return recs


# Smoke-test the three content-based models against one reference title.
print("=" * 70, "TESTING CONTENT-BASED MODELS", "=" * 70, sep="\n")

test_movie = "Toy Story"
print(f"\nGetting recommendations for: {test_movie}\n")

# Same query against each model so the outputs can be compared side by side.
recs_tfidf = _show_content_recs("TF-IDF Based Recommendations:", cb_tfidf, test_movie)
recs_genre = _show_content_recs("Genre Based Recommendations:", cb_genre, test_movie)
recs_combined = _show_content_recs("Combined Features Recommendations:", cb_combined, test_movie)

## 4. Collaborative Filtering

### 4.1 Item-Based Collaborative Filtering

In [None]:
# Train and persist the item-based collaborative filtering recommender.
print("=" * 70, "COLLABORATIVE FILTERING: Item-Based", "=" * 70, sep="\n")

# Create the recommender in item mode and point it at ratings + movies.
cf_item = CollaborativeFilteringRecommender(approach='item', verbose=True)
cf_item.load_data(ratings_path, movies_path)

# Fit with minimum-rating thresholds for users and movies.
print("\nFitting item-based CF model...", "(This may take a few minutes)\n", sep="\n")
item_fit_thresholds = {'min_ratings_per_user': 20, 'min_ratings_per_movie': 50}
cf_item.fit(**item_fit_thresholds)

# Persist the trained model next to the other model artefacts.
item_model_path = f'{models_dir}/collaborative_item_based.pkl'
cf_item.save_model(item_model_path)

print("\nItem-Based CF model trained successfully!")

### 4.2 User-Based Collaborative Filtering

In [None]:
# Train and persist the user-based collaborative filtering recommender.
print("=" * 70, "COLLABORATIVE FILTERING: User-Based", "=" * 70, sep="\n")

# Create the recommender in user mode and point it at ratings + movies.
cf_user = CollaborativeFilteringRecommender(approach='user', verbose=True)
cf_user.load_data(ratings_path, movies_path)

# Fit with the same minimum-rating thresholds as the item-based model.
print("\nFitting user-based CF model...", "(This may take a few minutes)\n", sep="\n")
user_fit_thresholds = {'min_ratings_per_user': 20, 'min_ratings_per_movie': 50}
cf_user.fit(**user_fit_thresholds)

# Persist the trained model next to the other model artefacts.
user_model_path = f'{models_dir}/collaborative_user_based.pkl'
cf_user.save_model(user_model_path)

print("\nUser-Based CF model trained successfully!")

### 4.3 Test Collaborative Filtering Models

In [None]:
# Smoke-test both collaborative filtering models with one movie and one user.
print("=" * 70, "TESTING COLLABORATIVE FILTERING MODELS", "=" * 70, sep="\n")

# Identifiers used for the two probes below.
toy_story_id = 1
test_user_id = 1

# --- Item-based: movies similar to the reference movie id ---
print(
    "\n" + "=" * 70,
    "Item-Based CF Recommendations (for Toy Story):",
    "=" * 70,
    sep="\n",
)
try:
    recs_cf_item = cf_item.get_item_based_recommendations(toy_story_id, n=5)
    print(recs_cf_item[['title', 'year', 'genres', 'similarity_score']])
except ValueError as err:
    # A ValueError is reported as a note instead of stopping the notebook.
    print(f"Note: {err}", "Movie may not have enough ratings in the filtered dataset.", sep="\n")

# --- User-based: predicted ratings for the reference user id ---
print(
    "\n" + "=" * 70,
    "User-Based CF Recommendations (for User ID 1):",
    "=" * 70,
    sep="\n",
)
try:
    recs_cf_user = cf_user.get_user_based_recommendations(test_user_id, n=5)
    print(recs_cf_user[['title', 'year', 'genres', 'predicted_rating']])
except ValueError as err:
    # A ValueError is reported as a note instead of stopping the notebook.
    print(f"Note: {err}", "User may not be in the filtered dataset.", sep="\n")

## 5. Model Comparison

In [None]:
# Compare the recommendations of the combined content-based model and the
# item-based CF model for one reference movie, then report their overlap.
print("=" * 70)
print("MODEL COMPARISON")
print("=" * 70)

# Compare recommendations for the same movie (title for the content-based
# lookup, id for the collaborative lookup).
comparison_movie = "Toy Story"
comparison_id = 1

print(f"\nComparing recommendations for: {comparison_movie}\n")

# Maps a method label to the set of titles it recommended; a method that
# fails simply has no entry.
results = {}

# Content-Based (Combined)
# `except Exception` (not a bare `except:`) keeps the best-effort behaviour
# while letting KeyboardInterrupt / SystemExit propagate, and the reason for
# the failure is shown instead of being discarded.
try:
    cb_recs = cb_combined.get_recommendations_by_title(comparison_movie, n=10)
    results['Content-Based'] = set(cb_recs['title'].tolist())
    print(f"Content-Based (Combined): {len(cb_recs)} recommendations")
except Exception as e:
    print("Content-Based: Failed")
    print(f"   Reason: {type(e).__name__}: {e}")

# Item-Based CF
try:
    cf_recs = cf_item.get_item_based_recommendations(comparison_id, n=10)
    results['Item-Based CF'] = set(cf_recs['title'].tolist())
    print(f"Item-Based CF: {len(cf_recs)} recommendations")
except Exception as e:
    print("Item-Based CF: Failed (movie may not have enough ratings)")
    print(f"   Reason: {type(e).__name__}: {e}")

# Analyze overlap (only meaningful when both methods produced results)
if len(results) == 2:
    overlap = results['Content-Based'] & results['Item-Based CF']
    print(f"\nOverlap between methods: {len(overlap)} movies")
    if overlap:
        # Sets have no stable order; sort so repeated runs print the same list.
        print(f"Common recommendations: {sorted(overlap, key=str)[:5]}")
else:
    print("\nOverlap analysis skipped: at least one method produced no recommendations")

## 6. Model Analysis & Insights

In [None]:
# Print qualitative trade-offs for each model family together with a few
# size statistics read from the trained recommenders.
print("=" * 70, "MODEL ANALYSIS", "=" * 70, sep="\n")

print(
    "\n1. CONTENT-BASED MODELS:",
    "   - Strengths: Works for new items, explainable, no cold-start for items",
    "   - Weaknesses: Limited by features, may create filter bubbles",
    sep="\n",
)
print(f"   - Feature matrix size: {cb_combined.feature_matrix.shape}")
print(f"   - Similarity matrix size: {cb_combined.similarity_matrix.shape}")

print(
    "\n2. COLLABORATIVE FILTERING (Item-Based):",
    "   - Strengths: Finds unexpected patterns, quality recommendations",
    "   - Weaknesses: Cold-start problem, needs interaction data",
    sep="\n",
)
item_matrix = cf_item.user_item_matrix
print(f"   - User-item matrix shape: {item_matrix.shape}")
# Sparsity = percentage of cells in the user-item matrix that equal 0.
item_sparsity = (item_matrix.values == 0).sum() / item_matrix.size * 100
print(f"   - Sparsity: {item_sparsity:.2f}%")

print(
    "\n3. COLLABORATIVE FILTERING (User-Based):",
    "   - Strengths: Personalized, finds similar users",
    "   - Weaknesses: Scalability issues, needs active users",
    sep="\n",
)
user_matrix = cf_user.user_item_matrix
print(f"   - User-item matrix shape: {user_matrix.shape}")
# Same sparsity measure, computed on the user-based model's matrix.
user_sparsity = (user_matrix.values == 0).sum() / user_matrix.size * 100
print(f"   - Sparsity: {user_sparsity:.2f}%")

## 7. Save Model Metadata

In [None]:
# Save model metadata
metadata = {
    'training_date': str(datetime.now()),
    'models': {
        'content_based_tfidf': {
            'path': f'{models_dir}/content_based_tfidf.pkl',
            'type': 'Content-Based',
            'features': 'TF-IDF (200 features)',
            'num_movies': len(movies)
        },
        'content_based_genre': {
            'path': f'{models_dir}/content_based_genre.pkl',
            'type': 'Content-Based',
            'features': 'Genre Binary Features',
            'num_movies': len(movies)
        },
        'content_based_combined': {
            'path': f'{models_dir}/content_based_combined.pkl',
            'type': 'Content-Based',
            'features': 'Genre + Numeric Features',
            'num_movies': len(movies)
        },
        'collaborative_item_based': {
            'path': f'{models_dir}/collaborative_item_based.pkl',
            'type': 'Collaborative Filtering (Item-Based)',
            'matrix_shape': str(cf_item.user_item_matrix.shape),
            'sparsity': f"{(cf_item.user_item_matrix.values == 0).sum() / cf_item.user_item_matrix.size * 100:.2f}%"
        },
        'collaborative_user_based': {
            'path': f'{models_dir}/collaborative_user_based.pkl',
            'type': 'Collaborative Filtering (User-Based)',
            'matrix_shape': str(cf_user.user_item_matrix.shape),
            'sparsity': f"{(cf_user.user_item_matrix.values == 0).sum() / cf_user.user_item_matrix.size * 100:.2f}%"
        }
    }
}

import json

metadata_path = f'{models_dir}/models_metadata.json'
with open(metadata_path, 'w') as f:
    json.dump(metadata, f, indent=2)

print(f"Model metadata saved to: {metadata_path}")
print("\nMetadata:")
print(json.dumps(metadata, indent=2))

## 8. Summary

In [None]:
# Final recap: which models were trained and where each one was saved.
print("=" * 70, "MODEL BUILDING SUMMARY", "=" * 70, sep="\n")

print(
    "\nMODELS TRAINED:",
    "\n1. Content-Based Filtering:",
    "   - TF-IDF based model",
    "   - Genre based model",
    "   - Combined features model",
    sep="\n",
)

print(
    "\n2. Collaborative Filtering:",
    "   - Item-based model",
    "   - User-based model",
    sep="\n",
)

# Paths come from the metadata dictionary built in the previous section.
print("\nSAVED MODELS:")
for model_name, model_info in metadata['models'].items():
    saved_to = model_info['path']
    print(f"   - {model_name}: {saved_to}")

print("\n" + "=" * 70, "MODEL BUILDING COMPLETED SUCCESSFULLY!", "=" * 70, sep="\n")