
### Collaborative Filtering Deep Dive

"""
### Collaborative Filtering: From Neighbors to Latent Factors
==========================================================
This notebook implements and analyzes collaborative filtering methods:
- User-based CF with different similarity metrics
- Item-based CF 
- Matrix Factorization (SVD)
- Alternating Least Squares (ALS)
- Hyperparameter tuning
- Detailed comparison and analysis
"""

In [None]:
# ============================================================================
# Cell 1: Imports and Setup
# ============================================================================
# Standard library
import os
import sys
import time
import warnings

# Third-party
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity

# Make the project root importable before pulling in `src.*`.
sys.path.append('..')
warnings.filterwarnings('ignore')

# Project-local
from src.data_loader import MovieLensLoader
from src.preprocess import prepare_data_for_training
from src.recommenders.collaborative import (
    UserBasedCF,
    ItemBasedCF,
    MatrixFactorizationSVD,
    AlternatingLeastSquares
)
from src.evaluation import RecommenderEvaluator
from src.utils import plot_model_comparison

# Global plot styling: apply the matplotlib style sheet first, then set the
# seaborn colour palette (kept in this order; both touch the default colours).
# NOTE(review): the 'seaborn-v0_8-*' style names presumably need a recent
# matplotlib release — confirm against the project's pinned version.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Reached only when every import in this cell succeeded.
print("✓ Imports successful")



In [None]:

# ============================================================================
# Cell 2: Load and Prepare Data
# ============================================================================
# Fetch the raw ratings and movie tables through the project loader.
print("Loading MovieLens data...")
loader = MovieLensLoader()
ratings = loader.load_ratings()
movies = loader.load_movies()

# All split settings live in one mapping so they can be scanned and tuned
# in a single place.
split_options = dict(
    test_size=0.2,
    split_method='user_based',  # intended to keep every user in both sets
    min_user_ratings=10,
    min_item_ratings=5,
    random_state=42,
)
print("\nPreparing train/test split...")
train, test, metadata = prepare_data_for_training(ratings, **split_options)

# Item catalogue used later by the evaluator.
# NOTE(review): built from the raw `ratings` table rather than from the split
# output; if the min_*_ratings filters drop items, those items still count
# here — confirm this is the intended denominator for coverage.
all_items = set(ratings['item_id'].unique())

# Short summary of what the rest of the notebook will work with.
print("\n✓ Data loaded")
print(f"  Train: {len(train):,} ratings")
print(f"  Test: {len(test):,} ratings")
print(f"  Users: {train['user_id'].nunique():,}")
print(f"  Items: {train['item_id'].nunique():,}")
print(f"  Sparsity: {metadata['sparsity']:.4f}")


In [None]:
# ============================================================================
# Cell 3: Understanding the Data Matrix
# ============================================================================
rule = "="*70
print("\n" + rule)
print("UNDERSTANDING THE USER-ITEM MATRIX")
print(rule)

# Small corner of the rating matrix: the first 20 users and first 30 items
# in order of appearance in the training frame.
sample_users = train['user_id'].unique()[:20]
sample_items = train['item_id'].unique()[:30]

in_sample = train['user_id'].isin(sample_users) & train['item_id'].isin(sample_items)
sample_data = train[in_sample]

# Users as rows, items as columns; missing (user, item) pairs become 0.
sample_matrix = sample_data.pivot_table(
    index='user_id',
    columns='item_id',
    values='rating',
    fill_value=0
)

# Two side-by-side views of the same sample: raw ratings and a rated/not-rated mask.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
heatmap_panels = (
    (sample_matrix, 'YlOrRd', 'Rating', 'User-Item Rating Matrix (Sample)'),
    ((sample_matrix > 0).astype(int), 'RdYlGn', 'Rated', 'Interaction Pattern (Binary)'),
)
for ax, (grid, colormap, bar_label, panel_title) in zip(axes, heatmap_panels):
    sns.heatmap(
        grid,
        cmap=colormap,
        cbar_kws={'label': bar_label},
        ax=ax,
        xticklabels=False,
        yticklabels=True
    )
    ax.set_title(panel_title, fontsize=14, fontweight='bold')
    ax.set_xlabel('Items')
    ax.set_ylabel('Users')

plt.tight_layout()
plt.show()

# Share of non-zero cells in the sampled matrix.
print(f"\nSample matrix density: {(sample_matrix > 0).sum().sum() / sample_matrix.size:.4f}")


In [None]:

# ============================================================================
# Cell 4: User-Based Collaborative Filtering
# ============================================================================
rule = "="*70
print("\n" + rule)
print("USER-BASED COLLABORATIVE FILTERING")
print(rule)

# --- 1. Cosine-similarity model, with wall-clock fit time -------------------
print("\n1. Training User-Based CF with Cosine Similarity...")
user_cf_cosine = UserBasedCF(k=50, similarity='cosine')
fit_started = time.time()
user_cf_cosine.fit(train)
train_time = time.time() - fit_started
print(f"   Training time: {train_time:.2f}s")

# Single spot-check prediction for a fixed (user, item) pair.
pred = user_cf_cosine.predict(user_id=1, item_id=100)
print(f"\nSample prediction (user=1, item=100): {pred:.2f}")

# Top-N list for the same user, resolved to movie titles where available.
print("\nTop 10 recommendations for User 1:")
recs = user_cf_cosine.recommend(user_id=1, n=10, exclude_seen=True)
for i, (item_id, score) in enumerate(recs, 1):
    title_matches = movies.loc[movies['item_id'] == item_id, 'title'].values
    title = "Unknown" if len(title_matches) == 0 else title_matches[0]
    print(f"  {i:2d}. {title[:50]:50s} (score: {score:.3f})")

# --- 2. Pearson-correlation variant -----------------------------------------
# Fitted here; it is not referenced again within this cell.
print("\n2. Training User-Based CF with Pearson Correlation...")
user_cf_pearson = UserBasedCF(k=50, similarity='pearson')
user_cf_pearson.fit(train)

# --- 3. Similarity structure of the cosine model -----------------------------
print("\n3. User Similarity Analysis")
user_sim = user_cf_cosine.user_similarity
sample_sim = user_sim[:50, :50]

similarity_heatmap_style = dict(
    cmap='coolwarm',
    center=0,
    vmin=-1,
    vmax=1,
    cbar_kws={'label': 'Similarity'},
    xticklabels=False,
    yticklabels=False,
)
plt.figure(figsize=(12, 10))
sns.heatmap(sample_sim, **similarity_heatmap_style)
plt.title('User Similarity Matrix (Sample - First 50 Users)', fontsize=14, fontweight='bold')
plt.xlabel('Users')
plt.ylabel('Users')
plt.tight_layout()
plt.show()

# Strict upper triangle only: each user pair once, diagonal excluded.
sim_values = user_sim[np.triu_indices_from(user_sim, k=1)]

plt.figure(figsize=(10, 6))
plt.hist(sim_values, bins=50, color='skyblue', edgecolor='black', alpha=0.7)
plt.axvline(
    sim_values.mean(),
    color='red',
    linestyle='--',
    linewidth=2,
    label=f'Mean: {sim_values.mean():.3f}',
)
plt.title('Distribution of User Similarities', fontsize=14, fontweight='bold')
plt.xlabel('Similarity Score')
plt.ylabel('Frequency')
plt.legend()
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

print(f"\nSimilarity statistics:")
print(f"  Mean: {sim_values.mean():.4f}")
print(f"  Std: {sim_values.std():.4f}")
print(f"  Min: {sim_values.min():.4f}")
print(f"  Max: {sim_values.max():.4f}")


In [None]:
# ============================================================================
# Cell 5: Item-Based Collaborative Filtering
# ============================================================================
print("\n" + "="*70)
print("ITEM-BASED COLLABORATIVE FILTERING")
print("="*70)

# --- 1. Fit the item-item model and time it ----------------------------------
print("\n1. Training Item-Based CF...")
item_cf = ItemBasedCF(k=50, similarity='cosine')
start_time = time.time()
item_cf.fit(train)
train_time = time.time() - start_time
print(f"   Training time: {train_time:.2f}s")

# Top-N list for a fixed user, resolved to movie titles where available.
print("\nTop 10 recommendations for User 1:")
recs = item_cf.recommend(user_id=1, n=10, exclude_seen=True)
for i, (item_id, score) in enumerate(recs, 1):
    movie_title = movies[movies['item_id'] == item_id]['title'].values
    title = movie_title[0] if len(movie_title) > 0 else "Unknown"
    print(f"  {i:2d}. {title[:50]:50s} (score: {score:.3f})")

# --- 2. Nearest neighbours of one reference item -----------------------------
print("\n2. Similar Items Analysis")
sample_item = 1  # Toy Story
# Same "Unknown" fallback as every other title lookup in this notebook, so a
# missing metadata row does not abort the cell with an IndexError.
sample_title_matches = movies[movies['item_id'] == sample_item]['title'].values
movie_title = sample_title_matches[0] if len(sample_title_matches) > 0 else "Unknown"
print(f"\nMovies similar to '{movie_title}':")

# Row of the similarity matrix belonging to the reference item.
item_idx = item_cf.item_id_map[sample_item]
similarities = item_cf.item_similarity[item_idx, :]

# Rank all items by descending similarity and drop the reference item by its
# index. Slicing off "the last element" of the sorted order is not safe: the
# item is only guaranteed to be last when its self-similarity is the unique
# maximum, which fails on ties or when the diagonal has been zeroed.
ranked_indices = np.argsort(similarities)[::-1]
top_indices = ranked_indices[ranked_indices != item_idx][:10]

for rank, idx in enumerate(top_indices, 1):
    similar_item_id = item_cf.reverse_item_map[idx]
    sim_score = similarities[idx]
    similar_title = movies[movies['item_id'] == similar_item_id]['title'].values
    title = similar_title[0] if len(similar_title) > 0 else "Unknown"
    print(f"  {rank:2d}. {title[:50]:50s} (similarity: {sim_score:.3f})")

# --- 3. Heatmap of the first 50x50 corner of the item similarity matrix ------
sample_item_sim = item_cf.item_similarity[:50, :50]

plt.figure(figsize=(12, 10))
sns.heatmap(
    sample_item_sim,
    cmap='coolwarm',
    center=0,
    vmin=-1,
    vmax=1,
    cbar_kws={'label': 'Similarity'},
    xticklabels=False,
    yticklabels=False
)
plt.title('Item Similarity Matrix (Sample - First 50 Items)', fontsize=14, fontweight='bold')
plt.xlabel('Items')
plt.ylabel('Items')
plt.tight_layout()
plt.show()

In [None]:
# ============================================================================
# Cell 6: Matrix Factorization (SVD)
# ============================================================================
rule = "="*70
print("\n" + rule)
print("MATRIX FACTORIZATION (SVD)")
print(rule)

# --- 1. Fit a 50-factor model and time it ------------------------------------
print("\n1. Training SVD with 50 factors...")
svd_model = MatrixFactorizationSVD(n_factors=50, random_state=42)
fit_started = time.time()
svd_model.fit(train)
train_time = time.time() - fit_started
print(f"   Training time: {train_time:.2f}s")

# --- 2. Inspect the learned factors ------------------------------------------
print("\n2. Latent Factor Analysis")
print(f"   User factors shape: {svd_model.user_factors.shape}")
print(f"   Item factors shape: {svd_model.item_factors.shape}")
print(f"   Singular values: {svd_model.sigma}")

# Singular values plotted against their 1-based position.
singular_values = svd_model.sigma
factor_positions = range(1, len(singular_values) + 1)

plt.figure(figsize=(10, 6))
plt.plot(factor_positions, singular_values, marker='o', linewidth=2, markersize=8)
plt.title('Singular Values (Importance of Latent Factors)', fontsize=14, fontweight='bold')
plt.xlabel('Factor Index')
plt.ylabel('Singular Value')
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

# First two factor columns of the user and item matrices, side by side.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
embedding_panels = (
    (svd_model.user_factors, 'User Embeddings (First 2 Dimensions)', {}),
    (svd_model.item_factors, 'Item Embeddings (First 2 Dimensions)', {'color': 'coral'}),
)
for ax, (factors, panel_title, extra_style) in zip(axes, embedding_panels):
    ax.scatter(factors[:, 0], factors[:, 1], alpha=0.5, s=20, **extra_style)
    ax.set_title(panel_title, fontsize=14, fontweight='bold')
    ax.set_xlabel('Factor 1')
    ax.set_ylabel('Factor 2')
    ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()

# Top-N list for a fixed user, resolved to movie titles where available.
print("\nTop 10 recommendations for User 1:")
recs = svd_model.recommend(user_id=1, n=10, exclude_seen=True)
for i, (item_id, score) in enumerate(recs, 1):
    title_matches = movies.loc[movies['item_id'] == item_id, 'title'].values
    title = "Unknown" if len(title_matches) == 0 else title_matches[0]
    print(f"  {i:2d}. {title[:50]:50s} (score: {score:.3f})")


In [None]:
# ============================================================================
# Cell 7: Alternating Least Squares (ALS)
# ============================================================================
rule = "="*70
print("\n" + rule)
print("ALTERNATING LEAST SQUARES (ALS)")
print(rule)

# Model settings kept together so the configuration reads as one unit.
als_settings = dict(
    n_factors=20,
    regularization=0.01,
    iterations=15,
    random_state=42,
)

print("\n1. Training ALS...")
als_model = AlternatingLeastSquares(**als_settings)
fit_started = time.time()
als_model.fit(train)
train_time = time.time() - fit_started
print(f"   Total training time: {train_time:.2f}s")

# Top-N list for a fixed user, resolved to movie titles where available.
print("\nTop 10 recommendations for User 1:")
recs = als_model.recommend(user_id=1, n=10, exclude_seen=True)
for i, (item_id, score) in enumerate(recs, 1):
    title_matches = movies.loc[movies['item_id'] == item_id, 'title'].values
    title = "Unknown" if len(title_matches) == 0 else title_matches[0]
    print(f"  {i:2d}. {title[:50]:50s} (score: {score:.3f})")


In [None]:

# ============================================================================
# Cell 8: Hyperparameter Tuning - K Neighbors
# ============================================================================
print("\n" + "="*70)
print("HYPERPARAMETER TUNING: K NEIGHBORS")
print("="*70)

k_values = [10, 20, 30, 50, 75, 100]
results_by_k = []

# Metrics collected per K and plotted below (one subplot each). Defined once
# so the collected columns and the plotted panels cannot drift apart.
metrics = ['NDCG@10', 'Precision@10', 'Recall@10', 'Coverage']

evaluator = RecommenderEvaluator(k_values=[10])

# NOTE(review): each candidate K is scored on `test`, and the winner is later
# re-evaluated on the same `test` set — consider a separate validation split
# so the final numbers are not optimistically biased.
print("\nTesting different K values for Item-Based CF...")
for k in k_values:
    print(f"\n  Testing k={k}...")
    model = ItemBasedCF(k=k, similarity='cosine')
    model.fit(train)

    results = evaluator.evaluate_model(
        model, test, train, all_items,
        n_recommendations=10,
        verbose=False
    )

    # One record per K; the DataFrame is built once after the loop.
    results_by_k.append({'k': k, **{metric: results[metric] for metric in metrics}})

k_results_df = pd.DataFrame(results_by_k)
print("\nResults by K:")
print(k_results_df)

# Plot each metric against K on a 2x2 grid (row-major order).
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

for ax, metric in zip(axes.flat, metrics):
    ax.plot(k_results_df['k'], k_results_df[metric],
            marker='o', linewidth=2, markersize=8)
    ax.set_title(f'{metric} vs K', fontsize=12, fontweight='bold')
    ax.set_xlabel('K (Number of Neighbors)')
    ax.set_ylabel(metric)
    ax.grid(alpha=0.3)

    # Mark best value. All four metrics, Coverage included, are
    # higher-is-better, so a single idxmax covers every panel.
    best_idx = k_results_df[metric].idxmax()
    best_k = k_results_df.loc[best_idx, 'k']
    best_val = k_results_df.loc[best_idx, metric]
    ax.axvline(best_k, color='red', linestyle='--', alpha=0.5)
    ax.annotate(f'Best: k={int(best_k)}',
                xy=(best_k, best_val),
                xytext=(10, 10), textcoords='offset points',
                bbox=dict(boxstyle='round', facecolor='yellow', alpha=0.5))

plt.tight_layout()
plt.show()


In [None]:
# ============================================================================
# Cell 9: Hyperparameter Tuning - Latent Factors
# ============================================================================
rule = "="*70
print("\n" + rule)
print("HYPERPARAMETER TUNING: LATENT FACTORS")
print(rule)

factor_values = [10, 20, 30, 50, 75, 100]
results_by_factors = []

# Largest factor count this cell will try: one less than the smaller of the
# user and item counts in the training data. Computed once up front.
max_factors = min(train['user_id'].nunique(), train['item_id'].nunique()) - 1

# Uses the `evaluator` created in the K-neighbours tuning cell.
print("\nTesting different number of latent factors for SVD...")
for n_factors in factor_values:
    if n_factors > max_factors:
        print(f"  Skipping n_factors={n_factors} (too large)")
        continue

    print(f"\n  Testing n_factors={n_factors}...")
    model = MatrixFactorizationSVD(n_factors=n_factors, random_state=42)
    model.fit(train)

    results = evaluator.evaluate_model(
        model, test, train, all_items,
        n_recommendations=10,
        verbose=False
    )

    record = {'n_factors': n_factors}
    for metric_name in ('NDCG@10', 'Precision@10', 'Recall@10', 'RMSE'):
        record[metric_name] = results[metric_name]
    results_by_factors.append(record)

factors_results_df = pd.DataFrame(results_by_factors)
print("\nResults by Number of Factors:")
print(factors_results_df)

# One subplot per metric on a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

metrics = ['NDCG@10', 'Precision@10', 'Recall@10', 'RMSE']
for idx, metric in enumerate(metrics):
    grid_row, grid_col = divmod(idx, 2)
    ax = axes[grid_row, grid_col]
    ax.plot(factors_results_df['n_factors'], factors_results_df[metric],
            marker='o', linewidth=2, markersize=8, color='coral')
    ax.set_title(f'{metric} vs Number of Factors', fontsize=12, fontweight='bold')
    ax.set_xlabel('Number of Latent Factors')
    ax.set_ylabel(metric)
    ax.grid(alpha=0.3)

    # RMSE is an error (lower is better); the ranking metrics are higher-is-better.
    metric_column = factors_results_df[metric]
    best_idx = metric_column.idxmin() if metric == 'RMSE' else metric_column.idxmax()
    best_factors = factors_results_df.loc[best_idx, 'n_factors']
    best_val = factors_results_df.loc[best_idx, metric]
    ax.axvline(best_factors, color='red', linestyle='--', alpha=0.5)
    ax.annotate(f'Best: {int(best_factors)}',
                xy=(best_factors, best_val),
                xytext=(10, 10), textcoords='offset points',
                bbox=dict(boxstyle='round', facecolor='yellow', alpha=0.5))

plt.tight_layout()
plt.show()

In [None]:
# ============================================================================
# Cell 10: Comprehensive Model Comparison
# ============================================================================
print("\n" + "="*70)
print("COMPREHENSIVE MODEL COMPARISON")
print("="*70)

# Use best hyperparameters from tuning (both selected by NDCG@10).
best_k = k_results_df.loc[k_results_df['NDCG@10'].idxmax(), 'k']
best_factors = factors_results_df.loc[factors_results_df['NDCG@10'].idxmax(), 'n_factors']

print(f"\nUsing optimized hyperparameters:")
print(f"  Best K for neighborhood methods: {int(best_k)}")
print(f"  Best latent factors for MF: {int(best_factors)}")

# Train all models with best params. The dict keeps insertion order, which is
# the order the models are handed to the evaluator.
models = {}

print("\nTraining models...")
models['User-CF'] = UserBasedCF(k=int(best_k), similarity='cosine')
models['User-CF'].fit(train)

models['Item-CF'] = ItemBasedCF(k=int(best_k), similarity='cosine')
models['Item-CF'].fit(train)

models['SVD'] = MatrixFactorizationSVD(n_factors=int(best_factors), random_state=42)
models['SVD'].fit(train)

# Seeded like the SVD model above and the standalone ALS run earlier in the
# notebook, so the final comparison is reproducible across kernel restarts.
# NOTE(review): iterations=10 here vs 15 in the ALS section — confirm which
# setting the comparison is meant to report.
models['ALS'] = AlternatingLeastSquares(
    n_factors=20,
    regularization=0.01,
    iterations=10,
    random_state=42
)
models['ALS'].fit(train)

# Evaluate all models.
# NOTE(review): k_values goes up to 20 but only 10 recommendations are
# requested per user; if the evaluator truncates to the list it is given, the
# @20 metrics are computed from 10 items — confirm against RecommenderEvaluator.
evaluator_full = RecommenderEvaluator(k_values=[5, 10, 20])
results_df = evaluator_full.compare_models(
    models=models,
    test_data=test,
    train_data=train,
    all_items=all_items,
    n_recommendations=10
)

print("\n" + "="*70)
print("FINAL RESULTS")
print("="*70)
print(results_df)

# Visualize comparison on the headline metrics only.
key_metrics = ['Precision@10', 'Recall@10', 'NDCG@10', 'Coverage', 'RMSE']
plot_model_comparison(results_df, metrics=key_metrics)

In [None]:

# ============================================================================
# Cell 11: Recommendation Quality Analysis
# ============================================================================
def _title_for(item_id):
    """Return the title stored in ``movies`` for ``item_id``, or "Unknown" if none matches."""
    matches = movies[movies['item_id'] == item_id]['title'].values
    return matches[0] if len(matches) > 0 else "Unknown"


rule = "="*70
print("\n" + rule)
print("RECOMMENDATION QUALITY ANALYSIS")
print(rule)

# One fixed user: gather their training rows once and reuse them below.
sample_user = 10
user_history = train[train['user_id'] == sample_user]
user_train_items = set(user_history['item_id'])

print(f"\nUser {sample_user} has rated {len(user_train_items)} items in training set")
print("\nMovies rated by user:")
# Show up to five of the user's rated items, in the set's iteration order.
for item_id in list(user_train_items)[:5]:
    title = _title_for(item_id)
    rating = user_history[user_history['item_id'] == item_id]['rating'].values[0]
    print(f"  • {title[:50]:50s} (rating: {rating})")

# Same user, top-5 list from every model trained in the comparison cell.
print("\nRecommendations from different models:")
for model_name, model in models.items():
    print(f"\n{model_name}:")
    recs = model.recommend(user_id=sample_user, n=5, exclude_seen=True, seen_items=user_train_items)
    for i, (item_id, score) in enumerate(recs, 1):
        title = _title_for(item_id)
        print(f"  {i}. {title[:45]:45s} (score: {score:.3f})")



In [None]:
# ============================================================================
# Cell 12: Key Insights and Learnings
# ============================================================================
rule = "="*70
print("\n" + rule)
print("KEY INSIGHTS FROM COLLABORATIVE FILTERING")
print(rule)

# Winner of the final comparison by NDCG@10. `best_k` and `best_factors`
# come from the model-comparison cell.
ndcg_by_model = results_df['NDCG@10']
best_model = ndcg_by_model.idxmax()
best_ndcg = results_df.loc[best_model, 'NDCG@10']

# Only the first bullet group is data-driven; the rest is fixed commentary.
insights = f"""
🎯 PERFORMANCE SUMMARY:
   • Best model: {best_model} (NDCG@10: {best_ndcg:.4f})
   • Item-CF typically outperforms User-CF for sparse data
   • Matrix factorization provides good balance of accuracy and efficiency
   • Optimal K for neighborhoods: {int(best_k)}
   • Optimal latent factors: {int(best_factors)}

📊 USER-BASED vs ITEM-BASED CF:
   • Item-based is more stable (item similarities change less over time)
   • User-based can capture more diverse preferences
   • Item-based scales better with more users
   • User-based may perform better with dense data

🔬 MATRIX FACTORIZATION INSIGHTS:
   • SVD captures latent factors efficiently
   • More factors = more expressiveness but risk of overfitting
   • ALS handles implicit feedback well
   • Regularization is crucial for generalization

⚡ COMPUTATIONAL CONSIDERATIONS:
   • Neighborhood methods: O(n²) similarity computation
   • Matrix factorization: O(k·n·m) where k is iterations
   • Item-CF can precompute similarities (better for real-time)
   • SVD one-time computation, fast inference

🎭 COLD START HANDLING:
   • User-CF struggles with new users (no ratings history)
   • Item-CF struggles with new items (no rating history)
   • Matrix factorization cannot handle completely new users/items
   • Need content-based or hybrid approaches for cold start

🚀 PRODUCTION RECOMMENDATIONS:
   • Use Item-CF for interpretability and real-time updates
   • Use matrix factorization for batch recommendations
   • Consider hybrid approach for best of both worlds
   • Implement incremental updates for efficiency
"""

print(insights)



In [None]:
# ============================================================================
# Cell 13: Save Results and Models
# ============================================================================
print("\n" + "="*70)
print("SAVING RESULTS AND MODELS")
print("="*70)

# Output locations, relative to the notebook's working directory. Named once
# so the directory creation and the file paths below cannot drift apart.
# (`os` is imported with the other imports at the top of the notebook.)
results_dir = '../data/processed'
models_dir = '../models'
for output_dir in (results_dir, models_dir):
    os.makedirs(output_dir, exist_ok=True)

# Save evaluation results: final comparison plus both tuning sweeps.
results_df.to_csv(os.path.join(results_dir, 'collaborative_filtering_results.csv'))
k_results_df.to_csv(os.path.join(results_dir, 'k_tuning_results.csv'))
factors_results_df.to_csv(os.path.join(results_dir, 'factors_tuning_results.csv'))

print("✓ Results saved to data/processed/")

# Save best models: the tuned Item-CF and SVD instances from the comparison cell.
best_item_cf = models['Item-CF']
best_svd = models['SVD']

best_item_cf.save(os.path.join(models_dir, 'item_cf_best.pkl'))
best_svd.save(os.path.join(models_dir, 'svd_best.pkl'))

print("✓ Best models saved to models/")
print("\n✅ Collaborative filtering analysis complete!")