In [1]:
import pandas as pd
import numpy as np
import pickle
import time
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
import warnings
warnings.filterwarnings('ignore')


### 1. Load Data

In [2]:
ratings = pd.read_csv('data/processed/processed_ratings.csv')
movies = pd.read_csv('data/processed/processed_movies.csv')

print(f"✓ Loaded {len(ratings):,} ratings")
print(f"✓ Loaded {len(movies):,} movies")


✓ Loaded 100,000 ratings
✓ Loaded 1,682 movies


### 2. User Item Matrix

In [3]:
user_item_matrix = ratings.pivot_table(
    index='user_id',
    columns='item_id',
    values='rating',
    fill_value=0
)

print(f"✓ Matrix shape: {user_item_matrix.shape}")
print(f"✓ Sparsity: {(1 - (ratings.shape[0] / (user_item_matrix.shape[0] * user_item_matrix.shape[1])))*100:.2f}%")

✓ Matrix shape: (943, 1682)
✓ Sparsity: 93.70%


### 3. Train-Test Split

In [4]:
train_data, test_data = train_test_split(ratings, test_size=0.2, random_state=42)
print(f"✓ Training set: {len(train_data):,} ratings")
print(f"✓ Test set: {len(test_data):,} ratings")

# Create train matrix
train_matrix = train_data.pivot_table(
    index='user_id',
    columns='item_id',
    values='rating',
    fill_value=0
)

✓ Training set: 80,000 ratings
✓ Test set: 20,000 ratings


### 4. Evaluation 

In [5]:
def precision_at_k(predictions, k=10):
    """Calculate Precision@K"""
    precisions = []
    
    for user_id in test_data['user_id'].unique():
        # Get actual rated items by user in test set
        actual = set(test_data[test_data['user_id'] == user_id]['item_id'].values)
        
        if user_id in predictions and len(actual) > 0:
            # Get top-k predicted items
            predicted = set(predictions[user_id][:k])
            
            # Calculate precision
            if len(predicted) > 0:
                precision = len(actual & predicted) / k
                precisions.append(precision)
    
    return np.mean(precisions) if precisions else 0

def calculate_metrics(predictions, k=10):
    """Calculate multiple metrics"""
    # Precision@K
    precision = precision_at_k(predictions, k)
    
    # Coverage (what % of items can be recommended)
    all_recommended = set()
    for recs in predictions.values():
        all_recommended.update(recs[:k])
    coverage = len(all_recommended) / movies['movie_id'].nunique()
    
    return {
        'precision@10': precision,
        'coverage': coverage,
        'avg_recommendations': np.mean([len(v) for v in predictions.values()])
    }


### 5. User Based Collaborative Filtering

In [6]:
start_time = time.time()

# Calculate user similarity
user_similarity = cosine_similarity(train_matrix)
user_similarity_df = pd.DataFrame(
    user_similarity,
    index=train_matrix.index,
    columns=train_matrix.index
)

def user_based_recommendations(user_id, n=10, n_neighbors=20):
    """Generate recommendations using user-based CF"""
    if user_id not in user_similarity_df.index:
        return []
    
    # Get similar users
    similar_users = user_similarity_df[user_id].sort_values(ascending=False)[1:n_neighbors+1]
    
    # Get items the user hasn't rated
    user_items = set(train_data[train_data['user_id'] == user_id]['item_id'].values)
    
    # Score items based on similar users' ratings
    item_scores = {}
    for sim_user, similarity in similar_users.items():
        sim_user_items = train_data[train_data['user_id'] == sim_user]
        for _, row in sim_user_items.iterrows():
            if row['item_id'] not in user_items:
                if row['item_id'] not in item_scores:
                    item_scores[row['item_id']] = 0
                item_scores[row['item_id']] += similarity * row['rating']
    
    # Sort and return top N
    recommendations = sorted(item_scores.items(), key=lambda x: x[1], reverse=True)[:n]
    return [item_id for item_id, score in recommendations]

# Generate predictions for test users
user_based_predictions = {}
test_users = test_data['user_id'].unique()[:100]  # Sample for faster evaluation
for user_id in test_users:
    user_based_predictions[user_id] = user_based_recommendations(user_id)

training_time_ub = time.time() - start_time
metrics_ub = calculate_metrics(user_based_predictions)

print(f"✓ Training time: {training_time_ub:.2f} seconds")
print(f"✓ Precision@10: {metrics_ub['precision@10']:.4f}")
print(f"✓ Coverage: {metrics_ub['coverage']:.4f}")

✓ Training time: 11.70 seconds
✓ Precision@10: 0.4760
✓ Coverage: 0.1207


### 6. Item Based Collaborative Filtering

In [7]:
start_time = time.time()

# Calculate item similarity
item_similarity = cosine_similarity(train_matrix.T)
item_similarity_df = pd.DataFrame(
    item_similarity,
    index=train_matrix.columns,
    columns=train_matrix.columns
)

def item_based_recommendations(user_id, n=10):
    """Generate recommendations using item-based CF"""
    # Get items the user has rated
    user_items = train_data[train_data['user_id'] == user_id]
    
    if len(user_items) == 0:
        return []
    
    # Score items based on similarity to user's rated items
    item_scores = {}
    for _, row in user_items.iterrows():
        item_id = row['item_id']
        if item_id in item_similarity_df.index:
            similar_items = item_similarity_df[item_id].sort_values(ascending=False)[1:51]
            
            for sim_item, similarity in similar_items.items():
                if sim_item not in user_items['item_id'].values:
                    if sim_item not in item_scores:
                        item_scores[sim_item] = 0
                    item_scores[sim_item] += similarity * row['rating']
    
    # Sort and return top N
    recommendations = sorted(item_scores.items(), key=lambda x: x[1], reverse=True)[:n]
    return [item_id for item_id, score in recommendations]

# Generate predictions
item_based_predictions = {}
for user_id in test_users:
    item_based_predictions[user_id] = item_based_recommendations(user_id)

training_time_ib = time.time() - start_time
metrics_ib = calculate_metrics(item_based_predictions)

print(f"✓ Training time: {training_time_ib:.2f} seconds")
print(f"✓ Precision@10: {metrics_ib['precision@10']:.4f}")
print(f"✓ Coverage: {metrics_ib['coverage']:.4f}")


✓ Training time: 10.12 seconds
✓ Precision@10: 0.4670
✓ Coverage: 0.0773


### 7. SVD Matrix Factorization

In [8]:
start_time = time.time()

# Apply SVD
n_factors = 50
svd = TruncatedSVD(n_components=n_factors, random_state=42)
user_factors = svd.fit_transform(train_matrix)
item_factors = svd.components_.T

# Reconstruct rating matrix
predicted_ratings = np.dot(user_factors, item_factors.T)
predicted_ratings_df = pd.DataFrame(
    predicted_ratings,
    index=train_matrix.index,
    columns=train_matrix.columns
)

def svd_recommendations(user_id, n=10):
    """Generate recommendations using SVD"""
    if user_id not in predicted_ratings_df.index:
        return []
    
    # Get user's predictions
    user_predictions = predicted_ratings_df.loc[user_id]
    
    # Remove already rated items
    user_rated = set(train_data[train_data['user_id'] == user_id]['item_id'].values)
    user_predictions = user_predictions[~user_predictions.index.isin(user_rated)]
    
    # Return top N
    recommendations = user_predictions.sort_values(ascending=False).head(n)
    return recommendations.index.tolist()

# Generate predictions
svd_predictions = {}
for user_id in test_users:
    svd_predictions[user_id] = svd_recommendations(user_id)

training_time_svd = time.time() - start_time
metrics_svd = calculate_metrics(svd_predictions)

print(f"✓ Training time: {training_time_svd:.2f} seconds")
print(f"✓ Precision@10: {metrics_svd['precision@10']:.4f}")
print(f"✓ Coverage: {metrics_svd['coverage']:.4f}")
print(f"✓ Explained variance: {svd.explained_variance_ratio_.sum():.4f}")

✓ Training time: 0.25 seconds
✓ Precision@10: 0.4380
✓ Coverage: 0.2039
✓ Explained variance: 0.4645


### 8. Hybrid (Item Based + SVD)

In [9]:
start_time = time.time()

def hybrid_recommendations(user_id, n=10, weight_ib=0.5, weight_svd=0.5):
    """Combine item-based and SVD recommendations"""
    ib_recs = item_based_recommendations(user_id, n=20)
    svd_recs = svd_recommendations(user_id, n=20)
    
    # Combine scores
    item_scores = {}
    for i, item_id in enumerate(ib_recs):
        item_scores[item_id] = weight_ib * (20 - i)
    
    for i, item_id in enumerate(svd_recs):
        if item_id in item_scores:
            item_scores[item_id] += weight_svd * (20 - i)
        else:
            item_scores[item_id] = weight_svd * (20 - i)
    
    # Sort and return top N
    recommendations = sorted(item_scores.items(), key=lambda x: x[1], reverse=True)[:n]
    return [item_id for item_id, score in recommendations]

# Generate predictions
hybrid_predictions = {}
for user_id in test_users:
    hybrid_predictions[user_id] = hybrid_recommendations(user_id)

training_time_hybrid = time.time() - start_time
metrics_hybrid = calculate_metrics(hybrid_predictions)

print(f"✓ Training time: {training_time_hybrid:.2f} seconds")
print(f"✓ Precision@10: {metrics_hybrid['precision@10']:.4f}")
print(f"✓ Coverage: {metrics_hybrid['coverage']:.4f}")

✓ Training time: 10.11 seconds
✓ Precision@10: 0.5000
✓ Coverage: 0.1427


### 9. Model Comparision

In [10]:
comparison_df = pd.DataFrame({
    'Model': ['User-Based CF', 'Item-Based CF', 'SVD (MF)', 'Hybrid'],
    'Precision@10': [
        metrics_ub['precision@10'],
        metrics_ib['precision@10'],
        metrics_svd['precision@10'],
        metrics_hybrid['precision@10']
    ],
    'Coverage': [
        metrics_ub['coverage'],
        metrics_ib['coverage'],
        metrics_svd['coverage'],
        metrics_hybrid['coverage']
    ],
    'Training Time (s)': [
        training_time_ub,
        training_time_ib,
        training_time_svd,
        training_time_hybrid
    ]
})

print(comparison_df.to_string(index=False))

# Select best model
best_model_idx = comparison_df['Precision@10'].idxmax()
best_model = comparison_df.iloc[best_model_idx]['Model']

print(f"\n🏆 BEST MODEL: {best_model}")
print(f"   Precision@10: {comparison_df.iloc[best_model_idx]['Precision@10']:.4f}")

        Model  Precision@10  Coverage  Training Time (s)
User-Based CF         0.476  0.120690          11.704220
Item-Based CF         0.467  0.077289          10.123723
     SVD (MF)         0.438  0.203924           0.252365
       Hybrid         0.500  0.142687          10.110859

🏆 BEST MODEL: Hybrid
   Precision@10: 0.5000


### 10. Save Model

In [11]:
models = {
    'item_similarity': item_similarity_df,
    'svd_model': svd,
    'user_factors': user_factors,
    'item_factors': item_factors,
    'train_matrix': train_matrix,
    'movies': movies,
    'ratings': ratings
}

with open('recommendation_models.pkl', 'wb') as f:
    pickle.dump(models, f)

# Save comparison results
comparison_df.to_csv('model_comparison.csv', index=False)

print("✓ Saved recommendation_models.pkl")
print("✓ Saved model_comparison.csv")

print("\n" + "=" * 80)
print("TRAINING COMPLETE!")
print("=" * 80)
print(f"\nBest performing model: {best_model}")
print("Models are ready for deployment in Streamlit app!")

✓ Saved recommendation_models.pkl
✓ Saved model_comparison.csv

TRAINING COMPLETE!

Best performing model: Hybrid
Models are ready for deployment in Streamlit app!
