In [1]:
import pandas as pd
import numpy as np
import pickle
import time
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
import warnings
warnings.filterwarnings('ignore')


### 1. Load Data

In [2]:
# Load the pre-processed ratings and movie tables (paths are relative to the
# notebook's working directory).
# Later cells read the `user_id`, `item_id` and `rating` columns of `ratings`
# and the `movie_id` column of `movies`.
ratings = pd.read_csv('data/processed/processed_ratings.csv')
movies = pd.read_csv('data/processed/processed_movies.csv')

# Row counts as a quick sanity check that both files were read.
print(f"‚úì Loaded {len(ratings):,} ratings")
print(f"‚úì Loaded {len(movies):,} movies")


‚úì Loaded 100,000 ratings
‚úì Loaded 1,682 movies


### 2. User Item Matrix

In [3]:
# Dense user x item rating table built from *all* ratings. Cells without a
# rating are filled with 0; pivot_table averages duplicate user/item pairs.
user_item_matrix = pd.pivot_table(
    ratings,
    values='rating',
    index='user_id',
    columns='item_id',
    fill_value=0,
)

print(f"‚úì Matrix shape: {user_item_matrix.shape}")
# Sparsity = share of user/item cells with no rating row behind them
# (treats every row of `ratings` as a distinct user/item pair).
print(f"‚úì Sparsity: {(1 - (ratings.shape[0] / (user_item_matrix.shape[0] * user_item_matrix.shape[1])))*100:.2f}%")

‚úì Matrix shape: (943, 1682)
‚úì Sparsity: 93.70%


### 3. Train-Test Split

In [4]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the ratings for evaluation. Stratifying on user_id keeps
# each user represented in both sets in roughly the same proportion, and the
# fixed seed makes the split reproducible.
train_data, test_data = train_test_split(
    ratings,
    test_size=0.2,
    random_state=42,
    stratify=ratings['user_id'],
)
print(f"‚úì Training set: {len(train_data):,} ratings")
print(f"‚úì Test set: {len(test_data):,} ratings")

# Create train matrix
# Built from the training rows only, so users/items that occur solely in the
# test split get no row/column here. Unrated cells are 0.
train_matrix = pd.pivot_table(
    train_data,
    values='rating',
    index='user_id',
    columns='item_id',
    fill_value=0,
)

‚úì Training set: 80,000 ratings
‚úì Test set: 20,000 ratings


### 4. Evaluation 

In [5]:
import numpy as np
import pandas as pd
import math

def calculate_metrics(predictions, test_data, movies_df, k=10):
    """Compute top-k ranking metrics for per-user recommendation lists.

    Every item a user has a row for in ``test_data`` counts as relevant,
    regardless of the rating value.

    Parameters
    ----------
    predictions : dict
        Maps user_id -> ordered sequence of recommended item ids (best first).
        Lists, tuples, numpy arrays and other sliceable sequences are accepted.
    test_data : pandas.DataFrame
        Held-out interactions with ``user_id`` and ``item_id`` columns.
    movies_df : pandas.DataFrame
        Catalogue table; the number of unique ``movie_id`` values is the
        denominator of the coverage metric.
    k : int, default 10
        Cut-off rank applied to every recommendation list.

    Returns
    -------
    dict
        ``Precision@k``, ``Recall@k`` and ``NDCG@k`` averaged over the test
        users that have a non-empty recommendation list, ``Coverage`` (share
        of the catalogue appearing in any top-k list) and
        ``Avg_Recs_Per_User`` (mean length of the untruncated lists).
        Averages over an empty collection are reported as 0.0.
    """
    precisions = []
    recalls = []
    ndcgs = []

    actual_user_items = test_data.groupby('user_id')['item_id'].apply(set).to_dict()

    for user_id, actual_items in actual_user_items.items():
        if user_id not in predictions or len(actual_items) == 0:
            continue

        pred_list = predictions[user_id][:k]

        # len() rather than truthiness: `not array` is ambiguous for numpy
        # arrays and pandas objects, which are valid recommendation lists.
        if len(pred_list) == 0:
            continue

        hits = [1 if item in actual_items else 0 for item in pred_list]
        num_hits = sum(hits)

        # Precision is measured against k even when fewer than k items were
        # recommended, so short lists are penalised.
        precisions.append(num_hits / k)
        recalls.append(num_hits / len(actual_items))

        # DCG with binary relevance and a log2(rank + 1) discount.
        dcg = sum(rel / np.log2(idx + 2) for idx, rel in enumerate(hits))

        # Ideal DCG: every achievable hit placed at the very top of the list.
        possible_hits = min(len(actual_items), k)
        idcg = sum(1.0 / np.log2(idx + 2) for idx in range(possible_hits))

        ndcgs.append(dcg / idcg if idcg > 0 else 0)

    # Count unique items recommended across all users (not only test users).
    all_recommended_items = set()
    for recs in predictions.values():
        all_recommended_items.update(recs[:k])

    total_movies = movies_df['movie_id'].nunique()
    coverage = len(all_recommended_items) / total_movies if total_movies > 0 else 0

    # np.mean([]) yields NaN plus a RuntimeWarning; report 0.0 instead so the
    # result stays numeric when no predictions were supplied.
    avg_recs = np.mean([len(v) for v in predictions.values()]) if predictions else 0.0

    return {
        f'Precision@{k}': np.mean(precisions) if precisions else 0.0,
        f'Recall@{k}': np.mean(recalls) if recalls else 0.0,
        f'NDCG@{k}': np.mean(ndcgs) if ndcgs else 0.0,
        'Coverage': coverage,
        'Avg_Recs_Per_User': avg_recs
    }

### 5. User Based Collaborative Filtering

In [6]:
# Wall-clock timer for this model. It is stopped after the per-user
# recommendation loop later in this cell, so the reported "training time"
# covers similarity computation plus recommendation generation.
start_time = time.time()

# Calculate user similarity
# Cosine similarity between the rows (users) of the zero-filled train matrix.
user_similarity = cosine_similarity(train_matrix)
# Label both axes with user ids so similarities can be looked up by id.
user_similarity_df = pd.DataFrame(
    user_similarity,
    index=train_matrix.index,
    columns=train_matrix.index
)

def user_based_recommendations(user_id, n=10, n_neighbors=20,
                               similarity_df=None, ratings_df=None):
    """Recommend items to a user from the ratings of their most similar users.

    Each candidate item is scored by the sum of ``similarity * rating`` over
    the neighbours that rated it; items the user already rated are skipped.

    Parameters
    ----------
    user_id : hashable
        Id of the user to recommend for.
    n : int, default 10
        Maximum number of items to return.
    n_neighbors : int, default 20
        Number of most-similar users whose ratings are aggregated.
    similarity_df : pandas.DataFrame, optional
        Square user x user similarity table indexed by user id on both axes.
        Defaults to the notebook-level ``user_similarity_df``.
    ratings_df : pandas.DataFrame, optional
        Ratings with ``user_id``, ``item_id`` and ``rating`` columns.
        Defaults to the notebook-level ``train_data``.

    Returns
    -------
    list
        Item ids ordered by descending score; empty when the user is unknown.
    """
    # Fall back to the objects built in the surrounding notebook cells.
    if similarity_df is None:
        similarity_df = user_similarity_df
    if ratings_df is None:
        ratings_df = train_data

    if user_id not in similarity_df.index:
        return []

    # Get similar users. The target user is removed by label instead of by
    # assuming it sorts to position 0, which ties at the top could break.
    similar_users = (
        similarity_df[user_id]
        .sort_values(ascending=False)
        .drop(user_id)
        .iloc[:n_neighbors]
    )

    # Items the user has already rated are never recommended.
    user_items = set(ratings_df.loc[ratings_df['user_id'] == user_id, 'item_id'])

    # Score items based on similar users' ratings.
    # Iterating the two columns directly avoids iterrows(), which builds a
    # Series per row (slow) and may upcast integer ids to float.
    item_scores = {}
    for sim_user, similarity in similar_users.items():
        neighbour_rows = ratings_df[ratings_df['user_id'] == sim_user]
        for item_id, rating in zip(neighbour_rows['item_id'], neighbour_rows['rating']):
            if item_id not in user_items:
                item_scores[item_id] = item_scores.get(item_id, 0) + similarity * rating

    # Sort and return top N
    recommendations = sorted(item_scores.items(), key=lambda x: x[1], reverse=True)[:n]
    return [item_id for item_id, score in recommendations]

# Generate predictions for test users
user_based_predictions = {}
# Every user with at least one held-out rating; the later model cells reuse
# this same array so all models are evaluated on the same users.
test_users = test_data['user_id'].unique()  
for user_id in test_users:
    user_based_predictions[user_id] = user_based_recommendations(user_id)

# Stop the timer before scoring so metric computation is not counted.
training_time_ub = time.time() - start_time
metrics_ub = calculate_metrics(user_based_predictions, test_data, movies)

print(f"‚úì Training time: {training_time_ub:.2f} seconds")
print(f"‚úì Precision@10:  {metrics_ub['Precision@10']:.4f}")
print(f"‚úì Recall@10:     {metrics_ub['Recall@10']:.4f}")
print(f"‚úì NDCG@10:       {metrics_ub['NDCG@10']:.4f}")
print(f"‚úì Coverage:      {metrics_ub['Coverage']:.4f}")

‚úì Training time: 335.36 seconds
‚úì Precision@10:  0.3203
‚úì Recall@10:     0.2095
‚úì NDCG@10:       0.3837
‚úì Coverage:      0.2004


### 6. Item Based Collaborative Filtering

In [7]:
# Restart the wall-clock timer for the item-based model; it is stopped after
# the recommendation loop later in this cell.
start_time = time.time()

# Calculate item similarity
# Transposing puts items on the rows, so this is the cosine similarity
# between item rating columns of the zero-filled train matrix.
item_similarity = cosine_similarity(train_matrix.T)
# Label both axes with item ids so similarities can be looked up by id.
item_similarity_df = pd.DataFrame(
    item_similarity,
    index=train_matrix.columns,
    columns=train_matrix.columns
)

def item_based_recommendations(user_id, n=10, n_similar=50,
                               similarity_df=None, ratings_df=None):
    """Recommend items similar to the ones a user has already rated.

    For every item the user rated, its ``n_similar`` nearest items each
    receive ``similarity * rating``; contributions are summed per candidate
    and items the user already rated are skipped.

    Parameters
    ----------
    user_id : hashable
        Id of the user to recommend for.
    n : int, default 10
        Maximum number of items to return.
    n_similar : int, default 50
        Neighbourhood size taken around each rated item.
    similarity_df : pandas.DataFrame, optional
        Square item x item similarity table indexed by item id on both axes.
        Defaults to the notebook-level ``item_similarity_df``.
    ratings_df : pandas.DataFrame, optional
        Ratings with ``user_id``, ``item_id`` and ``rating`` columns.
        Defaults to the notebook-level ``train_data``.

    Returns
    -------
    list
        Item ids ordered by descending score; empty when the user has no
        ratings in ``ratings_df``.
    """
    # Fall back to the objects built in the surrounding notebook cells.
    if similarity_df is None:
        similarity_df = item_similarity_df
    if ratings_df is None:
        ratings_df = train_data

    # Get items the user has rated
    user_items = ratings_df[ratings_df['user_id'] == user_id]

    if len(user_items) == 0:
        return []

    # Set membership is O(1); scanning the id array for every candidate in the
    # innermost loop was linear in the number of rated items.
    rated_ids = set(user_items['item_id'])

    # Score items based on similarity to user's rated items.
    # Column iteration replaces iterrows(), which is slow and can turn
    # integer ids into floats when the frame has mixed dtypes.
    item_scores = {}
    for item_id, rating in zip(user_items['item_id'], user_items['rating']):
        if item_id not in similarity_df.index:
            continue

        # Drop the item itself by label rather than assuming it sorts first.
        similar_items = (
            similarity_df[item_id]
            .sort_values(ascending=False)
            .drop(item_id)
            .iloc[:n_similar]
        )

        for sim_item, similarity in similar_items.items():
            if sim_item not in rated_ids:
                item_scores[sim_item] = item_scores.get(sim_item, 0) + similarity * rating

    # Sort and return top N
    recommendations = sorted(item_scores.items(), key=lambda x: x[1], reverse=True)[:n]
    return [item_id for item_id, score in recommendations]

# Generate predictions
# `test_users` comes from the user-based cell above, so that cell must have
# been run first.
item_based_predictions = {}
for user_id in test_users:
    item_based_predictions[user_id] = item_based_recommendations(user_id)

# Elapsed time covers similarity computation plus recommendation generation;
# metric computation is excluded.
training_time_ib = time.time() - start_time
metrics_ib = calculate_metrics(item_based_predictions, test_data, movies)

print(f"‚úì Training time: {training_time_ib:.2f} seconds")
print(f"‚úì Precision@10:  {metrics_ib['Precision@10']:.4f}")
print(f"‚úì Recall@10:     {metrics_ib['Recall@10']:.4f}")
print(f"‚úì NDCG@10:       {metrics_ib['NDCG@10']:.4f}")
print(f"‚úì Coverage:      {metrics_ib['Coverage']:.4f}")


‚úì Training time: 87.58 seconds
‚úì Precision@10:  0.2908
‚úì Recall@10:     0.1871
‚úì NDCG@10:       0.3514
‚úì Coverage:      0.1153


### 7. SVD Matrix Factorization

In [8]:
# Restart the wall-clock timer for the SVD model; it is stopped after the
# recommendation loop later in this cell.
start_time = time.time()

# Apply SVD
# The train matrix is zero-filled, so unrated cells enter the factorisation
# as ratings of 0 rather than as missing values.
n_factors = 50
svd = TruncatedSVD(n_components=n_factors, random_state=42)
# One row of latent factors per user (row order follows train_matrix.index).
user_factors = svd.fit_transform(train_matrix)
# Transposed components: one row of latent factors per item.
item_factors = svd.components_.T

# Reconstruct rating matrix
# user_factors @ item_factors.T gives a predicted score for every user/item
# pair, including pairs the user already rated.
predicted_ratings = np.dot(user_factors, item_factors.T)
predicted_ratings_df = pd.DataFrame(
    predicted_ratings,
    index=train_matrix.index,
    columns=train_matrix.columns
)

def svd_recommendations(user_id, n=10):
    """Return the ``n`` highest-scoring unseen items from the SVD reconstruction.

    Reads the notebook-level ``predicted_ratings_df`` (reconstructed scores,
    users on rows and items on columns) and ``train_data`` (to find what the
    user already rated). Returns a list of item ids ordered by descending
    predicted score, or an empty list when the user has no reconstructed row.
    """
    # Users absent from the training matrix have no reconstructed row.
    if user_id in predicted_ratings_df.index:
        # Items seen during training must not be recommended again.
        seen_items = set(train_data.loc[train_data['user_id'] == user_id, 'item_id'].values)

        scores = predicted_ratings_df.loc[user_id]
        unseen_scores = scores[~scores.index.isin(seen_items)]

        ranked = unseen_scores.sort_values(ascending=False)
        return ranked.head(n).index.tolist()

    return []

# Generate predictions
# `test_users` comes from the user-based cell above, so that cell must have
# been run first.
svd_predictions = {}
for user_id in test_users:
    svd_predictions[user_id] = svd_recommendations(user_id)

# Elapsed time covers the SVD fit, the reconstruction and recommendation
# generation; metric computation is excluded.
training_time_svd = time.time() - start_time
metrics_svd = calculate_metrics(svd_predictions, test_data, movies)

print(f"‚úì Training time: {training_time_svd:.2f} seconds")
print(f"‚úì Precision@10:  {metrics_svd['Precision@10']:.4f}")
print(f"‚úì Recall@10:     {metrics_svd['Recall@10']:.4f}")
print(f"‚úì NDCG@10:       {metrics_svd['NDCG@10']:.4f}")
print(f"‚úì Coverage:      {metrics_svd['Coverage']:.4f}")
# Sum of the per-component explained-variance ratios of the fitted SVD.
print(f"‚úì Explained variance: {svd.explained_variance_ratio_.sum():.4f}")

‚úì Training time: 1.13 seconds
‚úì Precision@10:  0.3060
‚úì Recall@10:     0.2078
‚úì NDCG@10:       0.3694
‚úì Coverage:      0.2705
‚úì Explained variance: 0.4639


### 8. Hybrid (Item Based + SVD)

In [9]:
# Timer for the hybrid run only: the hybrid reuses the item-similarity table
# and SVD predictions built in earlier cells, so their fitting cost is not
# part of this measurement.
start_time = time.time()

def hybrid_recommendations(user_id, n=10, weight_ib=0.5, weight_svd=0.5, n_candidates=20):
    """Blend item-based and SVD recommendations by weighted rank fusion.

    Each model contributes a candidate list. An item at 0-based rank ``i``
    in a list earns ``weight * (pool - i)`` points, where ``pool`` is the
    candidate-list length requested from each model; points from both lists
    are summed per item.

    Parameters
    ----------
    user_id : hashable
        Id of the user to recommend for.
    n : int, default 10
        Maximum number of items to return.
    weight_ib : float, default 0.5
        Weight of the item-based list.
    weight_svd : float, default 0.5
        Weight of the SVD list.
    n_candidates : int, default 20
        Minimum number of candidates requested from each model. The pool is
        widened to ``n`` when more than ``n_candidates`` items are requested,
        so large ``n`` is not served from truncated candidate lists.

    Returns
    -------
    list
        Item ids ordered by descending fused score.
    """
    # With the defaults this is 20, matching the previously hard-coded value.
    pool = max(n_candidates, n)

    ib_recs = item_based_recommendations(user_id, n=pool)
    svd_recs = svd_recommendations(user_id, n=pool)

    # Combine scores: higher-ranked items earn more points in each list.
    item_scores = {}
    for weight, recs in ((weight_ib, ib_recs), (weight_svd, svd_recs)):
        for rank, item_id in enumerate(recs):
            item_scores[item_id] = item_scores.get(item_id, 0) + weight * (pool - rank)

    # Sort and return top N
    recommendations = sorted(item_scores.items(), key=lambda x: x[1], reverse=True)[:n]
    return [item_id for item_id, score in recommendations]

# Generate predictions
# `test_users` comes from the user-based cell above, so that cell must have
# been run first.
hybrid_predictions = {}
for user_id in test_users:
    hybrid_predictions[user_id] = hybrid_recommendations(user_id)

# Elapsed time is recommendation generation only (see the timer note at the
# top of this cell); metric computation is excluded.
training_time_hybrid = time.time() - start_time
metrics_hybrid = calculate_metrics(hybrid_predictions, test_data, movies)

print(f"‚úì Training time: {training_time_hybrid:.2f} seconds")
print(f"‚úì Precision@10:  {metrics_hybrid['Precision@10']:.4f}")
print(f"‚úì Recall@10:     {metrics_hybrid['Recall@10']:.4f}")
print(f"‚úì NDCG@10:       {metrics_hybrid['NDCG@10']:.4f}")
print(f"‚úì Coverage:      {metrics_hybrid['Coverage']:.4f}")

‚úì Training time: 102.03 seconds
‚úì Precision@10:  0.3284
‚úì Recall@10:     0.2167
‚úì NDCG@10:       0.3908
‚úì Coverage:      0.2122


### 9. Model Comparison

In [10]:
# Create comparison dataframe with ALL metrics.
# One (label, metrics dict, elapsed seconds) entry per model keeps the table
# definition in one place instead of repeating every lookup per column.
model_results = [
    ('User-Based CF', metrics_ub, training_time_ub),
    ('Item-Based CF', metrics_ib, training_time_ib),
    ('SVD (MF)', metrics_svd, training_time_svd),
    ('Hybrid', metrics_hybrid, training_time_hybrid),
]
metric_columns = ['Precision@10', 'Recall@10', 'NDCG@10', 'Coverage']

comparison_df = pd.DataFrame({
    'Model': [label for label, _, _ in model_results],
    **{
        column: [metrics[column] for _, metrics, _ in model_results]
        for column in metric_columns
    },
    'Training Time (s)': [elapsed for _, _, elapsed in model_results],
})

# formatting for cleaner output
print(comparison_df.to_string(index=False))

# Select best model (You can also change this to use 'NDCG@10' for better ranking quality)
best_model_metric = 'Precision@10'  # or 'NDCG@10'
# idxmax() returns an index *label*, so the row is fetched with .loc; using
# .iloc only worked because the default index makes label and position equal.
best_model_idx = comparison_df[best_model_metric].idxmax()
best_row = comparison_df.loc[best_model_idx]
best_model = best_row['Model']

print(f"\nüèÜ BEST MODEL: {best_model}")
print(f"   {best_model_metric}: {best_row[best_model_metric]:.4f}")
print(f"   Recall@10:    {best_row['Recall@10']:.4f}")
print(f"   NDCG@10:      {best_row['NDCG@10']:.4f}")

        Model  Precision@10  Recall@10  NDCG@10  Coverage  Training Time (s)
User-Based CF      0.320255   0.209530 0.383700  0.200357         335.359707
Item-Based CF      0.290774   0.187137 0.351412  0.115339          87.578813
     SVD (MF)      0.306045   0.207758 0.369437  0.270511           1.132744
       Hybrid      0.328420   0.216749 0.390792  0.212247         102.026265

üèÜ BEST MODEL: Hybrid
   Precision@10: 0.3284
   Recall@10:    0.2167
   NDCG@10:      0.3908


### 10. Save Model

In [11]:
# Bundle the artefacts to persist. The user-similarity table is not included;
# `ratings` is the full rating set (train and test rows together).
models = {
    'item_similarity': item_similarity_df,
    'svd_model': svd,
    'user_factors': user_factors,
    'item_factors': item_factors,
    'train_matrix': train_matrix,
    'movies': movies,
    'ratings': ratings
}

# Written to the notebook's working directory.
# NOTE(review): pickle files can execute code when loaded, so this file should
# only ever be loaded from a trusted location.
with open('recommendation_models.pkl', 'wb') as f:
    pickle.dump(models, f)

# Save comparison results
comparison_df.to_csv('model_comparison.csv', index=False)

print("‚úì Saved recommendation_models.pkl")
print("‚úì Saved model_comparison.csv")

# Closing banner; `best_model` is set in the model-comparison cell.
print("\n" + "=" * 80)
print("TRAINING COMPLETE!")
print("=" * 80)
print(f"\nBest performing model: {best_model}")
print("Models are ready for deployment in Streamlit app!")

‚úì Saved recommendation_models.pkl
‚úì Saved model_comparison.csv

TRAINING COMPLETE!

Best performing model: Hybrid
Models are ready for deployment in Streamlit app!
