# Recommendation System Modeling - Amazon Beauty Ratings

This notebook builds and evaluates recommendation systems:
- Popularity-based recommendations (baseline)
- User-based collaborative filtering
- Item-based collaborative filtering
- Matrix Factorization (SVD)
- Matrix Factorization with SGD
- Model comparison and evaluation


## 1. Setup and Load Processed Data


In [1]:
import sys
import os
sys.path.insert(0, os.path.abspath('../src'))

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from data_loader import load_csv_numpy
from similarity import (
    create_user_item_matrix, cosine_similarity_matrix,
    pearson_correlation, find_top_k_similar
)
from models import (
    svd_numpy, matrix_factorization_sgd, predict_rating,
    rmse, mae, precision_at_k, recall_at_k, hit_rate,
    train_test_split_numpy
)

# Seed NumPy's legacy global RNG so the np.random.choice sampling performed
# when the data is loaded gives the same sample on every Restart & Run All.
np.random.seed(42)
# Notebook-wide plotting defaults: seaborn's "whitegrid" style and a 12x6 figure size.
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)


In [None]:
# Build the data paths relative to the notebook's working directory.
# NOTE(review): this walks two directory levels up from the CWD — confirm that
# this matches the repository layout the notebook is run from.
base_dir = os.path.dirname(os.path.dirname(os.path.abspath('')))
processed_path = os.path.join(base_dir, 'data', 'processed', 'filtered_data.npy')
raw_path = os.path.join(base_dir, 'data', 'raw', 'ratings_Beauty.csv')

# Prefer the cached, pre-filtered array and rebuild it from the raw CSV only
# when the cache file is missing. Catching FileNotFoundError instead of using a
# bare `except:` keeps KeyboardInterrupt and genuine load failures (e.g. a
# corrupt cache) visible rather than mislabelling them as "not found".
# SECURITY: allow_pickle=True lets np.load unpickle arbitrary objects, which can
# execute code; only point this at files you produced yourself.
try:
    data = np.load(processed_path, allow_pickle=True)
    print(f"Loaded processed data: {data.shape}")
except FileNotFoundError:
    print("Processed data not found. Loading raw data and filtering...")
    data, _ = load_csv_numpy(raw_path)
    # Imported lazily: only this fallback path needs the filtering helper.
    from data_processing import filter_by_min_ratings
    data = filter_by_min_ratings(data, min_user_ratings=5, min_product_ratings=5)
    print(f"Filtered data: {data.shape}")

print(f"Unique users: {len(np.unique(data['UserId'])):,}")
print(f"Unique products: {len(np.unique(data['ProductId'])):,}")
print(f"Total ratings: {len(data):,}")

# Work on a random subset of ratings so the user-item matrix built from it
# stays small; the draw is reproducible because the global seed is fixed.
print("\nSampling data to avoid memory issues...")
sample_size = 5000
if len(data) > sample_size:
    sample_indices = np.random.choice(len(data), sample_size, replace=False)
    data_sample = data[sample_indices]
    print(f"Using sample of {sample_size} ratings")
else:
    # Fewer ratings than the sample size: use everything.
    data_sample = data


Processed data not found. Loading raw data and filtering...
Filtered data: (394908,)
Unique users: 52,204
Unique products: 57,289
Total ratings: 394,908


## 2. Create User-Item Matrix


In [None]:
# Pull the three rating fields out of the sampled data in one pass.
user_ids, product_ids, ratings = (
    data_sample[field] for field in ('UserId', 'ProductId', 'Rating')
)

matrix, user_map, product_map = create_user_item_matrix(user_ids, product_ids, ratings)

print(f"User-item matrix shape: {matrix.shape}")
# Fraction of matrix cells holding a non-zero entry; sparsity is its complement.
density = np.count_nonzero(matrix) / matrix.size
print(f"Sparsity: {(1 - density) * 100:.2f}%")
print(f"Density: {density * 100:.4f}%")

# Reverse lookups (value -> key) of the maps returned by create_user_item_matrix.
inverse_user_map = dict(zip(user_map.values(), user_map.keys()))
inverse_product_map = dict(zip(product_map.values(), product_map.keys()))


User-item matrix shape: (52204, 57289)
Sparsity: 99.99%
Density: 0.0132%


## 3. Popularity-Based Recommendations (Baseline)


In [4]:
# Popularity of a product = number of non-zero entries in its matrix column.
product_ratings_count = np.sum(matrix > 0, axis=0)
top_n = 20
# Column indices of the top_n most-rated products, most popular first.
top_products_idx = np.argsort(product_ratings_count)[-top_n:][::-1]

# Only the first n_display entries are printed. The heading is derived from the
# number of rows actually shown so it can no longer disagree with the listing
# (it previously claimed "Top 20" while printing 10).
n_display = 10
shown_products_idx = top_products_idx[:n_display]

print(f"=== Top {len(shown_products_idx)} Most Popular Products ===\n")
for i, idx in enumerate(shown_products_idx, 1):
    product_id = inverse_product_map[idx]
    count = product_ratings_count[idx]
    print(f"{i:2d}. Product {product_id}: {count:,} ratings")

print("\nThis is the baseline recommendation for new users (cold start problem).")


=== Top 20 Most Popular Products ===

 1. Product B000ZMBSPE: 539 ratings
 2. Product B0043OYFKU: 539 ratings
 3. Product B004OHQR1Q: 518 ratings
 4. Product B000142FVW: 458 ratings
 5. Product B0069FDR96: 453 ratings
 6. Product B00150LT40: 443 ratings
 7. Product B001MA0QY2: 434 ratings
 8. Product B003V265QW: 416 ratings
 9. Product B006L1DNWY: 379 ratings
10. Product B008U1Q4DI: 363 ratings

This is the baseline recommendation for new users (cold start problem).


## 4. User-Based Collaborative Filtering


In [None]:
print("=== User-Based Collaborative Filtering ===\n")
print("Using sample matrix to avoid memory issues...")

# Keep only the leading rows of the matrix (at most 1000) so the pairwise
# similarity computation stays small.
sample_size = min(1000, matrix.shape[0])
sample_matrix = matrix[:sample_size]
print(f"Sample matrix shape: {sample_matrix.shape}")

# NOTE(review): axis=0 is presumably the "compare users (rows)" mode of
# cosine_similarity_matrix — confirm against similarity.py.
user_similarity = cosine_similarity_matrix(sample_matrix, axis=0)
print(f"User similarity matrix shape: {user_similarity.shape}")

# Look up the five nearest neighbours of the first user in the sample.
test_user_idx = 0
top_k_users_idx, top_k_scores = find_top_k_similar(user_similarity, test_user_idx, k=5)

print(f"\nTop 5 similar users to user {test_user_idx}:")
neighbours = zip(top_k_users_idx, top_k_scores)
for rank, (neighbour_idx, neighbour_score) in enumerate(neighbours, start=1):
    print(f"  {rank}. User {neighbour_idx}: similarity = {neighbour_score:.4f}")

print("\nRecommendations based on similar users' preferences...")


=== User-Based Collaborative Filtering ===

Computing user-user similarity matrix...


MemoryError: Unable to allocate 22.3 GiB for an array with shape (57289, 52204) and data type float64

## 5. Item-Based Collaborative Filtering


In [None]:
print("=== Item-Based Collaborative Filtering ===\n")
print("Using sample matrix to avoid memory issues...")

# Keep only the leading product columns (at most 1000); item_similarity
# therefore only covers product indices below item_sample_size.
item_sample_size = min(1000, matrix.shape[1])
item_sample_matrix = matrix[:, :item_sample_size]
print(f"Item sample matrix shape: {item_sample_matrix.shape}")

# NOTE(review): axis=1 is presumably the "compare items (columns)" mode of
# cosine_similarity_matrix — confirm against similarity.py.
item_similarity = cosine_similarity_matrix(item_sample_matrix, axis=1)
print(f"Item similarity matrix shape: {item_similarity.shape}")

# Look up the five nearest neighbours of the first product in the sample.
test_product_idx = 0
top_k_products_idx, top_k_scores = find_top_k_similar(item_similarity, test_product_idx, k=5)

print(f"\nTop 5 similar products to product {test_product_idx}:")
neighbours = zip(top_k_products_idx, top_k_scores)
for rank, (neighbour_idx, neighbour_score) in enumerate(neighbours, start=1):
    # Translate the matrix column index back to the original product ID.
    print(f"  {rank}. Product {inverse_product_map[neighbour_idx]}: similarity = {neighbour_score:.4f}")

print("\nRecommendations: 'Users who liked this also liked...'")


## 6. Matrix Factorization - SVD


In [None]:
print("=== SVD-based Matrix Factorization ===\n")

k_factors = 10
print(f"Decomposing matrix with {k_factors} latent factors...")

# Square top-left corner of the matrix, capped at 500 along each dimension.
svd_sample_size = min(500, *matrix.shape)
svd_sample_matrix = matrix[:svd_sample_size, :svd_sample_size]
print(f"Using sample matrix: {svd_sample_matrix.shape}")

# The transposed sample is what gets decomposed, so the rows of the result
# correspond to the columns of svd_sample_matrix.
U_k, Sigma_k, Vt_k = svd_numpy(svd_sample_matrix.T, k_factors)
print(f"U shape: {U_k.shape}, Sigma shape: {Sigma_k.shape}, Vt shape: {Vt_k.shape}")

# Rank-k reconstruction, multiplied left to right: (U_k · diag(Sigma_k)) · Vt_k.
scaled_left_factors = U_k @ np.diag(Sigma_k)
reconstructed = scaled_left_factors @ Vt_k
print(f"Reconstructed matrix shape: {reconstructed.shape}")

print("\nUsing correlation of decomposed matrix for recommendations...")
# Row-by-row Pearson correlation of the reconstructed matrix.
correlation_matrix = np.corrcoef(reconstructed)
print(f"Correlation matrix shape: {correlation_matrix.shape}")


## 7. Matrix Factorization - SGD


In [None]:
print("=== Matrix Factorization with SGD ===\n")
print("Note: This may take a while for large matrices...")

# Top-left corner of the matrix, capped at 1000 along each dimension.
sample_size = min(1000, matrix.shape[0])
sample_matrix = matrix[:sample_size, :sample_size]
print(f"Using sample matrix: {sample_matrix.shape} for demonstration")

K = 10
steps = 100
print(f"Training with {steps} iterations, {K} latent factors...")

P, Q = matrix_factorization_sgd(sample_matrix, K, steps=steps, alpha=0.002, beta=0.02)
print(f"P shape: {P.shape}, Q shape: {Q.shape}")

print("\nPredicting ratings for sample user-item pairs...")
# Compare predictions with the known (non-zero) ratings inside the top-left
# 10x10 window of the sample. These entries were part of the training matrix,
# so the errors below are training errors, not held-out errors.
window_rows = min(10, sample_matrix.shape[0])
window_cols = min(10, sample_matrix.shape[1])
test_predictions = [
    (sample_matrix[row, col], predict_rating(P, Q, row, col))
    for row in range(window_rows)
    for col in range(window_cols)
    if sample_matrix[row, col] > 0
]

if test_predictions:
    # Split the (actual, predicted) pairs into two parallel arrays.
    true_vals, pred_vals = (np.array(column) for column in zip(*test_predictions))
    print(f"Sample RMSE: {rmse(true_vals, pred_vals):.4f}")
    print(f"Sample MAE: {mae(true_vals, pred_vals):.4f}")


## 8. Model Comparison and Evaluation


In [None]:
print("=== Model Comparison ===\n")

print("Evaluation Metrics Summary:")

# Qualitative notes per approach, kept as data so the printing logic is written
# once; each entry is (model name, list of bullet points).
model_notes = [
    ("Popularity-Based", [
        "Simple and fast",
        "Good for cold start",
        "No personalization",
    ]),
    ("User-Based CF", [
        "Personalized recommendations",
        "Computationally expensive",
        "Requires user similarity matrix",
    ]),
    ("Item-Based CF", [
        "More stable than user-based",
        "Better scalability",
        "Good for sparse data",
    ]),
    ("SVD", [
        "Dimensionality reduction",
        "Captures latent factors",
        "Efficient for large matrices",
    ]),
    ("Matrix Factorization (SGD)", [
        "Most flexible",
        "Can handle missing data",
        "Requires tuning hyperparameters",
    ]),
]

for number, (model_name, traits) in enumerate(model_notes, start=1):
    # A blank line separates each numbered heading from the previous section.
    print(f"\n{number}. {model_name}:")
    for trait in traits:
        print(f"   - {trait}")


## 9. Recommendation Examples


In [None]:
print("=== Recommendation Example ===\n")

example_user_idx = 0
user_ratings = matrix[example_user_idx, :]
# Column indices of every product this user has a non-zero rating for.
rated_products = np.where(user_ratings > 0)[0]

print(f"User {example_user_idx} has rated {len(rated_products)} products")

if len(rated_products) > 0:
    print("\nUser's rated products (sample):")
    for idx in rated_products[:5]:
        product_id = inverse_product_map[idx]
        rating = user_ratings[idx]
        print(f"  Product {product_id}: {rating:.1f} stars")

    print("\nRecommendations using item-based CF:")
    already_rated = set(rated_products.tolist())

    # Collect neighbours of up to three rated products, remembering the best
    # similarity score seen for each candidate. Ranking by score (instead of
    # np.unique, which re-sorts by matrix index) keeps the most similar items
    # at the top of the list.
    best_score = {}
    for rated_idx in rated_products[:3]:
        # item_similarity was built on a subset of product columns, so rated
        # products outside that subset have no similarity row to look up.
        if rated_idx >= item_similarity.shape[0]:
            continue
        top_items, top_scores = find_top_k_similar(item_similarity, rated_idx, k=3)
        for item_idx, score in zip(top_items, top_scores):
            item_idx = int(item_idx)
            # Skip products the user already rated and indices that cannot be
            # mapped back to a product ID.
            if item_idx in already_rated or item_idx not in inverse_product_map:
                continue
            if item_idx not in best_score or score > best_score[item_idx]:
                best_score[item_idx] = float(score)

    recommendations = sorted(best_score, key=best_score.get, reverse=True)[:10]

    if recommendations:
        # Report the number actually found rather than a fixed "Top 10".
        print(f"Top {len(recommendations)} recommended products:")
        for i, idx in enumerate(recommendations, 1):
            product_id = inverse_product_map[idx]
            print(f"  {i}. Product {product_id}")
    else:
        print("No recommendations available for this user from the sampled item similarities.")


## 10. Key Findings and Next Steps

### Findings:
1. **Sparsity Challenge**: User-item matrix is highly sparse (>99%)
2. **Popularity Baseline**: Simple but effective for cold start
3. **Collaborative Filtering**: Provides personalization but computationally expensive
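4. **Matrix Factorization**: Expected to offer the best balance of accuracy and efficiency, but this is not yet confirmed here — the notebook only reports error on a small sample of training ratings (see Next Steps)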

### Next Steps:
- Fine-tune hyperparameters (K factors, learning rate, regularization)
- Implement hybrid approaches
- Add content-based features if available
- Evaluate on held-out test set
- Measure diversity and coverage metrics
