# Lab B.1 Solutions: Collaborative Filtering Fundamentals

This notebook contains complete solutions to the exercises from Lab B.1.

---

In [None]:
# Setup
import sys
from pathlib import Path

# Locate the module root: start from the working directory and step up one
# level when the working-directory path contains 'solutions'. The module's
# `scripts` folder is then put first on the import path.
module_dir = Path.cwd()
if 'solutions' in str(module_dir):
    module_dir = module_dir.parent
sys.path.insert(0, str(module_dir / 'scripts'))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

from data_utils import download_movielens, train_test_split_by_time, RatingsDataset

# Fix the RNG seeds (PyTorch and NumPy) so that runs are repeatable.
torch.manual_seed(42)
np.random.seed(42)

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [None]:
# Load MovieLens 100K and split it into train / test with a 20% test ratio
# (the split helper is time-based, per its name).
ratings_df, movies_df = download_movielens('100k')
train_df, test_df = train_test_split_by_time(ratings_df, test_ratio=0.2)

# Embedding-table sizes are the distinct-id counts over the full ratings frame.
# NOTE(review): this assumes ids are contiguous and 0-based (max id == count - 1);
# confirm against download_movielens / RatingsDataset.
num_users, num_items = (
    ratings_df[id_col].nunique() for id_col in ('user_id', 'item_id')
)

# Wrap both splits; only the training loader shuffles.
train_dataset, test_dataset = RatingsDataset(train_df), RatingsDataset(test_df)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=1024)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=1024)

---

## Exercise 1 Solution: Hyperparameter Tuning

Compare different embedding dimensions.

In [None]:
class MatrixFactorization(nn.Module):
    """Biased matrix-factorization rating model.

    The predicted rating is the dot product of a user vector and an item
    vector, plus a per-user bias, a per-item bias and one global bias.
    """

    def __init__(self, num_users, num_items, embedding_dim=64):
        """Create the embedding tables and initialise them.

        Args:
            num_users: Number of rows in the user tables.
            num_items: Number of rows in the item tables.
            embedding_dim: Length of each latent factor vector.
        """
        super().__init__()

        # Latent factors, then scalar biases, then the global offset.
        self.user_embeddings = nn.Embedding(num_users, embedding_dim)
        self.item_embeddings = nn.Embedding(num_items, embedding_dim)
        self.user_bias = nn.Embedding(num_users, 1)
        self.item_bias = nn.Embedding(num_items, 1)
        self.global_bias = nn.Parameter(torch.zeros(1))

        # Small-variance Gaussian factors; biases start at exactly zero.
        for factor_table in (self.user_embeddings, self.item_embeddings):
            nn.init.normal_(factor_table.weight, std=0.01)
        for bias_table in (self.user_bias, self.item_bias):
            nn.init.zeros_(bias_table.weight)

    def forward(self, user_ids, item_ids):
        """Predict a rating for every (user, item) pair in the batch.

        Args:
            user_ids: 1-D tensor of user indices.
            item_ids: 1-D tensor of item indices, aligned with ``user_ids``.

        Returns:
            1-D tensor with one predicted rating per pair.
        """
        dot_product = torch.sum(
            self.user_embeddings(user_ids) * self.item_embeddings(item_ids),
            dim=1,
        )
        per_user = self.user_bias(user_ids).squeeze()
        per_item = self.item_bias(item_ids).squeeze()
        return dot_product + per_user + per_item + self.global_bias

In [None]:
def train_and_evaluate(embedding_dim, num_epochs=20):
    """Train an MF model with the given embedding dimension and return its best RMSE.

    A fresh ``MatrixFactorization`` model is trained with Adam (lr=0.005,
    weight_decay=1e-5) on ``train_loader`` and scored on ``test_loader``
    after every epoch.

    Args:
        embedding_dim: Size of the latent factor vectors.
        num_epochs: Number of passes over the training data.

    Returns:
        The lowest per-epoch RMSE measured on the test set. Because the
        minimum is taken over test-set scores, it is an optimistic estimate.
        Returns ``inf`` when ``num_epochs`` is 0.
    """
    model = MatrixFactorization(num_users, num_items, embedding_dim).to(device)

    # Start the global bias at the mean training rating. Fill the existing
    # parameter in place: rebinding `.data` to a freshly built tensor would
    # leave the bias on the CPU after the model was moved to `device`, which
    # breaks the forward pass on CUDA.
    with torch.no_grad():
        model.global_bias.fill_(float(train_df['rating'].mean()))

    optimizer = optim.Adam(model.parameters(), lr=0.005, weight_decay=1e-5)
    criterion = nn.MSELoss()

    best_rmse = float('inf')

    for _ in range(num_epochs):
        # Train
        model.train()
        for users, items, ratings in train_loader:
            users, items, ratings = users.to(device), items.to(device), ratings.to(device)
            predictions = model(users, items)
            loss = criterion(predictions, ratings)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Evaluate
        model.eval()
        total_loss = 0
        with torch.no_grad():
            for users, items, ratings in test_loader:
                users, items, ratings = users.to(device), items.to(device), ratings.to(device)
                predictions = model(users, items)
                # MSELoss averages over the batch; re-weight by batch size so
                # the final division yields the mean over all test samples.
                total_loss += criterion(predictions, ratings).item() * len(users)

        rmse = np.sqrt(total_loss / len(test_dataset))
        best_rmse = min(best_rmse, rmse)

    return best_rmse


# Sweep over embedding sizes, recording the best RMSE reached for each one.
embedding_dims = [16, 32, 64, 128, 256]
results = {}

for dim in embedding_dims:
    print(f"\nTesting embedding_dim = {dim}...")
    rmse = train_and_evaluate(dim, num_epochs=20)
    results[dim] = rmse
    print(f"  Best RMSE: {rmse:.4f}")

# Summary table, smallest dimension first.
print("\n" + "="*40, "RESULTS SUMMARY", "="*40, sep="\n")
for dim in sorted(results):
    print(f"  dim={dim:3d}: RMSE = {results[dim]:.4f}")

In [None]:
# Visualize results: best test RMSE as a function of embedding dimension.
fig, ax = plt.subplots(figsize=(10, 5))

# Plot in ascending dimension order so the connecting line always runs
# left to right, whatever order `results` was filled in.
dims = sorted(results)
rmses = [results[d] for d in dims]

ax.plot(dims, rmses, 'bo-', markersize=10, linewidth=2)
ax.set_xlabel('Embedding Dimension')
ax.set_ylabel('Test RMSE')
ax.set_title('Effect of Embedding Dimension on RMSE')
ax.grid(True, alpha=0.3)
# The tested dimensions are powers of two, so a log2 axis spaces them evenly.
ax.set_xscale('log', base=2)

# Mark the best
best_dim = min(results, key=results.get)
ax.axvline(x=best_dim, color='red', linestyle='--', label=f'Best: dim={best_dim}')
ax.legend()
plt.show()

# The emoji was stored as mojibake (UTF-8 bytes decoded as cp1252); restored here.
print(f"\n📊 Best embedding dimension: {best_dim} with RMSE = {results[best_dim]:.4f}")

---

## Exercise 2 Solution: Regularization Impact

Compare different weight decay values.

In [None]:
def train_with_regularization(weight_decay, num_epochs=20):
    """Train with the specified regularization and return per-epoch train/test metrics.

    A fresh 64-dimensional ``MatrixFactorization`` model is trained with Adam
    (lr=0.005) using the given ``weight_decay``.

    Args:
        weight_decay: Weight-decay coefficient passed to the Adam optimizer.
        num_epochs: Number of passes over the training data.

    Returns:
        A ``(train_losses, test_rmses)`` tuple of lists with one entry per
        epoch: the sample-weighted mean training MSE, and the test-set RMSE.
    """
    model = MatrixFactorization(num_users, num_items, embedding_dim=64).to(device)

    # Start the global bias at the mean training rating. Fill the existing
    # parameter in place: rebinding `.data` to a freshly built tensor would
    # leave the bias on the CPU after the model was moved to `device`, which
    # breaks the forward pass on CUDA.
    with torch.no_grad():
        model.global_bias.fill_(float(train_df['rating'].mean()))

    optimizer = optim.Adam(model.parameters(), lr=0.005, weight_decay=weight_decay)
    criterion = nn.MSELoss()

    train_losses = []
    test_rmses = []

    for _ in range(num_epochs):
        # Train
        model.train()
        epoch_loss = 0
        for users, items, ratings in train_loader:
            users, items, ratings = users.to(device), items.to(device), ratings.to(device)
            predictions = model(users, items)
            loss = criterion(predictions, ratings)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # Re-weight the batch mean by batch size so the epoch figure is a
            # true per-sample average even when the last batch is smaller.
            epoch_loss += loss.item() * len(users)

        train_losses.append(epoch_loss / len(train_dataset))

        # Evaluate
        model.eval()
        test_loss = 0
        with torch.no_grad():
            for users, items, ratings in test_loader:
                users, items, ratings = users.to(device), items.to(device), ratings.to(device)
                predictions = model(users, items)
                test_loss += criterion(predictions, ratings).item() * len(users)

        test_rmses.append(np.sqrt(test_loss / len(test_dataset)))

    return train_losses, test_rmses


# Sweep over weight-decay strengths (0 means no regularization) and keep the
# full per-epoch curves for each setting.
weight_decays = [0, 1e-6, 1e-5, 1e-4, 1e-3]
all_results = {}

for wd in weight_decays:
    print(f"Training with weight_decay = {wd}...")
    train_losses, test_rmses = train_with_regularization(weight_decay=wd, num_epochs=30)
    all_results[wd] = dict(train=train_losses, test=test_rmses)
    print(f"  Final RMSE: {all_results[wd]['test'][-1]:.4f}")

In [None]:
# Visualize regularization effects: training loss (left) and test RMSE
# (right) per epoch, one curve per weight-decay setting.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# One viridis colour per setting, ordered as in `weight_decays`.
colors = plt.cm.viridis(np.linspace(0, 1, len(weight_decays)))

for wd, color in zip(weight_decays, colors):
    label = f'wd={wd}' if wd > 0 else 'No reg'
    axes[0].plot(all_results[wd]['train'], color=color, label=label)
    axes[1].plot(all_results[wd]['test'], color=color, label=label)

# Both panels share the same formatting apart from y-label and title.
panel_text = (
    ('Training Loss', 'Training Loss by Regularization'),
    ('Test RMSE', 'Test RMSE by Regularization'),
)
for ax, (ylabel, title) in zip(axes, panel_text):
    ax.set_xlabel('Epoch')
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# General expectations, printed as fixed text (not derived from the curves above).
# The emoji was stored as mojibake (UTF-8 bytes decoded as cp1252); restored here.
print("\n📊 Observations:")
print("   - No regularization: Lowest training loss but may overfit")
print("   - Moderate regularization (1e-5): Good balance")
print("   - Strong regularization (1e-3): Higher training loss, may underfit")

---

## Bonus: SVD++ Implementation

In [None]:
class SVDPlusPlus(nn.Module):
    """
    SVD++ incorporates implicit feedback: which items a user has rated.

    r_ui = global_bias + user_bias + item_bias +
           q_i^T * (p_u + (1/sqrt(|N(u)|)) * sum(y_j for j in N(u)))

    where N(u) is the set of items user u has rated.
    """

    def __init__(self, num_users, num_items, embedding_dim=64):
        """Create the factor and bias tables and initialise them.

        Args:
            num_users: Number of rows in the user tables.
            num_items: Number of rows in the item tables.
            embedding_dim: Length of each latent factor vector.
        """
        super().__init__()
        self.num_items = num_items

        # Standard MF components
        self.user_embeddings = nn.Embedding(num_users, embedding_dim)
        self.item_embeddings = nn.Embedding(num_items, embedding_dim)
        self.user_bias = nn.Embedding(num_users, 1)
        self.item_bias = nn.Embedding(num_items, 1)
        self.global_bias = nn.Parameter(torch.zeros(1))

        # SVD++ addition: implicit factor vectors (y_j in the formula above)
        self.implicit_factors = nn.Embedding(num_items, embedding_dim)

        self._init_weights()

    def _init_weights(self):
        """Draw all factor tables from N(0, 0.01^2) and zero the bias tables."""
        for emb in [self.user_embeddings, self.item_embeddings, self.implicit_factors]:
            nn.init.normal_(emb.weight, std=0.01)
        nn.init.zeros_(self.user_bias.weight)
        nn.init.zeros_(self.item_bias.weight)

    def forward(self, user_ids, item_ids, user_rated_items_list):
        """Predict one rating per (user, item) pair in the batch.

        Args:
            user_ids: (batch_size,) tensor
            item_ids: (batch_size,) tensor
            user_rated_items_list: List of lists, each containing item IDs rated by user.
                Entry ``k`` belongs to ``user_ids[k]``; an empty entry means
                no implicit contribution for that row.

        Returns:
            (batch_size,) tensor of predicted ratings.

        Raises:
            ValueError: If ``user_rated_items_list`` does not have exactly
                one entry per batch row.
        """
        batch_size = len(user_ids)
        # A shorter list would silently leave trailing rows without implicit
        # feedback, so reject misaligned input instead.
        if len(user_rated_items_list) != batch_size:
            raise ValueError(
                f"user_rated_items_list has {len(user_rated_items_list)} entries "
                f"but the batch has {batch_size} users"
            )

        # Standard embeddings
        p_u = self.user_embeddings(user_ids)  # (batch, dim)
        q_i = self.item_embeddings(item_ids)  # (batch, dim)

        # Compute implicit contribution for each user; rows stay zero for
        # users with an empty rated-items entry.
        implicit_sum = torch.zeros_like(p_u)

        for idx, rated_items in enumerate(user_rated_items_list):
            num_rated = len(rated_items)
            if num_rated == 0:
                continue
            # Build the index tensor directly on the batch's device.
            rated_tensor = torch.as_tensor(
                rated_items, dtype=torch.long, device=user_ids.device
            )
            y_j = self.implicit_factors(rated_tensor)  # (num_rated, dim)
            implicit_sum[idx] = y_j.sum(dim=0) / np.sqrt(num_rated)

        # Enhanced user representation
        user_repr = p_u + implicit_sum

        # Interaction
        interaction = (user_repr * q_i).sum(dim=1)

        # Final prediction. squeeze(-1) removes only the trailing size-1 bias
        # dimension, so a batch of one keeps its batch axis.
        prediction = (
            self.global_bias +
            self.user_bias(user_ids).squeeze(-1) +
            self.item_bias(item_ids).squeeze(-1) +
            interaction
        )

        return prediction


# Status message. The check-mark emoji was stored as mojibake (UTF-8 bytes
# decoded as cp1252); restored here.
print("✅ SVD++ implemented!")
print("\nNote: SVD++ requires tracking which items each user has rated.")
print("This is more complex but typically improves RMSE by 1-3%.")

---

## Key Takeaways

1. **Embedding dimension**: 64-128 is typically optimal for MovieLens 100K. Larger dimensions may overfit.

2. **Regularization**: L2 regularization (weight_decay) helps prevent overfitting. 1e-5 is a good starting point.

3. **SVD++**: Incorporating implicit feedback (which items are rated) improves performance but adds complexity.

4. **Trade-offs**: There's always a balance between model complexity and generalization.