RECOMMENDATION SYSTEM PROJECT TEMPLATE
======================================
Use Case: Movie Recommendations, Product Recommendations, Content Recommendations

# 1. PROJECT SETUP & ENVIRONMENT

## 1.1 Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import sparse
from scipy.sparse.linalg import svds
import warnings
warnings.filterwarnings('ignore')

# ML libraries
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors

# Deep Learning
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

In [None]:
# Plot appearance: matplotlib theme first, then the seaborn colour palette
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("Set2")

# Reproducibility: seed the NumPy and PyTorch random number generators
np.random.seed(42)
torch.manual_seed(42)

## 1.2 Configuration

In [None]:
# Central experiment settings; later cells read these by key.
CONFIG = dict(
    # Input files
    data_path='ratings.csv',
    items_path='items.csv',
    # Hold-out fraction for the train/test split
    test_size=0.2,
    # Minimum ratings per user/item
    min_ratings=5,
    # Number of latent factors used by the SVD model
    latent_factors=50,
    # Neural model optimisation settings
    batch_size=256,
    learning_rate=0.001,
    num_epochs=20,
    # Number of recommendations
    top_k=10,
    embedding_dim=32,
    random_seed=42,
)

# 2. DATA LOADING & EXPLORATION

## 2.1 Load Data

In [None]:
# Load ratings data (user_id, item_id, rating, timestamp)
# The path comes from CONFIG['data_path']; later cells read the 'user_id',
# 'item_id' and 'rating' columns of this frame.
ratings_df = pd.read_csv(CONFIG['data_path'])

# Load item metadata (item_id, title, genre, etc.)
# Later cells join on 'item_id' and print 'title'.
items_df = pd.read_csv(CONFIG['items_path'])

print(f"Ratings shape: {ratings_df.shape}")
print(f"Items shape: {items_df.shape}")

# Bare expression: shows the first rows when run as a notebook cell
ratings_df.head()

## 2.2 Exploratory Data Analysis

In [None]:
# Basic statistics
print("\n=== RATINGS STATISTICS ===")

# Distinct users/items are counted once and reused for the report and sparsity
n_users = ratings_df['user_id'].nunique()
n_items = ratings_df['item_id'].nunique()

print(f"Number of users: {n_users}")
print(f"Number of items: {n_items}")
print(f"Number of ratings: {len(ratings_df)}")
print(f"Rating range: {ratings_df['rating'].min()} - {ratings_df['rating'].max()}")
print(f"Average rating: {ratings_df['rating'].mean():.2f}")

# Sparsity: share of the user x item grid that has no rating
sparsity = 1 - (len(ratings_df) / (n_users * n_items))
print(f"Data sparsity: {sparsity:.4f} ({sparsity*100:.2f}%)")

In [None]:
# Three side-by-side histograms describing the ratings table
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(18, 5))

# Panel 1: how often each rating value occurs
axes[0].hist(ratings_df['rating'], bins=20, edgecolor='black')
axes[0].set(title='Rating Distribution', xlabel='Rating', ylabel='Count')

# Panel 2: number of ratings given by each user (log-scaled counts)
user_counts = ratings_df.groupby('user_id').size()
axes[1].hist(user_counts, bins=50, edgecolor='black')
axes[1].set(
    title='Ratings per User',
    xlabel='Number of Ratings',
    ylabel='Number of Users',
    yscale='log',
)

# Panel 3: number of ratings received by each item (log-scaled counts)
item_counts = ratings_df.groupby('item_id').size()
axes[2].hist(item_counts, bins=50, edgecolor='black')
axes[2].set(
    title='Ratings per Item',
    xlabel='Number of Ratings',
    ylabel='Number of Items',
    yscale='log',
)

plt.tight_layout()
plt.show()

In [None]:
# Per-item rating count and mean, as flat columns
top_items = (
    ratings_df.groupby('item_id')['rating']
    .agg(count='count', avg_rating='mean')
    .reset_index()
)

# Keep items with at least 50 ratings, then take the 10 best averages
top_items = (
    top_items.loc[top_items['count'] >= 50]
    .sort_values('avg_rating', ascending=False)
    .head(10)
)

# Attach item metadata so titles can be printed
top_items = pd.merge(top_items, items_df, on='item_id')
print("\nTop 10 Rated Items:")
print(top_items[['title', 'avg_rating', 'count']])

# 3. DATA PREPROCESSING

## 3.1 Filter Low Activity Users/Items

In [None]:
# Count ratings per user and item
user_activity = ratings_df.groupby('user_id').size()
item_activity = ratings_df.groupby('item_id').size()

# Keep users/items with at least CONFIG['min_ratings'] ratings.
# NOTE: both counts are taken on the unfiltered frame and the filter is applied
# once, so after removing rows a remaining user or item can still end up with
# fewer than min_ratings ratings.
active_users = user_activity[user_activity >= CONFIG['min_ratings']].index
active_items = item_activity[item_activity >= CONFIG['min_ratings']].index

# .copy() makes ratings_filtered an independent frame: later cells add the
# 'user_idx'/'item_idx' columns to it, and assigning columns to a boolean-mask
# selection of ratings_df triggers pandas' SettingWithCopyWarning (silenced by
# the global warnings filter in this notebook).
ratings_filtered = ratings_df[
    ratings_df['user_id'].isin(active_users)
    & ratings_df['item_id'].isin(active_items)
].copy()

print(f"Original ratings: {len(ratings_df)}")
print(f"Filtered ratings: {len(ratings_filtered)}")
print(f"Reduction: {(1 - len(ratings_filtered)/len(ratings_df))*100:.2f}%")

## 3.2 Create User-Item Matrix

In [None]:
# Distinct raw ids, in order of first appearance
user_ids = ratings_filtered['user_id'].unique()
item_ids = ratings_filtered['item_id'].unique()

# Position -> raw id lookups, and their inverses (raw id -> position)
idx_to_user = dict(enumerate(user_ids))
idx_to_item = dict(enumerate(item_ids))

user_to_idx = {user_id: idx for idx, user_id in idx_to_user.items()}
item_to_idx = {item_id: idx for idx, item_id in idx_to_item.items()}

# Translate raw ids into contiguous matrix positions
ratings_filtered['user_idx'] = ratings_filtered['user_id'].map(user_to_idx)
ratings_filtered['item_idx'] = ratings_filtered['item_id'].map(item_to_idx)

# Matrix dimensions (overwrites the counts taken on the unfiltered data)
n_users, n_items = len(user_ids), len(item_ids)

# Sparse users x items matrix holding the rating values
user_item_matrix = sparse.csr_matrix(
    (
        ratings_filtered['rating'].values,
        (
            ratings_filtered['user_idx'].values,
            ratings_filtered['item_idx'].values,
        ),
    ),
    shape=(n_users, n_items),
)

print(f"User-Item Matrix shape: {user_item_matrix.shape}")

# 4. TRAIN-TEST SPLIT

In [None]:
# Random hold-out split over individual rating rows. The test fraction and the
# seed both come from CONFIG, so the split is reproducible.
# NOTE(review): rows are split independently of user/item, so a user or item
# may end up with ratings only in the test part — confirm this is acceptable.
train_data, test_data = train_test_split(
    ratings_filtered, 
    test_size=CONFIG['test_size'], 
    random_state=CONFIG['random_seed']
)

print(f"Train size: {len(train_data)}")
print(f"Test size: {len(test_data)}")

# 5. COLLABORATIVE FILTERING - MATRIX FACTORIZATION

## 5.1 SVD-based Matrix Factorization

In [None]:
def matrix_factorization_svd(user_item_matrix, k=50):
    """Factorize a sparse user-item rating matrix with truncated SVD.

    The mean of the stored ratings is subtracted from every stored entry
    (missing entries stay at zero), the centred matrix is decomposed with
    ``scipy.sparse.linalg.svds`` and the dense reconstruction is shifted back
    by the mean.

    Args:
        user_item_matrix: SciPy sparse matrix of shape (n_users, n_items).
            It is not modified; integer-typed ratings are accepted.
        k: Requested number of latent factors. ``svds`` needs
            ``k < min(n_users, n_items)``, so larger values are reduced to
            ``min(n_users, n_items) - 1``.

    Returns:
        Tuple ``(predicted_ratings, U, sigma, Vt, mean_rating)`` where
        ``predicted_ratings`` is a dense (n_users, n_items) array, ``U`` is
        (n_users, k), ``sigma`` is the (k, k) diagonal matrix of singular
        values, ``Vt`` is (k, n_items) and ``mean_rating`` is the mean of the
        stored ratings.

    Raises:
        ValueError: If the matrix stores no ratings or is too small to
            compute even one factor.
    """
    # astype returns a float64 copy: the caller's matrix stays untouched and
    # integer ratings no longer break the in-place subtraction below.
    user_item_normalized = user_item_matrix.astype(np.float64)
    if user_item_normalized.nnz == 0:
        raise ValueError("user_item_matrix contains no ratings")

    max_k = min(user_item_normalized.shape) - 1
    if max_k < 1:
        raise ValueError(
            "user_item_matrix needs at least 2 users and 2 items for SVD"
        )
    k = min(k, max_k)

    # Subtract mean rating (only from entries that are actually stored)
    mean_rating = user_item_normalized.data.mean()
    user_item_normalized.data -= mean_rating

    # Perform SVD
    U, sigma, Vt = svds(user_item_normalized, k=k)

    # Reconstruct matrix
    sigma = np.diag(sigma)
    predicted_ratings = U @ sigma @ Vt + mean_rating

    return predicted_ratings, U, sigma, Vt, mean_rating

# Perform SVD with CONFIG['latent_factors'] latent factors.
# NOTE(review): user_item_matrix is built from ratings_filtered, i.e. from all
# ratings before the train/test split, so these factors also see the ratings
# that end up in test_data.
predicted_ratings, U, sigma, Vt, mean_rating = matrix_factorization_svd(
    user_item_matrix, 
    k=CONFIG['latent_factors']
)

print(f"User factors shape: {U.shape}")
print(f"Item factors shape: {Vt.shape}")

## 5.2 Neural Collaborative Filtering

In [None]:
class NCFDataset(Dataset):
    """Expose a ratings frame as (user, item, rating) tensor triples.

    The frame must provide the columns 'user_idx', 'item_idx' and 'rating'.
    """

    def __init__(self, ratings_df):
        columns = ('user_idx', 'item_idx', 'rating')
        # Pull each column out once as a plain array for positional access
        self.users, self.items, self.ratings = (
            ratings_df[column].values for column in columns
        )

    def __len__(self):
        return len(self.users)

    def __getitem__(self, idx):
        # Each field is wrapped in a one-element tensor
        user = torch.LongTensor([self.users[idx]])
        item = torch.LongTensor([self.items[idx]])
        rating = torch.FloatTensor([self.ratings[idx]])
        return (user, item, rating)

In [None]:
class NeuralCF(nn.Module):
    """Neural Collaborative Filtering model.

    User and item indices are looked up in two embedding tables, the two
    embeddings are concatenated and passed through a ReLU MLP, and a final
    linear layer produces one score per (user, item) pair.

    Args:
        num_users: Number of rows in the user embedding table.
        num_items: Number of rows in the item embedding table.
        embedding_dim: Size of each embedding vector.
        layers: Hidden layer widths of the MLP, in order. May be empty, in
            which case the output layer reads the concatenated embeddings
            directly.
    """

    def __init__(self, num_users, num_items, embedding_dim=32, layers=(64, 32, 16, 8)):
        super().__init__()

        # Embeddings
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.item_embedding = nn.Embedding(num_items, embedding_dim)

        # MLP layers: each layer's input width is the previous layer's output
        self.fc_layers = nn.ModuleList()
        input_dim = embedding_dim * 2

        for layer_size in layers:
            self.fc_layers.append(nn.Linear(input_dim, layer_size))
            input_dim = layer_size

        # input_dim is now the width of the last hidden layer (or of the
        # concatenated embeddings when no hidden layers were requested)
        self.output_layer = nn.Linear(input_dim, 1)

        # Initialize weights
        self._init_weights()

    def _init_weights(self):
        """Re-initialize both embedding tables from N(0, 0.01^2)."""
        nn.init.normal_(self.user_embedding.weight, std=0.01)
        nn.init.normal_(self.item_embedding.weight, std=0.01)

    def forward(self, user_idx, item_idx):
        """Score a batch of (user, item) pairs.

        Args:
            user_idx: Long tensor of user indices, shape (batch, 1) or (batch,).
            item_idx: Long tensor of item indices, same shape as ``user_idx``.

        Returns:
            Float tensor of shape (batch, 1) with one unbounded score per pair.
        """
        # Flatten the indices so both (batch, 1) and (batch,) inputs yield
        # (batch, embedding_dim) embeddings
        user_emb = self.user_embedding(user_idx.reshape(-1))
        item_emb = self.item_embedding(item_idx.reshape(-1))

        # Concatenate
        x = torch.cat([user_emb, item_emb], dim=1)

        # MLP layers with ReLU
        for fc in self.fc_layers:
            x = F.relu(fc(x))

        # Output
        return self.output_layer(x)

# Initialize model
# Use the GPU when PyTorch reports one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Embedding tables are sized by the n_users / n_items globals set by earlier
# cells; the hidden layer widths use the class defaults.
model = NeuralCF(
    num_users=n_users,
    num_items=n_items,
    embedding_dim=CONFIG['embedding_dim']
).to(device)

print(model)

# 6. MODEL TRAINING

In [None]:
# Create dataloaders
train_dataset = NCFDataset(train_data)
test_dataset = NCFDataset(test_data)

# Only the training loader shuffles; the test loader keeps row order.
train_loader = DataLoader(train_dataset, batch_size=CONFIG['batch_size'], shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=CONFIG['batch_size'], shuffle=False)

# Loss and optimizer
# Mean squared error between predicted and actual ratings, minimized with Adam.
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=CONFIG['learning_rate'])

In [None]:
def train_epoch(model, dataloader, criterion, optimizer, device):
    """Train the model for one pass over ``dataloader``.

    Args:
        model: Module called as ``model(user, item)``.
        dataloader: Iterable of ``(user, item, rating)`` tensor batches.
        criterion: Loss called as ``criterion(predictions, rating)``. The
            average below assumes it returns the per-batch mean (the default
            reduction of ``nn.MSELoss``).
        optimizer: Optimizer holding the model's parameters.
        device: Device the batches are moved to before the forward pass.

    Returns:
        Average training loss per sample over the epoch.

    Raises:
        ValueError: If the dataloader yields no samples.
    """
    model.train()
    total_loss = 0.0
    total_samples = 0

    for user, item, rating in dataloader:
        user = user.to(device)
        item = item.to(device)
        rating = rating.to(device)

        # Forward
        predictions = model(user, item)
        loss = criterion(predictions, rating)

        # Backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Weight each batch by its size: a plain mean of batch means would
        # over-weight a shorter final batch.
        batch_size = rating.size(0)
        total_loss += loss.item() * batch_size
        total_samples += batch_size

    if total_samples == 0:
        raise ValueError("dataloader yielded no samples")

    return total_loss / total_samples

def evaluate(model, dataloader, criterion, device):
    """Evaluate the model on ``dataloader`` without updating it.

    Args:
        model: Module called as ``model(user, item)``.
        dataloader: Iterable of ``(user, item, rating)`` tensor batches.
        criterion: Loss called as ``criterion(predictions, rating)``. The
            average below assumes it returns the per-batch mean (the default
            reduction of ``nn.MSELoss``).
        device: Device the batches are moved to before the forward pass.

    Returns:
        Tuple ``(loss, rmse, mae)``: the average loss per sample, and the root
        mean squared error and mean absolute error over all samples.

    Raises:
        ValueError: If the dataloader yields no samples.
    """
    model.eval()
    total_loss = 0.0
    total_samples = 0
    prediction_batches = []
    actual_batches = []

    with torch.no_grad():
        for user, item, rating in dataloader:
            user = user.to(device)
            item = item.to(device)
            rating = rating.to(device)

            predictions = model(user, item)
            loss = criterion(predictions, rating)

            # Weight each batch by its size so the loss is a true per-sample
            # average (with MSE it then equals rmse ** 2).
            batch_size = rating.size(0)
            total_loss += loss.item() * batch_size
            total_samples += batch_size

            # Keep whole arrays per batch instead of extending Python lists
            # one element at a time
            prediction_batches.append(predictions.cpu().numpy().ravel())
            actual_batches.append(rating.cpu().numpy().ravel())

    if total_samples == 0:
        raise ValueError("dataloader yielded no samples")

    errors = (
        np.concatenate(prediction_batches).astype(np.float64)
        - np.concatenate(actual_batches).astype(np.float64)
    )
    rmse = float(np.sqrt(np.mean(errors ** 2)))
    mae = float(np.mean(np.abs(errors)))

    return total_loss / total_samples, rmse, mae

In [None]:
# Training loop
history = {'train_loss': [], 'test_loss': [], 'test_rmse': [], 'test_mae': []}

for epoch in range(CONFIG['num_epochs']):
    train_loss = train_epoch(model, train_loader, criterion, optimizer, device)
    test_loss, test_rmse, test_mae = evaluate(model, test_loader, criterion, device)
    
    history['train_loss'].append(train_loss)
    history['test_loss'].append(test_loss)
    history['test_rmse'].append(test_rmse)
    history['test_mae'].append(test_mae)
    
    print(f"Epoch [{epoch+1}/{CONFIG['num_epochs']}] "
          f"Train Loss: {train_loss:.4f}, Test RMSE: {test_rmse:.4f}, Test MAE: {test_mae:.4f}")

In [None]:
# Plot training history
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

axes[0].plot(history['train_loss'], label='Train Loss')
axes[0].plot(history['test_loss'], label='Test Loss')
axes[0].set_xlabel('Epoch')
axes[0].set_ylabel('Loss')
axes[0].set_title('Training History')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

axes[1].plot(history['test_rmse'], label='RMSE')
axes[1].plot(history['test_mae'], label='MAE')
axes[1].set_xlabel('Epoch')
axes[1].set_ylabel('Error')
axes[1].set_title('Test Metrics')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# 7. CONTENT-BASED FILTERING

## 7.1 Item Similarity Matrix

In [None]:
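# Create item feature matrix (e.g., genres, tags)
# Assumes items_df has a 'genre' column.
# NOTE: get_similar_items() indexes the similarity matrix with item_to_idx, so
# row i of the feature matrix must describe the item idx_to_item[i]; reorder
# items_df accordingly before computing the similarity.

# Example: one-hot encode genres
# item_features = pd.get_dummies(items_df['genre'])

# Calculate cosine similarity
# item_similarity = cosine_similarity(item_features)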

## 7.2 TF-IDF for Text Features

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# If items have descriptions
# tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
# tfidf_matrix = tfidf.fit_transform(items_df['description'])
# item_similarity = cosine_similarity(tfidf_matrix)

# 8. GENERATING RECOMMENDATIONS

## 8.1 Top-K Recommendations

In [None]:
def get_top_n_recommendations(model, user_idx, item_indices, n=10, device='cpu'):
    """Get the top N candidate items for one user, best first.

    Args:
        model: Module called as ``model(user, item)`` with long tensors of
            shape (num_candidates, 1); it must return one score per row.
        user_idx: Index of the user to score for.
        item_indices: Sequence of candidate item indices.
        n: Maximum number of items to return.
        device: Device the index tensors are created on.

    Returns:
        Tuple ``(top_items, top_scores)``: a list with up to ``n`` entries of
        ``item_indices`` ordered by descending score, and a NumPy array with
        the matching scores. Both are empty when ``n <= 0`` or there are no
        candidates.
    """
    model.eval()

    num_candidates = len(item_indices)
    # n == 0 must return nothing: the slice [-0:] below would select everything
    if n <= 0 or num_candidates == 0:
        return [], np.array([], dtype=np.float32)

    user_tensor = torch.full(
        (num_candidates, 1), int(user_idx), dtype=torch.long, device=device
    )
    item_tensor = torch.as_tensor(
        item_indices, dtype=torch.long, device=device
    ).unsqueeze(1)

    with torch.no_grad():
        predictions = model(user_tensor, item_tensor)

    # reshape(-1) keeps a 1-D array even for a single candidate, where
    # squeeze() would collapse the result to a 0-d scalar
    predictions = predictions.reshape(-1).cpu().numpy()

    # Get top N
    top_indices = np.argsort(predictions)[-n:][::-1]
    top_items = [item_indices[i] for i in top_indices]
    top_scores = predictions[top_indices]

    return top_items, top_scores

# Example: recommend for the user at matrix position 0
user_idx = 0
all_items = list(range(n_items))

# Items this user already rated, and the remaining candidates in index order
user_rated_items = set(
    ratings_filtered.loc[ratings_filtered['user_idx'] == user_idx, 'item_idx'].values
)
unrated_items = sorted(set(all_items) - user_rated_items)

# Score the candidates with the neural model and keep the best CONFIG['top_k']
recommended_items, scores = get_top_n_recommendations(
    model,
    user_idx,
    unrated_items,
    n=CONFIG['top_k'],
    device=device,
)

print(f"\nTop {CONFIG['top_k']} Recommendations for User {idx_to_user[user_idx]}:")
for i, (item_idx, score) in enumerate(zip(recommended_items, scores), start=1):
    # Translate the matrix position back to the raw id to look up the title
    item_id = idx_to_item[item_idx]
    item_info = items_df.loc[items_df['item_id'] == item_id].iloc[0]
    print(f"{i}. {item_info['title']} (Score: {score:.2f})")

## 8.2 Similar Items (Content-Based)

In [None]:
def get_similar_items(item_id, item_similarity_matrix, items_df, top_n=10):
    """Find the items most similar to ``item_id`` based on content.

    Relies on the module-level ``item_to_idx`` / ``idx_to_item`` mappings.

    Args:
        item_id: Raw id of the query item; must be a key of ``item_to_idx``.
        item_similarity_matrix: Square similarity matrix indexed by item
            position. Assumes row/column ``i`` refers to ``idx_to_item[i]``
            — TODO confirm against how the matrix is built.
        items_df: Item metadata with 'item_id' and 'title' columns.
        top_n: Maximum number of similar items to return.

    Returns:
        DataFrame with columns 'title' and 'similarity', most similar first.
        The query item itself is never included.
    """
    item_idx = item_to_idx[item_id]

    # Rank all items by similarity to the query item (stable for ties)
    similarity_scores = sorted(
        enumerate(item_similarity_matrix[item_idx]),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # Drop the query item by index rather than by position: when another item
    # ties with it at the top similarity, the query item is not guaranteed to
    # be first in the ranking.
    similar_items = [
        (idx, score) for idx, score in similarity_scores if idx != item_idx
    ][:max(top_n, 0)]

    results = []
    for idx, score in similar_items:
        item_id_similar = idx_to_item[idx]
        item_info = items_df[items_df['item_id'] == item_id_similar].iloc[0]
        results.append({
            'title': item_info['title'],
            'similarity': score
        })

    return pd.DataFrame(results, columns=['title', 'similarity'])

# 9. EVALUATION METRICS

## 9.1 Precision@K and Recall@K

In [None]:
def precision_recall_at_k(predictions, actual, k=10):
    """Calculate mean Precision@K and Recall@K over users.

    Args:
        predictions: One ranked list of recommended items per user.
        actual: One collection of relevant items per user, aligned with
            ``predictions``.
        k: Cut-off rank; must be positive.

    Returns:
        Tuple ``(precision, recall)`` averaged over the users that have at
        least one relevant item; ``(0.0, 0.0)`` when there is no such user.

    Raises:
        ValueError: If ``k`` is not positive.
    """
    if k <= 0:
        raise ValueError("k must be a positive integer")

    precisions = []
    recalls = []

    for user_preds, user_actual in zip(predictions, actual):
        relevant = set(user_actual)
        # Users without any relevant item have undefined recall; skip them
        if not relevant:
            continue

        hits = len(set(user_preds[:k]) & relevant)
        precisions.append(hits / k)
        recalls.append(hits / len(relevant))

    # Averaging an empty list would yield NaN plus a RuntimeWarning
    if not precisions:
        return 0.0, 0.0

    return float(np.mean(precisions)), float(np.mean(recalls))

## 9.2 Mean Average Precision (MAP)

In [None]:
def mean_average_precision(predictions, actual):
    """Calculate Mean Average Precision (MAP) over users.

    Args:
        predictions: One ranked list of recommended items per user.
        actual: One collection of relevant items per user, aligned with
            ``predictions``.

    Returns:
        Mean of the per-user average precision, each normalized by that
        user's number of relevant items. Users without relevant items are
        skipped; ``0.0`` is returned when no user can be scored.
    """
    aps = []

    for user_preds, user_actual in zip(predictions, actual):
        relevant = set(user_actual)
        if not relevant:
            continue

        score = 0.0
        num_hits = 0
        already_hit = set()

        for rank, pred in enumerate(user_preds, 1):
            # Count each relevant item once: a repeated prediction must not
            # score again, otherwise AP could exceed 1.
            if pred in relevant and pred not in already_hit:
                already_hit.add(pred)
                num_hits += 1
                score += num_hits / rank

        aps.append(score / len(relevant))

    # Averaging an empty list would yield NaN plus a RuntimeWarning
    return float(np.mean(aps)) if aps else 0.0

## 9.3 Normalized Discounted Cumulative Gain (NDCG)

In [None]:
def ndcg_at_k(predictions, actual, k=10):
    """Calculate mean NDCG@K over users with binary relevance.

    Args:
        predictions: One ranked list of recommended items per user.
        actual: One collection of relevant items per user, aligned with
            ``predictions``.
        k: Cut-off rank; must be positive.

    Returns:
        Mean NDCG@K over the users that have at least one relevant item
        (users whose top-K contains no hit contribute 0); ``0.0`` when no
        user can be scored.

    Raises:
        ValueError: If ``k`` is not positive.
    """
    if k <= 0:
        raise ValueError("k must be a positive integer")

    # Discount for ranks 1..k: 1 / log2(rank + 1)
    discounts = 1.0 / np.log2(np.arange(2, k + 2))

    ndcgs = []

    for user_preds, user_actual in zip(predictions, actual):
        relevant = set(user_actual)
        if not relevant:
            continue

        gains = []
        already_hit = set()
        for pred in user_preds[:k]:
            # A repeated prediction of the same item earns no extra gain
            is_hit = pred in relevant and pred not in already_hit
            if is_hit:
                already_hit.add(pred)
            gains.append(1.0 if is_hit else 0.0)

        dcg = float(np.dot(gains, discounts[:len(gains)]))
        # Ideal ranking: every relevant item (up to k) placed at the top.
        # Sorting the predicted hits instead would ignore relevant items the
        # model missed and overstate the score.
        idcg = float(discounts[:min(len(relevant), k)].sum())

        ndcgs.append(dcg / idcg)

    return float(np.mean(ndcgs)) if ndcgs else 0.0

# 10. HYBRID RECOMMENDER SYSTEM

In [None]:
def hybrid_recommendations(user_idx, model, item_similarity, alpha=0.5, top_n=10):
    """
    Blend collaborative-filtering and content-based scores into one ranking.

    alpha: weight of the collaborative scores (content-based gets 1 - alpha)

    Template placeholder: both score vectors are left empty here and
    user_idx, model and item_similarity are not read yet, so the function
    currently returns two empty arrays.
    """
    # Placeholder score vectors, to be filled in per item
    collaborative = np.array([])   # collaborative filtering scores
    content_based = np.array([])   # content-based scores

    # Weighted blend of the two sources
    blended = alpha * collaborative + (1 - alpha) * content_based

    # Best-first positions of the top_n highest blended scores
    ranking = np.argsort(blended)
    best = ranking[-top_n:][::-1]

    return best, blended[best]

# 11. MODEL PERSISTENCE

In [None]:
# Save model
torch.save({
    'model_state_dict': model.state_dict(),
    'user_to_idx': user_to_idx,
    'item_to_idx': item_to_idx,
    'config': CONFIG
}, 'recommendation_model.pth')

# Save SVD components
np.savez('svd_components.npz', U=U, sigma=sigma, Vt=Vt, mean_rating=mean_rating)

# 12. DEPLOYMENT PIPELINE

In [None]:
class RecommenderAPI:
    """Serve recommendations from a saved NeuralCF checkpoint.

    Everything needed to rebuild the model (id mappings, embedding size) is
    read from the checkpoint itself, so the class does not depend on notebook
    globals such as ``n_users`` or ``CONFIG``.
    """

    def __init__(self, model_path, svd_path, popular_items=None):
        """Load the neural model and the SVD components.

        Args:
            model_path: Checkpoint written with ``torch.save`` containing the
                keys 'model_state_dict', 'user_to_idx', 'item_to_idx' and
                'config'.
            svd_path: ``.npz`` file containing 'U', 'sigma' and 'Vt'
                (and optionally 'mean_rating').
            popular_items: Optional ranked list of item ids returned to
                unknown users. Defaults to an empty list.
        """
        # SECURITY: the checkpoint stores plain Python objects (mappings,
        # config) next to the weights, so it must be fully unpickled
        # (weights_only=False). Only load checkpoints you created yourself.
        # map_location='cpu' lets a checkpoint saved on a GPU load anywhere.
        checkpoint = torch.load(model_path, map_location='cpu', weights_only=False)

        self.user_to_idx = checkpoint['user_to_idx']
        self.item_to_idx = checkpoint['item_to_idx']
        self.idx_to_item = {idx: item_id for item_id, idx in self.item_to_idx.items()}

        # Size the embedding tables from the saved mappings and config
        self.model = NeuralCF(
            len(self.user_to_idx),
            len(self.item_to_idx),
            checkpoint['config']['embedding_dim'],
        )
        self.model.load_state_dict(checkpoint['model_state_dict'])
        self.model.eval()

        # Load SVD components; the context manager closes the archive
        with np.load(svd_path) as svd_data:
            self.U = svd_data['U']
            self.sigma = svd_data['sigma']
            self.Vt = svd_data['Vt']
            self.mean_rating = (
                float(svd_data['mean_rating'])
                if 'mean_rating' in svd_data.files
                else None
            )

        self.popular_items = list(popular_items) if popular_items is not None else []

    def get_recommendations(self, user_id, n=10, exclude_item_ids=None):
        """Get the top N recommended item ids for a user, best first.

        Args:
            user_id: Raw user id. Unknown users get the cold-start fallback.
            n: Maximum number of items to return.
            exclude_item_ids: Optional item ids to leave out, e.g. items the
                user already rated (the checkpoint holds no rating history).

        Returns:
            List of up to ``n`` raw item ids ordered by descending score.
        """
        if n <= 0:
            return []

        if user_id not in self.user_to_idx:
            return self._handle_cold_start(user_id, n)

        user_idx = int(self.user_to_idx[user_id])
        excluded = set(exclude_item_ids) if exclude_item_ids is not None else set()
        candidates = [
            idx for item_id, idx in self.item_to_idx.items()
            if item_id not in excluded
        ]
        if not candidates:
            return []

        # Index tensors use the (batch, 1) layout the model is trained with
        item_tensor = torch.tensor(candidates, dtype=torch.long).unsqueeze(1)
        user_tensor = torch.full_like(item_tensor, user_idx)

        with torch.no_grad():
            scores = self.model(user_tensor, item_tensor).reshape(-1)

        best = torch.topk(scores, k=min(n, len(candidates))).indices.tolist()
        return [self.idx_to_item[candidates[pos]] for pos in best]

    def _handle_cold_start(self, user_id, n):
        """Handle new users (cold start problem).

        Returns the first ``n`` entries of the popularity list given at
        construction time (empty when none was supplied).
        """
        return list(self.popular_items[:n])

# 13. CONCLUSIONS & NEXT STEPS

## Summary:
- Model: Neural Collaborative Filtering
- Test RMSE: X.XX
- Test MAE: X.XX
- Precision@10: X.XX

## Next Steps:
- [ ] Implement Deep & Cross Network (DCN)
- [ ] Add contextual information (time, location)
- [ ] Implement multi-armed bandit for exploration
- [ ] A/B testing framework
- [ ] Real-time recommendation updates
- [ ] Implement diversity and serendipity metrics