# Recommendation System Training & Evaluation

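This notebook demonstrates how to train the ALS (collaborative filtering) and TF-IDF (content-based) models, combine them into a hybrid recommender, sanity-check the resulting recommendations (catalog coverage and score distribution), and save the trained artifacts. Offline ranking metrics such as precision@K are not computed here; see the next steps at the end.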

In [None]:
import sys
sys.path.insert(0, '..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from implicit.als import AlternatingLeastSquares
from scipy.sparse import csr_matrix
import joblib
import json

from app.db import db
from app.config import settings

sns.set_style('whitegrid')
%matplotlib inline

## 1. Load and Explore Data

In [None]:
# Load the order history and report its basic dimensions
orders_df = db.get_all_orders()

order_stats = (
    ("Total orders", len(orders_df)),
    ("Unique users", orders_df['user_id'].nunique()),
    ("Unique products", orders_df['product_id'].nunique()),
)
for stat_label, stat_value in order_stats:
    print(f"{stat_label}: {stat_value}")

orders_df.head()

In [None]:
# Load the product catalog and report how many rows it contains
products_df = db.get_products()
total_products = len(products_df)
print(f"Total products: {total_products}")

products_df.head()

## 2. Data Analysis

In [None]:
# Order distribution per user and per product, shown side by side
user_order_counts = orders_df.groupby('user_id').size()
product_order_counts = orders_df.groupby('product_id').size()

fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# (counts, y-axis label, panel title) for the left and right histogram
histogram_panels = (
    (user_order_counts, 'Number of Users', 'Distribution of Orders per User'),
    (product_order_counts, 'Number of Products', 'Distribution of Orders per Product'),
)
for ax, (counts, y_label, panel_title) in zip(axes, histogram_panels):
    ax.hist(counts, bins=30, edgecolor='black')
    ax.set_xlabel('Number of Orders')
    ax.set_ylabel(y_label)
    ax.set_title(panel_title)

fig.tight_layout()
plt.show()

print(f"Average orders per user: {user_order_counts.mean():.2f}")
print(f"Average orders per product: {product_order_counts.mean():.2f}")

In [None]:
# Bar chart of the ten most frequent product categories
top_categories = products_df['category'].value_counts().head(10)

fig, ax = plt.subplots(figsize=(12, 6))
top_categories.plot(kind='bar', ax=ax)
ax.set_title('Top 10 Product Categories')
ax.set_xlabel('Category')
ax.set_ylabel('Count')
# Slant the category names so long labels do not overlap
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show()

## 3. Build User-Item Matrix

In [None]:
# Convert quantity to implicit feedback.
# Each order line contributes at most MAX_ORDER_WEIGHT so a single bulk order
# cannot dominate a user's preference signal.
MAX_ORDER_WEIGHT = 10

# Aggregate by user-product pairs. The capped weight is computed on a derived
# frame (vectorized `clip` instead of a per-row lambda), so `orders_df` itself
# is left untouched and this cell can be re-run safely.
interactions = (
    orders_df
    .assign(weight=orders_df['quantity'].clip(upper=MAX_ORDER_WEIGHT))
    .groupby(['user_id', 'product_id'])['weight']
    .sum()
    .reset_index()
)

print(f"Total interactions: {len(interactions)}")
interactions.head()

In [None]:
# Create id <-> matrix-index mappings
user_ids = sorted(interactions['user_id'].unique())
item_ids = sorted(interactions['product_id'].unique())

# An id's position in the sorted list is its row (user) / column (item) index
user_index = dict(zip(user_ids, range(len(user_ids))))
item_index = dict(zip(item_ids, range(len(item_ids))))

print(f"Users: {len(user_ids)}")
print(f"Items: {len(item_ids)}")

In [None]:
# Build the sparse user-item matrix (rows = users, columns = items)
rows = interactions['user_id'].map(user_index).values
cols = interactions['product_id'].map(item_index).values
data = interactions['weight'].values

matrix_shape = (len(user_ids), len(item_ids))
user_item_matrix = csr_matrix((data, (rows, cols)), shape=matrix_shape)

# Sparsity = share of user-item cells that hold no interaction
n_user_rows, n_item_cols = user_item_matrix.shape
sparsity = 1 - (user_item_matrix.nnz / (n_user_rows * n_item_cols))

print(f"Matrix shape: {user_item_matrix.shape}")
print(f"Non-zero entries: {user_item_matrix.nnz}")
print(f"Sparsity: {sparsity:.4f}")

In [None]:
# Visualize the sparsity pattern of the top-left 100x100 corner of the matrix
fig, ax = plt.subplots(figsize=(10, 8))
ax.spy(user_item_matrix[:100, :100], markersize=1)
ax.set_title('User-Item Matrix Sparsity (first 100x100)')
ax.set_xlabel('Items')
ax.set_ylabel('Users')
plt.show()

## 4. Train ALS Model

In [None]:
# Train the ALS collaborative-filtering model
# Hyperparameters are collected in one place so they are easy to spot and tune.
als_params = dict(
    factors=64,
    regularization=0.01,
    iterations=20,
    calculate_training_loss=True,
    random_state=42,  # fixed seed for reproducible factors
)
als_model = AlternatingLeastSquares(**als_params)

print("Training ALS model...")
als_model.fit(user_item_matrix, show_progress=True)
print("✓ Training complete")

In [None]:
# Show the top ALS recommendations for the first user
sample_user_id = user_ids[0]
sample_user_idx = user_index[sample_user_id]

# Reverse lookup: matrix column index -> original product id
index_to_item = dict(zip(item_index.values(), item_index.keys()))

print(f"Sample recommendations for user {sample_user_id}:")

ids, scores = als_model.recommend(
    sample_user_idx,
    user_item_matrix[sample_user_idx],
    N=10,
    filter_already_liked_items=True,
)

for item_idx, item_score in zip(ids, scores):
    print(f"  Product {index_to_item[item_idx]}: score {item_score:.4f}")

## 5. Train TF-IDF Model

In [None]:
# Create text features: title, description and category joined by single spaces
title_text, description_text, category_text = (
    products_df[column].fillna('')
    for column in ('title', 'description', 'category')
)
products_df['text'] = title_text + ' ' + description_text + ' ' + category_text

# Train TF-IDF on unigrams and bigrams
tfidf_params = dict(
    stop_words='english',
    max_features=5000,
    ngram_range=(1, 2),
    min_df=1,
    max_df=0.95,
)
tfidf = TfidfVectorizer(**tfidf_params)

tfidf_matrix = tfidf.fit_transform(products_df['text'].values)

print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")
print(f"Vocabulary size: {len(tfidf.vocabulary_)}")

In [None]:
# Test content-based recommendations
sample_product_idx = 0
sample_product = products_df.iloc[sample_product_idx]

print(f"Similar products to: {sample_product['title']}\n")

similarities = cosine_similarity(
    tfidf_matrix[sample_product_idx:sample_product_idx + 1],
    tfidf_matrix,
).flatten()

# Rank every product by similarity, then drop the query product by position.
# Slicing off the first result is not reliable: products with identical text
# tie at 1.0, and a product with an empty TF-IDF vector scores 0 against
# everything, so the query product is not guaranteed to be ranked first.
ranked_indices = similarities.argsort()[::-1]
similar_indices = ranked_indices[ranked_indices != sample_product_idx][:10]  # Top 10

for idx in similar_indices:
    print(f"  {products_df.iloc[idx]['title']}: {similarities[idx]:.4f}")

## 6. Evaluate Hybrid Approach

In [None]:
def get_cf_recommendations(user_id, top_k=10):
    """Get collaborative filtering recommendations.

    Args:
        user_id: Original user identifier (a key of `user_index`).
        top_k: Number of recommendations requested from the ALS model.

    Returns:
        List of `(product_id, score)` tuples in the order returned by
        `als_model.recommend`, or an empty list when the user has no row in
        the user-item matrix.
    """
    user_idx = user_index.get(user_id)
    if user_idx is None:
        return []

    ids, scores = als_model.recommend(
        user_idx,
        user_item_matrix[user_idx],
        N=top_k,
        filter_already_liked_items=True
    )

    # `item_ids` is the sorted list the column indices were derived from, so it
    # doubles as the index -> product id lookup. Using it keeps this function
    # independent of `index_to_item`, which is only created in a demo cell.
    return [(item_ids[idx], float(score)) for idx, score in zip(ids, scores)]

def get_cb_recommendations(user_id, top_k=10):
    """Get content-based recommendations.

    Builds a user profile as the mean TF-IDF vector of the products the user
    has interacted with and ranks all catalog products by cosine similarity
    to that profile.

    Args:
        user_id: User identifier as it appears in `interactions['user_id']`.
        top_k: Maximum number of recommendations to return.

    Returns:
        List of `(product_id, similarity)` tuples sorted by descending
        similarity, excluding products the user already purchased. Empty when
        `top_k` is not positive, the user has no interactions, or none of the
        purchased products are present in `products_df`.
    """
    if top_k <= 0:
        return []

    # Set membership keeps the "already purchased" test O(1) per candidate
    purchased_ids = set(
        interactions.loc[interactions['user_id'] == user_id, 'product_id']
    )
    if not purchased_ids:
        return []

    # Work with row *positions*, not index labels: `tfidf_matrix` rows follow the
    # row order of `products_df`, which only matches the labels when the frame
    # has a default RangeIndex.
    catalog_ids = products_df['id'].to_numpy()
    purchased_mask = products_df['id'].isin(purchased_ids).to_numpy()
    purchased_positions = np.flatnonzero(purchased_mask)
    if purchased_positions.size == 0:
        return []

    # Sparse `.mean(axis=0)` returns an `np.matrix`, which recent scikit-learn
    # versions reject; convert to a plain (1, n_features) ndarray first.
    user_profile = np.asarray(tfidf_matrix[purchased_positions].mean(axis=0))

    similarities = cosine_similarity(user_profile, tfidf_matrix).ravel()

    # Walk candidates from most to least similar, skipping purchased products
    recommendations = []
    for position in similarities.argsort()[::-1]:
        if purchased_mask[position]:
            continue
        recommendations.append((catalog_ids[position], float(similarities[position])))
        if len(recommendations) >= top_k:
            break

    return recommendations

def get_hybrid_recommendations(user_id, top_k=10, cf_weight=0.7, cb_weight=0.3):
    """Blend collaborative-filtering and content-based recommendations.

    Twice `top_k` candidates are requested from each recommender. Each
    candidate's score is multiplied by its source weight, and a product
    returned by both sources receives the sum of its two weighted scores.
    Scores are combined as-is, without rescaling either source.

    Args:
        user_id: User to recommend for.
        top_k: Number of blended recommendations to return.
        cf_weight: Multiplier applied to collaborative-filtering scores.
        cb_weight: Multiplier applied to content-based scores.

    Returns:
        Up to `top_k` `(product_id, combined_score)` tuples, highest score first.
    """
    candidate_count = top_k * 2
    cf_candidates = get_cf_recommendations(user_id, candidate_count)
    cb_candidates = get_cb_recommendations(user_id, candidate_count)

    # Seed the blend with the weighted collaborative-filtering scores
    blended = {prod_id: score * cf_weight for prod_id, score in cf_candidates}

    # Fold in the weighted content-based scores
    for prod_id, score in cb_candidates:
        weighted_score = score * cb_weight
        if prod_id not in blended:
            blended[prod_id] = weighted_score
        else:
            blended[prod_id] += weighted_score

    ranked = sorted(blended.items(), key=lambda pair: pair[1], reverse=True)
    return ranked[:top_k]

In [None]:
# Compare the three recommenders side by side for one sample user
test_user = user_ids[0]

print(f"Recommendations for user {test_user}:\n")

# (heading to print, recommender to call) in display order
method_comparisons = (
    ("Collaborative Filtering:", get_cf_recommendations),
    ("\nContent-Based:", get_cb_recommendations),
    ("\nHybrid (0.7 CF + 0.3 CB):", get_hybrid_recommendations),
)
for heading, recommender in method_comparisons:
    print(heading)
    for prod_id, score in recommender(test_user, 5):
        print(f"  {prod_id}: {score:.4f}")

## 7. Analyze Recommendations

In [None]:
# Coverage analysis
COVERAGE_SAMPLE_USERS = 50  # number of users sampled for the estimate
COVERAGE_TOP_K = 10         # recommendations requested per user

all_recommended = set()

for user_id in user_ids[:COVERAGE_SAMPLE_USERS]:
    recs = get_hybrid_recommendations(user_id, COVERAGE_TOP_K)
    all_recommended.update(prod_id for prod_id, _ in recs)

# Content-based candidates are drawn from `products_df`, so the hybrid
# recommender can surface products that were never ordered. Measure coverage
# against every known product, not only those in the interaction matrix;
# otherwise the ratio is inflated and can exceed 100%.
catalog_ids = set(item_ids) | set(products_df['id'])

# Guard against an empty catalog to avoid a ZeroDivisionError
coverage = len(all_recommended) / len(catalog_ids) if catalog_ids else 0.0
print(f"Catalog coverage: {coverage:.2%}")
print(f"Unique products recommended: {len(all_recommended)} out of {len(catalog_ids)}")

In [None]:
# Score distribution of the hybrid recommendations for the first 50 users
all_scores = [
    rec_score
    for sampled_user in user_ids[:50]
    for _, rec_score in get_hybrid_recommendations(sampled_user, 10)
]

fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(all_scores, bins=30, edgecolor='black')
ax.set_xlabel('Recommendation Score')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Recommendation Scores')
plt.show()

print(f"Mean score: {np.mean(all_scores):.4f}")
print(f"Median score: {np.median(all_scores):.4f}")

## 8. Save Models

In [None]:
def _to_native(value):
    """Return NumPy scalars as plain Python values; pass anything else through.

    `sorted(Series.unique())` yields NumPy scalars (e.g. `numpy.int64`) for
    numeric id columns, which `json.dump` rejects both as dict keys and as values.
    """
    return value.item() if isinstance(value, np.generic) else value


# Make sure the output directory exists before writing any artifact.
# NOTE(review): assumes `settings.model_dir` is a `pathlib.Path` (it is already
# combined with `/` below) — confirm against app.config.
settings.model_dir.mkdir(parents=True, exist_ok=True)

# Save ALS model
als_path = settings.model_dir / "als_model.joblib"
joblib.dump(als_model, als_path)
print(f"✓ ALS model saved to {als_path}")

# Save TF-IDF model together with the product rows its matrix rows refer to
tfidf_path = settings.model_dir / "tfidf.joblib"
joblib.dump({
    'vectorizer': tfidf,
    'matrix': tfidf_matrix,
    'products': products_df[['id', 'title', 'description', 'category', 'text']]
}, tfidf_path)
print(f"✓ TF-IDF model saved to {tfidf_path}")

# Save mappings (ids converted to native Python types so they are JSON-serializable)
mappings_path = settings.model_dir / "mappings.json"
mappings = {
    'user_index': {_to_native(uid): idx for uid, idx in user_index.items()},
    'item_index': {_to_native(iid): idx for iid, idx in item_index.items()},
    'user_ids': [_to_native(uid) for uid in user_ids],
    'item_ids': [_to_native(iid) for iid in item_ids],
    # JSON object keys are always strings, so the index keys are stringified explicitly
    'index_to_user': {str(idx): _to_native(uid) for uid, idx in user_index.items()},
    'index_to_item': {str(idx): _to_native(iid) for iid, idx in item_index.items()}
}

with open(mappings_path, 'w', encoding='utf-8') as f:
    json.dump(mappings, f, indent=2)
print(f"✓ Mappings saved to {mappings_path}")

## 9. Test API

Now you can start the ML service and test the API:

```bash
# In terminal
python -m app.main

# In another terminal
curl "http://localhost:8000/recommendations/user/USER_ID?top_k=10"
```

In [None]:
# Test with requests (if service is running)
import requests

API_TIMEOUT_SECONDS = 5  # without a timeout an unresponsive service would hang this cell

try:
    response = requests.get(
        f'http://localhost:8000/recommendations/user/{test_user}',
        params={'top_k': 5},
        timeout=API_TIMEOUT_SECONDS,
    )
    if response.status_code == 200:
        # Named `api_payload` so it does not overwrite the `data` array that
        # holds the interaction weights of the user-item matrix
        api_payload = response.json()
        print("API Response:")
        print(json.dumps(api_payload, indent=2))
    else:
        print(f"API returned status {response.status_code}")
except requests.exceptions.ConnectionError:
    print("ML service not running. Start it with: python -m app.main")
except requests.exceptions.Timeout:
    print(f"ML service did not respond within {API_TIMEOUT_SECONDS} seconds")

## Summary

This notebook demonstrated:
1. Loading and exploring order/product data
2. Building user-item interaction matrix
3. Training ALS collaborative filtering model
4. Training TF-IDF content-based model
5. Implementing hybrid recommendations
6. Analyzing recommendation quality
7. Saving models for production use

Next steps:
- Tune hyperparameters (factors, regularization, weights)
- Implement evaluation metrics (precision@K, recall@K, NDCG)
- A/B test different approaches
- Monitor performance in production