# Evaluation & Benchmarking

Evaluate recommendation quality:
- Precision, Recall, NDCG, Hit Rate, MRR
- Personalization metrics
- Serving latency

In [None]:
import os
import time
import json
import pickle
import random
import math
from pathlib import Path
from collections import defaultdict

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import redis
from redis.cluster import RedisCluster, ClusterNode

def get_redis():
    """Return a connected Redis client, preferring cluster mode.

    First tries a ``RedisCluster`` client seeded with the three known nodes
    and verifies it with ``ping()``. If that attempt fails with any ordinary
    error, falls back to a standalone ``redis.Redis`` connection to
    ``redis-node-1`` and verifies that one with ``ping()`` as well.

    Returns:
        A client (cluster or standalone) that has answered ``ping()``.

    Raises:
        Whatever the standalone connection raises if the fallback fails too.
    """
    nodes = [
        ClusterNode("redis-node-1", 6379),
        ClusterNode("redis-node-2", 6379),
        ClusterNode("redis-node-3", 6379),
    ]
    try:
        r = RedisCluster(startup_nodes=nodes, decode_responses=True)
        r.ping()
        return r
    except Exception:
        # Deliberately broad best-effort fallback, but (unlike a bare
        # ``except:``) KeyboardInterrupt and SystemExit still propagate.
        r = redis.Redis(host="redis-node-1", port=6379, decode_responses=True)
        r.ping()
        return r

# Connect once; the cells below reuse this client for every Redis call.
redis_client = get_redis()
print("Connected to Redis")

## Load Ground Truth

In [None]:
# Input and output locations, relative to the notebook's working directory.
DATA_DIR = Path("../data/processed")
RESULTS_DIR = Path("../results")
RESULTS_DIR.mkdir(exist_ok=True)

# Ground truth is looked up by user id in the evaluation cells below.
ground_truth_path = DATA_DIR / "ground_truth.pkl"
with ground_truth_path.open('rb') as f:
    ground_truth = pickle.load(f)

# Pickled dataset statistics, printed here for reference.
stats_path = DATA_DIR / "dataset_stats.pkl"
with stats_path.open('rb') as f:
    stats = pickle.load(f)

num_ground_truth_users = len(ground_truth)
print(f"Ground truth users: {num_ground_truth_users:,}")
print(f"Dataset stats: {stats}")

In [None]:
# Count what is stored in Redis, one SCAN pass per key family.
def _scan_keys(pattern):
    """Collect every Redis key matching *pattern* into a list."""
    return list(redis_client.scan_iter(match=pattern, count=1000))


rec_keys = _scan_keys("user:recs:*")
history_keys = _scan_keys("user:history:*")
cooc_keys = _scan_keys("item:cooc:*")

for label, keys in (
    ("Recommendations", rec_keys),
    ("User histories", history_keys),
    ("Co-occurrence", cooc_keys),
):
    print(f"{label}: {len(keys):,}")

## Metrics Functions

In [None]:
def precision_at_k(recommended, relevant, k=10):
    """Share of the k recommendation slots that hold a relevant item.

    The denominator is always ``k``, even when fewer than ``k`` items were
    recommended. ``relevant`` must support set intersection.
    """
    top_k = set(recommended[:k])
    matched = top_k & relevant
    return len(matched) / k

def recall_at_k(recommended, relevant, k=10):
    """Share of the relevant items that appear in the top-k list.

    Returns 0 when ``relevant`` is empty.
    """
    found = len(set(recommended[:k]) & relevant)
    if relevant:
        return found / len(relevant)
    return 0

def ndcg_at_k(recommended, relevant, k=10):
    """Binary-relevance NDCG of the top-k list.

    Each relevant item at 0-based position ``i`` contributes
    ``1 / log2(i + 2)``. The total is divided by the best achievable score
    for ``min(k, len(relevant))`` hits. Returns 0 when that ideal score is 0.
    """
    gains = [
        1.0 / np.log2(position + 2)
        for position, item in enumerate(recommended[:k])
        if item in relevant
    ]
    ideal_hits = min(k, len(relevant))
    ideal_gains = [1.0 / np.log2(position + 2) for position in range(ideal_hits)]

    dcg = sum(gains)
    idcg = sum(ideal_gains)
    if idcg > 0:
        return dcg / idcg
    return 0

def hit_rate(recommended, relevant, k=10):
    """Return 1 if at least one top-k recommendation is relevant, else 0."""
    top_k = set(recommended[:k])
    if top_k & relevant:
        return 1
    return 0

def mrr(recommended, relevant):
    """Reciprocal rank of the first relevant item (0 if none is found).

    Ranks are 1-based. Averaging this value over users gives the MRR.
    """
    return next(
        (
            1.0 / rank
            for rank, item in enumerate(recommended, start=1)
            if item in relevant
        ),
        0,
    )

In [None]:
def get_recommendations(redis_client, user_id, k=10):
    """Fetch a user's top-k recommended item ids from Redis.

    Reads the sorted set ``user:recs:<user_id>`` with ``zrevrange`` over
    ranks ``0..k-1`` and converts each returned member to ``int``.
    """
    key = f"user:recs:{user_id}"
    members = redis_client.zrevrange(key, 0, k - 1)
    return list(map(int, members))

def jaccard_similarity(set1, set2):
    """Size of the intersection over size of the union of two sets.

    Returns 0 if either set is empty.
    """
    if set1 and set2:
        shared = set1 & set2
        combined = set1 | set2
        return len(shared) / len(combined)
    return 0

## Evaluate Quality

In [None]:
def evaluate_quality(redis_client, ground_truth, k=10, sample_size=5000):
    """Evaluate recommendation quality metrics against ground truth.

    Args:
        redis_client: Client passed through to ``get_recommendations``.
        ground_truth: Mapping of user id to that user's relevant items.
            Each value is passed as ``relevant`` to the metric helpers, so
            it presumably needs to be a set -- confirm against the pickle.
        k: Cut-off used both for fetching recommendations and for metrics.
        sample_size: Maximum number of users to evaluate; a larger user
            population is randomly down-sampled to this size.

    Returns:
        dict with ``'k'``, ``'users_evaluated'``, ``'users_no_recs'``,
        ``'coverage'`` (evaluated / sampled users), and for each metric
        both ``'<name>@<k>'`` (mean) and ``'<name>@<k>_std'``. Both are 0
        when no sampled user had recommendations.
    """
    # sample users
    users = list(ground_truth.keys())
    if len(users) > sample_size:
        users = random.sample(users, sample_size)

    metrics = {
        'precision': [],
        'recall': [],
        'ndcg': [],
        'hit_rate': [],
        'mrr': []
    }

    evaluated = 0
    no_recs = 0

    for uid in users:
        recs = get_recommendations(redis_client, uid, k)
        if not recs:
            # Users without stored recommendations only count against coverage.
            no_recs += 1
            continue

        relevant = ground_truth[uid]

        metrics['precision'].append(precision_at_k(recs, relevant, k))
        metrics['recall'].append(recall_at_k(recs, relevant, k))
        metrics['ndcg'].append(ndcg_at_k(recs, relevant, k))
        metrics['hit_rate'].append(hit_rate(recs, relevant, k))
        # recs already holds at most k items, so this is effectively MRR@k.
        metrics['mrr'].append(mrr(recs, relevant))
        evaluated += 1

    results = {
        'k': k,
        'users_evaluated': evaluated,
        'users_no_recs': no_recs,
        'coverage': evaluated / len(users) if users else 0
    }

    for name, values in metrics.items():
        if values:
            results[f'{name}@{k}'] = np.mean(values)
            results[f'{name}@{k}_std'] = np.std(values)
        else:
            # Emit both keys so the result schema does not depend on the data.
            results[f'{name}@{k}'] = 0
            results[f'{name}@{k}_std'] = 0

    return results

In [None]:
# Cut-off and sample cap shared by the evaluation cells.
K = 10
SAMPLE_SIZE = min(5000, len(ground_truth))

quality = evaluate_quality(redis_client, ground_truth, k=K, sample_size=SAMPLE_SIZE)

# Assemble the report first, then print it line by line.
quality_report = [
    f"QUALITY METRICS (K={K})",
    f"\nUsers evaluated: {quality['users_evaluated']:,}",
    f"\nUsers w/o recs: {quality['users_no_recs']:,}",
    f"\nCoverage: {quality['coverage']:.1%}",
    f"\nPrecision@{K}: {quality[f'precision@{K}']:.4f}",
    f"\nRecall@{K}: {quality[f'recall@{K}']:.4f}",
    f"\nNDCG@{K}: {quality[f'ndcg@{K}']:.4f}",
    f"\nHit Rate@{K}: {quality[f'hit_rate@{K}']:.4f}",
    f"\nMRR: {quality[f'mrr@{K}']:.4f}",
]
for report_line in quality_report:
    print(report_line)

## Evaluate Personalization

In [None]:
def evaluate_personalization(redis_client, user_ids, k=10, num_pairs=5000, max_users=5000):
    """Measure how different recommendations are across users.

    Fetches the top-k recommendation set for the first ``max_users`` ids,
    then estimates the mean Jaccard similarity over randomly drawn user
    pairs. Pairs are drawn independently, so one pair can be drawn twice.

    Args:
        redis_client: Client passed through to ``get_recommendations``.
        user_ids: Sequence of user ids; only the first ``max_users`` are used.
        k: Number of recommendations fetched per user.
        num_pairs: Upper bound on the number of pairs drawn.
        max_users: Upper bound on the number of users fetched.

    Returns:
        dict with ``'inter_user_similarity'`` (mean Jaccard),
        ``'personalization'`` (1 - mean), ``'users_sampled'`` and
        ``'pairs_compared'``. With fewer than two users that have
        recommendations, or no pairs to draw, similarity is reported as 0.
    """
    # get recommendations for sample of users
    user_recs = {}
    for uid in user_ids[:max_users]:
        recs = get_recommendations(redis_client, uid, k)
        if recs:
            user_recs[uid] = set(recs)

    if len(user_recs) < 2:
        # Same keys as the normal return, so callers can index unconditionally.
        return {
            'inter_user_similarity': 0,
            'personalization': 1,
            'users_sampled': len(user_recs),
            'pairs_compared': 0,
        }

    # compute pairwise similarities
    users = list(user_recs)
    total_pairs = len(users) * (len(users) - 1) // 2
    similarities = []

    for _ in range(min(num_pairs, total_pairs)):
        u1, u2 = random.sample(users, 2)
        similarities.append(jaccard_similarity(user_recs[u1], user_recs[u2]))

    # np.mean([]) would yield NaN with a warning (e.g. num_pairs=0).
    mean_sim = np.mean(similarities) if similarities else 0
    return {
        'inter_user_similarity': mean_sim,
        'personalization': 1 - mean_sim,
        'users_sampled': len(user_recs),
        'pairs_compared': len(similarities)
    }

In [None]:
# Recover the integer user ids from the "user:recs:<id>" key names.
users_with_recs = [int(key.rsplit(':', 1)[-1]) for key in rec_keys]

personalization = evaluate_personalization(redis_client, users_with_recs, k=K)

personalization_report = [
    "PERSONALIZATION METRICS",
    f"\nUsers sampled: {personalization['users_sampled']:,}",
    f"\nPairs compared: {personalization['pairs_compared']:,}",
    f"\nInter-user similarity: {personalization['inter_user_similarity']:.4f}",
    f"\nPersonalization score: {personalization['personalization']:.4f}",
]
for report_line in personalization_report:
    print(report_line)

# Acceptance threshold: mean inter-user similarity must not exceed 0.30.
target = 0.30
if personalization['inter_user_similarity'] <= target:
    status = "PASS"
else:
    status = "FAIL"
print(f"\nTarget (<={target}): {status}")

## Latency Benchmark

In [None]:
def benchmark_latency(redis_client, user_ids, k=10, iterations=1000):
    """Benchmark recommendation serving latency with sequential requests.

    Draws ``iterations`` user ids with replacement, issues up to 100 untimed
    warm-up requests, then times one ``get_recommendations`` call per id.

    Args:
        redis_client: Client passed through to ``get_recommendations``.
        user_ids: Non-empty sequence of user ids to draw requests from.
        k: Number of recommendations requested per call.
        iterations: Number of timed requests.

    Returns:
        dict with ``'iterations'``, ``'throughput'`` (requests per second
        over the whole timed loop) and per-request latency statistics in
        milliseconds: mean, median, p95, p99, min and max.
    """
    latencies = []
    sample_users = random.choices(user_ids, k=iterations)

    # warmup
    for uid in sample_users[:100]:
        get_recommendations(redis_client, uid, k)

    # benchmark -- perf_counter is monotonic and high resolution, unlike
    # time.time(), which can jump if the system clock is adjusted.
    start_total = time.perf_counter()
    for uid in sample_users:
        start = time.perf_counter()
        get_recommendations(redis_client, uid, k)
        latencies.append((time.perf_counter() - start) * 1000)  # ms
    elapsed = time.perf_counter() - start_total

    return {
        'iterations': iterations,
        'throughput': iterations / elapsed,
        'latency_mean': np.mean(latencies),
        'latency_median': np.median(latencies),
        'latency_p95': np.percentile(latencies, 95),
        'latency_p99': np.percentile(latencies, 99),
        'latency_min': min(latencies),
        'latency_max': max(latencies)
    }

In [None]:
latency = benchmark_latency(redis_client, users_with_recs, k=K, iterations=1000)

print(f"Iterations: {latency['iterations']:,}")
print(f"Throughput: {latency['throughput']:.0f} req/sec")
print("\nLatency (ms):")
# One indented line per latency statistic, in report order.
for label, field in (
    ("Mean", 'latency_mean'),
    ("Median", 'latency_median'),
    ("P95", 'latency_p95'),
    ("P99", 'latency_p99'),
    ("Min", 'latency_min'),
    ("Max", 'latency_max'),
):
    print(f"  {label}: {latency[field]:.3f}")

## Visualization

In [None]:
# Figure 1: bar chart of the headline quality metrics at the chosen K
fig, ax = plt.subplots(figsize=(8, 5))

metrics = ['Precision', 'Recall', 'NDCG', 'Hit Rate']
values = [quality[f'{key}@{K}'] for key in ('precision', 'recall', 'ndcg', 'hit_rate')]
colors = ['#2ecc71', '#3498db', '#9b59b6', '#e74c3c']

bars = ax.bar(metrics, values, color=colors, edgecolor='black', linewidth=1.2)
ax.set_ylabel('Score', fontsize=12)
ax.set_title(f'Recommendation Quality Metrics (K={K})', fontsize=14, fontweight='bold')
# Leave 20% headroom above the tallest bar for the value labels.
ax.set_ylim(0, max(values) * 1.2)

# Write each value just above its bar, centred horizontally.
for bar, v in zip(bars, values):
    label_x = bar.get_x() + bar.get_width()/2
    ax.text(label_x, v + 0.01, f'{v:.3f}',
            ha='center', va='bottom', fontsize=11, fontweight='bold')

for side in ('top', 'right'):
    ax.spines[side].set_visible(False)
plt.tight_layout()

figure1_path = RESULTS_DIR / 'figure1_quality_metrics.png'
plt.savefig(figure1_path, dpi=150, bbox_inches='tight')
plt.show()
print(f"Saved: {figure1_path}")

In [None]:
# Figure 2: Latency Under Load
from concurrent.futures import ThreadPoolExecutor, as_completed

def measure_latency_at_load(redis_client, user_ids, num_requests, num_threads, k=10):
    """Measure per-request latency with concurrent requests.

    Draws ``num_requests`` user ids with replacement and issues one
    ``get_recommendations`` call per id through a thread pool of
    ``num_threads`` workers. Each request is timed inside its worker, so
    time spent waiting in the executor queue is not included.

    Args:
        redis_client: Client shared by all worker threads.
        user_ids: Non-empty sequence of user ids to draw requests from.
        num_requests: Total number of requests to issue.
        num_threads: Size of the thread pool (concurrency level).
        k: Number of recommendations requested per call.

    Returns:
        Tuple ``(mean_latency_ms, p95_latency_ms)``.
    """
    sample_users = random.choices(user_ids, k=num_requests)

    def fetch_rec(uid):
        t0 = time.perf_counter()
        get_recommendations(redis_client, uid, k)
        return (time.perf_counter() - t0) * 1000

    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        futures = [executor.submit(fetch_rec, uid) for uid in sample_users]
        # Completion order is irrelevant for the aggregate statistics.
        latencies = [f.result() for f in as_completed(futures)]

    return np.mean(latencies), np.percentile(latencies, 95)

# Sweep the thread-pool size and record mean / P95 latency at each level.
concurrent_users = [1, 5, 10, 20, 50]
mean_latencies = []
p95_latencies = []

print("Measuring latency under load...")
for n in concurrent_users:
    mean_lat, p95_lat = measure_latency_at_load(redis_client, users_with_recs, 300, n)
    mean_latencies.append(mean_lat)
    p95_latencies.append(p95_lat)
    print(f"  {n} concurrent users: {mean_lat:.2f}ms mean, {p95_lat:.2f}ms p95")

# Grouped bar chart: mean and P95 side by side per concurrency level.
fig, ax = plt.subplots(figsize=(8, 5))

x = np.arange(len(concurrent_users))
width = 0.35

bars1 = ax.bar(x - width/2, mean_latencies, width, label='Mean', color='#3498db', edgecolor='black')
bars2 = ax.bar(x + width/2, p95_latencies, width, label='P95', color='#e74c3c', edgecolor='black')

ax.set_xlabel('Concurrent Users', fontsize=12)
ax.set_ylabel('Latency (ms)', fontsize=12)
ax.set_title('Response Latency Under Load', fontsize=14, fontweight='bold')
ax.set_xticks(x)
ax.set_xticklabels(concurrent_users)
ax.legend()

# Label every bar with its height: the mean bars first, then the P95 bars.
for bar in (*bars1, *bars2):
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2, height + 0.1,
            f'{height:.1f}', ha='center', va='bottom', fontsize=9)

for side in ('top', 'right'):
    ax.spines[side].set_visible(False)
ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
figure2_path = RESULTS_DIR / 'figure2_latency_throughput.png'
plt.savefig(figure2_path, dpi=150, bbox_inches='tight')
plt.show()
print(f"Saved: {figure2_path}")

In [None]:
# Figure 3: Precision@K and Recall@K Curves
k_values = [1, 3, 5, 10, 15, 20]
precision_scores = []
recall_scores = []

users = list(ground_truth.keys())
sample_users = random.sample(users, min(3000, len(users)))

print("Evaluating at different K values...")

# Fetch each user's list once at the largest cut-off and reuse it for every
# K. The metric helpers truncate to k themselves, so re-querying Redis per K
# only multiplies the round trips.
max_k = max(k_values)
sampled_recs = {}
for uid in sample_users:
    recs = get_recommendations(redis_client, uid, max_k)
    if recs:
        # Users without recommendations are excluded at every K.
        sampled_recs[uid] = recs

for k in k_values:
    prec_list = [precision_at_k(recs, ground_truth[uid], k)
                 for uid, recs in sampled_recs.items()]
    rec_list = [recall_at_k(recs, ground_truth[uid], k)
                for uid, recs in sampled_recs.items()]

    precision_scores.append(np.mean(prec_list) if prec_list else 0)
    recall_scores.append(np.mean(rec_list) if rec_list else 0)
    print(f"  K={k}: Precision={precision_scores[-1]:.4f}, Recall={recall_scores[-1]:.4f}")

fig, ax = plt.subplots(figsize=(8, 5))

ax.plot(k_values, precision_scores, 'o-', color='#2ecc71', linewidth=2.5,
        markersize=10, label='Precision@K')
ax.plot(k_values, recall_scores, 's-', color='#3498db', linewidth=2.5,
        markersize=10, label='Recall@K')

ax.set_xlabel('K (Number of Recommendations)', fontsize=12)
ax.set_ylabel('Score', fontsize=12)
ax.set_title('Precision and Recall at Different K Values', fontsize=14, fontweight='bold')
ax.set_xticks(k_values)
ax.legend(loc='center right')
ax.grid(True, alpha=0.3)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

plt.tight_layout()
plt.savefig(RESULTS_DIR / 'figure3_precision_recall_curves.png', dpi=150, bbox_inches='tight')
plt.show()
print(f"Saved: {RESULTS_DIR / 'figure3_precision_recall_curves.png'}")