In [None]:
import pandas as pd
import json
import numpy as np
from sklearn.metrics import ndcg_score
from scipy.stats import t

# Ground-truth ratings; rows without a movieId cannot be matched to
# recommendations, so they are dropped right after loading.
test_df = pd.read_csv('subset_ratings.csv').loc[
    lambda frame: frame['movieId'].notna()
]

# Top-10 recommendation lists produced by the content-based model
with open("predictions/cbf_top10_subset.json") as f:
    preds_cbf = json.load(f)

# Top-10 recommendation lists produced by the hybrid model
with open("predictions/hybrid_top10_subset.json") as f:
    preds_hybrid = json.load(f)

# Precision and Recall Calculation
def precision_recall_at_k(pred_dict, test_df, K=10):
    """Return micro-averaged (precision, recall) at K.

    Args:
        pred_dict: Mapping from user id (as a string) to a ranked list of
            recommended movie ids.
        test_df: DataFrame with ``userId`` and ``movieId`` columns; every
            movie listed for a user counts as relevant for that user.
        K: Number of top recommendations taken from each list.

    Returns:
        A ``(precision, recall)`` tuple. Hits are pooled over all users that
        have an entry in ``pred_dict``; precision divides by K per such user,
        recall by the number of relevant movies. A metric is 0 when its
        denominator is 0.
    """
    total_hits = 0
    total_recommended = 0
    total_relevant = 0

    for user_id, user_rows in test_df.groupby('userId'):
        key = str(user_id)  # prediction keys are strings
        if key in pred_dict:
            relevant = set(user_rows.movieId)
            top_k = pred_dict[key][:K]

            total_hits += len(relevant & set(top_k))
            # K is charged per user even when the list is shorter than K.
            total_recommended += K
            total_relevant += len(relevant)

    if total_recommended:
        precision = total_hits / total_recommended
    else:
        precision = 0

    if total_relevant:
        recall = total_hits / total_relevant
    else:
        recall = 0

    return precision, recall

# NDCG Calculation
def ndcg_at_k(pred_dict, test_df, K=10):
    """Return the mean binary-relevance NDCG@K over users with predictions.

    For every user that has an entry in ``pred_dict``, the top-K recommended
    movies are scored with gain 1 when the movie appears in that user's rows
    of ``test_df`` and 0 otherwise. DCG discounts each gain by
    ``1 / log2(rank + 1)`` and is normalised by the DCG of an ideal list
    that places ``min(number of relevant movies, K)`` hits at the top.

    The metric is computed directly instead of through
    ``sklearn.metrics.ndcg_score`` because that function raises
    ``ValueError`` for empty rankings (and, in recent releases, one-item
    rankings), and because it expects (true relevance, predicted score)
    pairs rather than an already-ranked relevance vector.

    Args:
        pred_dict: Mapping from user id (as a string) to a ranked list of
            recommended movie ids.
        test_df: DataFrame with ``userId`` and ``movieId`` columns.
        K: Cut-off rank; values below 1 yield a score of 0.0.

    Returns:
        Mean NDCG@K as a float in [0, 1]. Users missing from ``pred_dict``
        are skipped; users with an empty list score 0.0. Returns 0.0 when no
        user could be evaluated.
    """
    k = max(K, 0)
    # discounts[i] is the weight of rank i + 1.
    discounts = 1.0 / np.log2(np.arange(2, k + 2))

    per_user = []
    for u, grp in test_df.groupby('userId'):
        u = str(u)
        if u not in pred_dict:
            continue

        true_items = set(grp.movieId)
        pred_items = pred_dict[u][:k]

        # A repeated recommendation earns gain only once, so DCG can never
        # exceed the ideal DCG.
        seen = set()
        gains = []
        for m in pred_items:
            gains.append(1.0 if m in true_items and m not in seen else 0.0)
            seen.add(m)

        dcg = float(np.dot(gains, discounts[:len(gains)]))
        idcg = float(discounts[:min(len(true_items), k)].sum())
        per_user.append(dcg / idcg if idcg > 0 else 0.0)

    return float(np.mean(per_user)) if per_user else 0.0

# Calculate metrics and confidence intervals
def metrics_with_ci(pred_dict, test_df, K=10, confidence=0.95):
    """Compute per-user precision, recall and NDCG at K with t-based CIs.

    Every user found in ``test_df`` contributes one sample per metric. A
    user with no entry in ``pred_dict``, an empty recommendation list, or a
    non-positive ``K`` contributes 0.0 to all three metrics; the metric
    helpers are not called for such users because an empty ranking cannot be
    scored. Rows whose ``userId`` is missing are ignored (pandas ``groupby``
    drops NaN keys by default).

    Args:
        pred_dict: Mapping from user id (as a string) to a ranked list of
            recommended movie ids.
        test_df: DataFrame with ``userId`` and ``movieId`` columns.
        K: Cut-off rank passed to the metric helpers.
        confidence: Two-sided confidence level of the interval.

    Returns:
        Dict with keys ``'precision'``, ``'recall'`` and ``'ndcg'``; each
        value is a ``(mean, ci_low, ci_high)`` tuple. The bounds are NaN when
        fewer than two users are available, and the mean is NaN as well when
        there are no users.
    """
    precision_samples, recall_samples, ndcg_samples = [], [], []

    # A single groupby pass replaces re-filtering the whole frame per user.
    for user, user_df in test_df.groupby('userId'):
        key = str(user)
        recs = pred_dict.get(key)

        if recs is None or len(recs) == 0 or K <= 0:
            # Nothing was recommended: every metric is zero by definition.
            precision_samples.append(0.0)
            recall_samples.append(0.0)
            ndcg_samples.append(0.0)
            continue

        single_user_preds = {key: recs}
        p, r = precision_recall_at_k(single_user_preds, user_df, K)
        precision_samples.append(p)
        recall_samples.append(r)
        ndcg_samples.append(ndcg_at_k(single_user_preds, user_df, K))

    def calc_ci(samples):
        """Return (mean, low, high) of a Student-t interval for the mean."""
        n = len(samples)
        nan = float('nan')
        if n == 0:
            return nan, nan, nan
        mean = np.mean(samples)
        if n == 1:
            # The sample standard deviation is undefined for one observation.
            return mean, nan, nan
        sem = np.std(samples, ddof=1) / np.sqrt(n)
        margin = sem * t.ppf((1 + confidence) / 2., n - 1)
        return mean, mean - margin, mean + margin

    return {
        'precision': calc_ci(precision_samples),
        'recall': calc_ci(recall_samples),
        'ndcg': calc_ci(ndcg_samples),
    }

# Score each recommender against the same test set
cbf_results = metrics_with_ci(preds_cbf, test_df, K=10)
hybrid_results = metrics_with_ci(preds_hybrid, test_df, K=10)

# Report: one table row per (model, metric) pair
print("Model | Metric | Mean | CI-Lower | CI-Upper")
print("-" * 50)
for model, result in (('CBF', cbf_results), ('Hybrid', hybrid_results)):
    for metric in result:
        mean, ci_low, ci_high = result[metric]
        print(f"{model:<6} | {metric:<9} | {mean:.4f} | {ci_low:.4f} | {ci_high:.4f}")



ValueError: Found array with 0 feature(s) (shape=(1, 0)) while a minimum of 1 is required.