# Functional Testing

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Load files
path_inter = "user_video_interaction.xlsx"
path_top50 = "user_top50_trainer.xlsx"
path_itemcf = "itemcf_user_reco_top50.xlsx"

df_inter = pd.read_excel(path_inter)
df_top50 = pd.read_excel(path_top50)
df_itemcf = pd.read_excel(path_itemcf)

# 1) Matrix construction integrity
pivot = df_top50.pivot_table(index="user_id", columns="trainer_id", values="total_score", fill_value=0)
null_total = int(pivot.isnull().sum().sum())
num_users, num_trainers = pivot.shape

# 2) Cosine similarity logic (item-item)
sim = cosine_similarity(pivot.T)  # trainers x trainers
diag_is_one = bool(np.allclose(np.diag(sim), 1.0, atol=1e-8))
symmetry_ok = bool(np.allclose(sim, sim.T, atol=1e-8))

# sample a few entries to show
sim_df_sample = pd.DataFrame(sim[:10, :10], 
                             index=pivot.columns[:10], 
                             columns=pivot.columns[:10]).round(4)
display(sim_df_sample.reset_index().rename(columns={"index":"trainer_id"}))

# 3) Recommendation completeness (50 per user)
rec_counts = df_itemcf.groupby("user_id")["recommended_trainer_id"].count()
rec_completeness_ratio = float((rec_counts == 50).mean())
rec_counts_summary = rec_counts.describe().to_frame("count")
display(rec_counts.reset_index().rename(columns={"recommended_trainer_id":"reco_count"}))

# 4) Fault-case diagnostics
# Build full user list from top50 candidates (or union with interactions/users present)
users_all = pd.Index(df_top50["user_id"].unique()).union(df_itemcf["user_id"].unique())

# interactions per user (reindex to all users, missing->0)
user_actions = df_inter.groupby("user_id")["action_type"].count().reindex(users_all, fill_value=0)
cold_start_users = int((user_actions == 0).sum())
sparse_users = int((user_actions < 3).sum())

# outliers in watch_rate and rating_score (outside [0,5])
watch_outliers = int(df_inter["watch_rate"].dropna().pipe(lambda s: ((s < 0) | (s > 5)).sum()))
rating_outliers = int(df_inter["rating_score"].dropna().pipe(lambda s: ((s < 0) | (s > 5)).sum()))

# Prepare summary
summary = {
    "matrix_null_total": null_total,
    "matrix_shape": {"users": num_users, "trainers": num_trainers},
    "cosine_diag_is_one": diag_is_one,
    "cosine_symmetry_ok": symmetry_ok,
    "reco_50_per_user_ratio": rec_completeness_ratio,
    "num_cold_start_users": cold_start_users,
    "num_sparse_users(<3 actions)": sparse_users,
    "watch_rate_outliers": watch_outliers,
    "rating_score_outliers": rating_outliers
}

summary


trainer_id,trainer_id.1,1,2,3,4,5,6,7,8,9,10
0,1,1.0,0.0133,0.0582,0.0271,0.0068,0.0267,0.0154,0.0238,0.0521,0.0292
1,2,0.0133,1.0,0.0,0.0223,0.038,0.0286,0.0017,0.0086,0.0581,0.0067
2,3,0.0582,0.0,1.0,0.0759,0.0808,0.017,0.0017,0.0116,0.0066,0.0029
3,4,0.0271,0.0223,0.0759,1.0,0.0455,0.0355,0.082,0.0233,0.0046,0.0259
4,5,0.0068,0.038,0.0808,0.0455,1.0,0.0736,0.0029,0.0232,0.0489,0.0727
5,6,0.0267,0.0286,0.017,0.0355,0.0736,1.0,0.0344,0.0176,0.0504,0.0182
6,7,0.0154,0.0017,0.0017,0.082,0.0029,0.0344,1.0,0.0055,0.0125,0.0119
7,8,0.0238,0.0086,0.0116,0.0233,0.0232,0.0176,0.0055,1.0,0.0193,0.0236
8,9,0.0521,0.0581,0.0066,0.0046,0.0489,0.0504,0.0125,0.0193,1.0,0.0266
9,10,0.0292,0.0067,0.0029,0.0259,0.0727,0.0182,0.0119,0.0236,0.0266,1.0


Unnamed: 0,user_id,reco_count
0,U0000,50
1,U0001,50
2,U0002,50
3,U0003,50
4,U0004,50
...,...,...
995,U0995,50
996,U0996,50
997,U0997,50
998,U0998,50


{'matrix_null_total': 0,
 'matrix_shape': {'users': 1000, 'trainers': 1000},
 'cosine_diag_is_one': True,
 'cosine_symmetry_ok': True,
 'reco_50_per_user_ratio': 1.0,
 'num_cold_start_users': 0,
 'num_sparse_users(<3 actions)': 0,
 'watch_rate_outliers': 0,
 'rating_score_outliers': 0}

# Behavioural Testing

In [2]:
# Behavioural Testing for Item-based CF
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from itertools import combinations

# --------------------
# Load data
# --------------------
path_inter = "user_video_interaction.xlsx"
path_top50 = "user_top50_trainer.xlsx"
path_itemcf = "itemcf_user_reco_top50.xlsx"

df_inter = pd.read_excel(path_inter)
df_top50 = pd.read_excel(path_top50)
df_itemcf = pd.read_excel(path_itemcf)

# Trainer specialties (for diversity) - infer from interactions if present; otherwise from a placeholder map
# In this dataset, specialties are not in these three tables, so we approximate diversity using co-occurrence across users.
# However, we can still compute list-wise diversity using item co-sim dissimilarity (1 - cosine).
# Build user-item matrix from top50 for this purpose:
pivot = df_top50.pivot_table(index="user_id", columns="trainer_id", values="total_score", fill_value=0).astype(float)

# Item-item cosine similarity
sim = cosine_similarity(pivot.T)  # trainers x trainers
trainer_ids = pivot.columns.to_list()
sim_df = pd.DataFrame(sim, index=trainer_ids, columns=trainer_ids)

# --------------------
# Helpers
# --------------------
rng = np.random.default_rng(2025)

def user_seeds_from_top50(g, k=5):
    """Top-k liked trainers by total_score > 0 from the user's top50 list."""
    g_pos = g[g["total_score"] > 0].sort_values("total_score", ascending=False)
    return g_pos["trainer_id"].head(k).tolist()

def max_sim_to_seeds(rec_ids, seed_ids):
    if not seed_ids:
        return np.zeros(len(rec_ids))
    # cap to existing ids
    seed_ids = [t for t in seed_ids if t in sim_df.index]
    rec_ids = [t for t in rec_ids if t in sim_df.index]
    if not seed_ids or not rec_ids:
        return np.zeros(len(rec_ids))
    sims = np.array([sim_df.loc[seed_ids, rid].max() for rid in rec_ids])
    return sims

def hit_ratio_at_50(user_id, rec_ids):
    base = set(df_top50.loc[df_top50["user_id"]==user_id, "trainer_id"].tolist())
    rec = set(rec_ids)
    if len(rec)==0:
        return 0.0
    return len(base & rec)/len(rec)

def item_coverage(rec_df, total_items):
    unique_items = rec_df["recommended_trainer_id"].nunique()
    return unique_items/total_items

def gini_coefficient(counts):
    # counts: array of item frequencies
    x = np.sort(counts.astype(np.float64))
    n = x.size
    if n == 0:
        return np.nan
    cumx = np.cumsum(x)
    gini = (n + 1 - 2*(cumx.sum()/cumx[-1])) if cumx[-1] > 0 else 0.0
    return gini / n

# Popularity from interactions
item_pop = df_inter.groupby("trainer_id").size().reindex(trainer_ids).fillna(0).astype(int)
pop_plus = item_pop + 1

# --------------------
# 2.1 结果合理性检验
# --------------------
# For each user, compute seed set, then compute similarity of each recommended item to seeds
group_top50 = df_top50.groupby("user_id")
group_rec = df_itemcf.groupby("user_id")

users = sorted(set(df_itemcf["user_id"].unique()) & set(df_top50["user_id"].unique()))
records = []
hit_list = []
for uid in users:
    seeds = user_seeds_from_top50(group_top50.get_group(uid), k=5)
    rec_ids = group_rec.get_group(uid)["recommended_trainer_id"].tolist()
    sims = max_sim_to_seeds(rec_ids, seeds)
    # thresholds
    prop_sim_ge_03 = float((sims >= 0.3).mean()) if sims.size else 0.0
    prop_sim_ge_05 = float((sims >= 0.5).mean()) if sims.size else 0.0
    avg_max_sim = float(sims.mean()) if sims.size else 0.0
    # hit ratio wrt original Top50 candidate set
    hr = hit_ratio_at_50(uid, rec_ids)
    hit_list.append(hr)
    records.append([uid, avg_max_sim, prop_sim_ge_03, prop_sim_ge_05, hr])

sim_hit_df = pd.DataFrame(records, columns=["user_id","avg_max_sim","prop_sim>=0.3","prop_sim>=0.5","hit_ratio_vs_top50"])

# Random sample of 5 users: show top10 rec with similarity to best seed
sample_users = rng.choice(users, size=5, replace=False)
rows = []
for uid in sample_users:
    seeds = user_seeds_from_top50(group_top50.get_group(uid), k=5)
    rec_g = group_rec.get_group(uid).head(10)
    rec_ids = rec_g["recommended_trainer_id"].tolist()
    sims = max_sim_to_seeds(rec_ids, seeds)
    for rid, s in zip(rec_ids, sims):
        rows.append([uid, tuple(seeds), rid, s])
sample_view = pd.DataFrame(rows, columns=["user_id","seed_trainers(top5)","rec_trainer_id","max_sim_to_seeds"])
display(sample_view)

# --------------------
# 2.2 与项目目标的关联
# --------------------
# Coverage & concentration
total_items = pivot.shape[1]
coverage = item_coverage(df_itemcf, total_items)

freq = df_itemcf["recommended_trainer_id"].value_counts()
hhi = float(((freq/freq.sum())**2).sum())
gini = float(gini_coefficient(freq.values))

# Novelty: average -log(popularity+1)
rec_pop = df_itemcf["recommended_trainer_id"].map(pop_plus).astype(float)
avg_novelty = float(np.mean(-np.log(rec_pop)))

# Diversity: list-wise dissimilarity via (1 - cosine) among recs
def list_diversity_cosine(user_id):
    ids = group_rec.get_group(user_id)["recommended_trainer_id"].tolist()
    ids = [i for i in ids if i in sim_df.index]
    if len(ids) < 2:
        return np.nan
    pair_d = []
    for i, j in combinations(ids, 2):
        pair_d.append(1.0 - float(sim_df.loc[i, j]))
    return float(np.mean(pair_d)) if pair_d else np.nan

div_scores = [list_diversity_cosine(uid) for uid in users]
avg_diversity = float(np.nanmean(div_scores))

# Novelty vs. user's own Top50 base (not-seen rate)
def novelty_rate(uid):
    base = set(group_top50.get_group(uid)["trainer_id"].tolist())
    rec = set(group_rec.get_group(uid)["recommended_trainer_id"].tolist())
    unseen = len(rec - base)/len(rec) if len(rec)>0 else 0.0
    return unseen
novelty_rates = [novelty_rate(uid) for uid in users]
avg_unseen_rate = float(np.mean(novelty_rates))

# Summaries
summary = {
    "avg_max_sim_to_seed": float(sim_hit_df["avg_max_sim"].mean()),
    "prop_sim>=0.3_mean": float(sim_hit_df["prop_sim>=0.3"].mean()),
    "prop_sim>=0.5_mean": float(sim_hit_df["prop_sim>=0.5"].mean()),
    "hit_ratio_vs_top50_mean": float(sim_hit_df["hit_ratio_vs_top50"].mean()),
    "coverage_item_level": coverage,
    "concentration_HHI": hhi,
    "concentration_Gini": gini,
    "avg_list_diversity(1-cosine)": avg_diversity,
    "avg_novelty(-log(popularity+1))": avg_novelty,
    "avg_unseen_rate_vs_top50": avg_unseen_rate
}

display(sim_hit_df)
summary


Unnamed: 0,user_id,seed_trainers(top5),rec_trainer_id,max_sim_to_seeds
0,U0445,"(433, 501, 871, 471, 649)",161,0.051187
1,U0445,"(433, 501, 871, 471, 649)",219,0.066993
2,U0445,"(433, 501, 871, 471, 649)",368,0.076753
3,U0445,"(433, 501, 871, 471, 649)",300,0.11785
4,U0445,"(433, 501, 871, 471, 649)",276,0.040197
5,U0445,"(433, 501, 871, 471, 649)",165,0.062149
6,U0445,"(433, 501, 871, 471, 649)",301,0.046954
7,U0445,"(433, 501, 871, 471, 649)",142,0.10296
8,U0445,"(433, 501, 871, 471, 649)",149,0.062694
9,U0445,"(433, 501, 871, 471, 649)",533,0.073182


Unnamed: 0,user_id,avg_max_sim,prop_sim>=0.3,prop_sim>=0.5,hit_ratio_vs_top50
0,U0000,0.065082,0.0,0.0,0.0
1,U0001,0.080284,0.0,0.0,0.0
2,U0002,0.080220,0.0,0.0,0.0
3,U0003,0.072681,0.0,0.0,0.0
4,U0004,0.093114,0.0,0.0,0.0
...,...,...,...,...,...
995,U0995,0.073032,0.0,0.0,0.0
996,U0996,0.076325,0.0,0.0,0.0
997,U0997,0.088995,0.0,0.0,0.0
998,U0998,0.079764,0.0,0.0,0.0


{'avg_max_sim_to_seed': 0.08151661548936204,
 'prop_sim>=0.3_mean': 0.0052,
 'prop_sim>=0.5_mean': 8e-05,
 'hit_ratio_vs_top50_mean': 0.00038,
 'coverage_item_level': 0.886,
 'concentration_HHI': 0.0043480304,
 'concentration_Gini': 0.6900798194130926,
 'avg_list_diversity(1-cosine)': 0.9605063613546219,
 'avg_novelty(-log(popularity+1))': -4.438895835386569,
 'avg_unseen_rate_vs_top50': 0.99962}

# Decision Evaluation

In [1]:
# === Decision Evaluation for CF alternatives: ItemCF vs UserCF vs SVD vs Hybrid (Unified) ===
import pandas as pd
import numpy as np
from itertools import combinations
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
from IPython.display import display

# --------------------
# Config
# --------------------
TOPN = 50
USERCF_K_DEFAULT = 20     # number of neighbors (adjustable)
SVD_K_DEFAULT    = 40     # latent dimensions (will converge to valid range automatically)

# --------------------
# Load data
# --------------------
# path_inter file is optional; can be reserved for future extensions
path_inter  = "user_video_interaction.xlsx"     # optional
path_top50  = "user_top50_trainer.xlsx"         # required: contains user_id / trainer_id / total_score
path_itemcf = "itemcf_user_reco_top50.xlsx"     # required: contains user_id / recommended_trainer_id / sim_score

# Try to load optional interaction file; absence does not affect process
try:
    df_inter = pd.read_excel(path_inter)
except Exception:
    df_inter = None

df_top50  = pd.read_excel(path_top50)
df_itemcf = pd.read_excel(path_itemcf)

# --------------------
# Build user-item matrix from total_score in Top50
# --------------------
pivot = (df_top50
         .pivot_table(index="user_id", columns="trainer_id", values="total_score", fill_value=0.0)
         .astype(np.float32))
users = pivot.index.tolist()
items = pivot.columns.tolist()
n_users, n_items = pivot.shape

# Prevent dimension overflow
USERCF_K = int(min(USERCF_K_DEFAULT, max(n_users - 1, 1)))
SVD_K = int(min(SVD_K_DEFAULT, max(min(n_users, n_items) - 1, 1)))

# --------------------
# Pre-compute similarities
# --------------------
# item-item & user-user cosine similarity
item_sim = cosine_similarity(pivot.T)  # (n_items x n_items)
item_sim_df = pd.DataFrame(item_sim, index=items, columns=items)
user_sim = cosine_similarity(pivot)    # (n_users x n_users)
user_sim_df = pd.DataFrame(user_sim, index=users, columns=users)

# Seen map
seen = {uid: set(np.flatnonzero(pivot.loc[uid].values > 0).tolist()) for uid in users}

# Safe Top-K selection (handles cases when available elements < K)
def top_k_indices(pred: np.ndarray, k: int) -> np.ndarray:
    mask = np.isfinite(pred)
    m = int(min(k, int(mask.sum())))
    if m <= 0:
        return np.array([], dtype=int)
    # np.argpartition requires m-1
    idx = np.argpartition(-pred, m-1)[:m]
    # fine sorting
    idx = idx[np.argsort(-pred[idx])]
    # only keep finite values
    idx = idx[mask[idx]]
    return idx

# --------------------
# A) Existing ItemCF (provided) - standardize format
# --------------------
itemcf_rec = df_itemcf.copy()
if "trainer_id" not in itemcf_rec.columns and "recommended_trainer_id" in itemcf_rec.columns:
    itemcf_rec = itemcf_rec.rename(columns={"recommended_trainer_id": "trainer_id"})
itemcf_rec["score"] = itemcf_rec.get("sim_score", 0.0)
itemcf_rec = itemcf_rec[["user_id", "trainer_id", "score"]].copy()

# --------------------
# B) User-based CF (cosine, top-K neighbors)
#     score = sum(sim * rating) / sum(|sim|)
# --------------------
pivot_values = pivot.values  # (n_users x n_items)
usercf_rows = []
for uidx, uid in enumerate(users):
    sims = user_sim[uidx].astype(np.float32).copy()
    sims[uidx] = 0.0
    # top-K neighbors
    if USERCF_K > 0:
        # protect np.argpartition when user count is small
        K_eff = min(USERCF_K, max(len(sims) - 1, 1))
        topk_idx = np.argpartition(-sims, K_eff-1)[:K_eff]
        topk_sims = sims[topk_idx]
    else:
        topk_idx = np.array([], dtype=int)
        topk_sims = np.array([], dtype=np.float32)

    denom = np.sum(np.abs(topk_sims)) + 1e-9
    if topk_idx.size == 0 or denom <= 0:
        pred = np.full(n_items, -np.inf, dtype=np.float32)
    else:
        pred = np.dot(topk_sims, pivot_values[topk_idx, :]) / denom
        # mask seen items
        if seen[uid]:
            pred[list(seen[uid])] = -np.inf

    idx = top_k_indices(pred, TOPN)
    if idx.size:
        for i in idx:
            usercf_rows.append([uid, items[i], float(pred[i])])

usercf_rec = pd.DataFrame(usercf_rows, columns=["user_id", "trainer_id", "score"])

# --------------------
# C) Matrix Factorization via TruncatedSVD (k=SVD_K)
# --------------------
svd = TruncatedSVD(n_components=SVD_K, random_state=42)
U = svd.fit_transform(pivot_values).astype(np.float32)   # (n_users x k)
S = svd.singular_values_.astype(np.float32)              # (k,)
VT = svd.components_.astype(np.float32)                  # (k x n_items)
R_hat = (U * S) @ VT                                     # low-rank reconstruction (n_users x n_items)

mf_rows = []
for uidx, uid in enumerate(users):
    pred = R_hat[uidx].copy()
    if seen[uid]:
        pred[list(seen[uid])] = -np.inf
    idx = top_k_indices(pred, TOPN)
    if idx.size:
        for i in idx:
            mf_rows.append([uid, items[i], float(pred[i])])

mf_rec = pd.DataFrame(mf_rows, columns=["user_id", "trainer_id", "score"])

# --------------------
# D) Hybrid: 0.5 * MF_norm + 0.5 * ICF_norm, based on union candidate pool
# --------------------
def per_user_minmax_norm(df, score_col, out_col):
    df = df.copy()
    df[out_col] = df.groupby("user_id")[score_col].transform(
        lambda s: (s - s.min()) / (s.max() - s.min() + 1e-9)
    )
    return df

mf_rec2 = per_user_minmax_norm(mf_rec, "score", "mf_norm")
itemcf_rec2 = per_user_minmax_norm(itemcf_rec, "score", "icf_norm")

hybrid_rows = []
for uid in users:
    mf_u  = mf_rec2[mf_rec2["user_id"] == uid][["trainer_id", "mf_norm"]]
    icf_u = itemcf_rec2[itemcf_rec2["user_id"] == uid][["trainer_id", "icf_norm"]]
    pool = pd.merge(mf_u, icf_u, on="trainer_id", how="outer").fillna(0.0)
    if pool.empty:
        continue
    pool["hybrid"] = 0.5 * pool["mf_norm"] + 0.5 * pool["icf_norm"]
    top_pool = pool.nlargest(min(TOPN, len(pool)), "hybrid")
    for _, r in top_pool.iterrows():
        hybrid_rows.append([uid, r["trainer_id"], float(r["hybrid"])])

hybrid_rec = pd.DataFrame(hybrid_rows, columns=["user_id", "trainer_id", "score"])

# --------------------
# Metrics
# --------------------
group_top50 = df_top50.groupby("user_id")

def seed_set(uid, k=5):
    # Take Top-k by total_score as "interest seeds"
    g = group_top50.get_group(uid).sort_values("total_score", ascending=False)
    return g["trainer_id"].head(k).tolist()

def avg_max_sim(rec_df, uid, k_seed=5):
    # For each recommended trainer, take the maximum similarity relative to seeds, then average across user
    seeds = seed_set(uid, k=k_seed)
    sids = [t for t in seeds if t in item_sim_df.index]
    if not sids:
        return np.nan
    ids = [t for t in rec_df[rec_df["user_id"] == uid]["trainer_id"].tolist() if t in item_sim_df.index]
    if not ids:
        return np.nan
    return float(np.mean([item_sim_df.loc[sids, t].max() for t in ids]))

def coverage(rec_df):
    return float(rec_df["trainer_id"].nunique() / max(n_items, 1))

def HHI(rec_df):
    freq = rec_df["trainer_id"].value_counts()
    share = (freq / max(freq.sum(), 1)).values
    return float(np.sum(share ** 2))

def gini(rec_df):
    freq = rec_df["trainer_id"].value_counts().values.astype(float)
    x = np.sort(freq)
    n = len(x)
    if n == 0 or x.sum() == 0:
        return np.nan
    cumx = np.cumsum(x)
    g = (n + 1 - 2 * (cumx.sum() / cumx[-1]))
    return float(g / n)

def list_diversity(rec_df):
    # Average (1 - cosine) distance within each user list
    vals = []
    for uid, g in rec_df.groupby("user_id"):
        ids = [t for t in g["trainer_id"].tolist() if t in item_sim_df.index]
        if len(ids) < 2:
            continue
        d = []
        for i, j in combinations(ids, 2):
            d.append(1.0 - float(item_sim_df.loc[i, j]))
        if d:
            vals.append(np.mean(d))
    return float(np.mean(vals)) if vals else np.nan

def unseen_rate(rec_df):
    # Proportion of recommendations not present in baseline Top50 (df_top50)
    vals = []
    for uid, g in rec_df.groupby("user_id"):
        if uid not in group_top50.indices:
            continue
        base = set(group_top50.get_group(uid)["trainer_id"].tolist())
        rec  = set(g["trainer_id"].tolist())
        if not rec:
            continue
        vals.append(len(rec - base) / len(rec))
    return float(np.mean(vals)) if vals else np.nan

# Evaluate the four schemes
schemes = {
    "ItemCF(existing)": itemcf_rec,
    "UserCF(K=%d)" % USERCF_K: usercf_rec,
    "SVD(MF,k=%d)" % SVD_K: mf_rec,
    "Hybrid(0.5*MF+0.5*ItemCF)": hybrid_rec
}

rows = []
for name, rec in schemes.items():
    # Average similarity to user interest seeds
    ams = np.nanmean([avg_max_sim(rec, u) for u in users]) if len(users) else np.nan
    cov = coverage(rec)
    hhi = HHI(rec)
    gn  = gini(rec)
    div = list_diversity(rec)
    unsee = unseen_rate(rec)
    rows.append([name, ams, cov, hhi, gn, div, unsee])

results = pd.DataFrame(
    rows,
    columns=["Scheme", "AvgMaxSimToSeeds", "Coverage", "HHI", "Gini", "ListDiversity(1-cosine)", "UnseenRate_vs_Top50"]
)

display(results.round(4))

# Show sample recommendations for each scheme (3 users * Top10)
sample_users = users[:3]
samples = []
for name, rec in schemes.items():
    for uid in sample_users:
        # Take Top10 by descending score (if already TopN order, head(10) is sufficient)
        sub = rec[rec["user_id"] == uid].sort_values("score", ascending=False).head(10)
        top10 = sub["trainer_id"].tolist()
        samples.append([name, uid, top10])

samples_df = pd.DataFrame(samples, columns=["Scheme", "user_id", "Top10_recs"])
display(samples_df)


Unnamed: 0,Scheme,AvgMaxSimToSeeds,Coverage,HHI,Gini,ListDiversity(1-cosine),UnseenRate_vs_Top50
0,ItemCF(existing),0.0815,0.886,0.0043,0.6901,0.9605,0.9996
1,UserCF(K=20),0.1511,0.99,0.0013,0.2875,0.9674,0.9998
2,"SVD(MF,k=40)",0.1001,0.748,0.003,0.5751,0.9635,0.9997
3,Hybrid(0.5*MF+0.5*ItemCF),0.0944,0.878,0.0034,0.6309,0.9623,0.9996


Unnamed: 0,Scheme,user_id,Top10_recs
0,ItemCF(existing),U0000,"[471, 609, 133, 744, 423, 252, 444, 45, 452, 273]"
1,ItemCF(existing),U0001,"[142, 562, 459, 512, 348, 431, 538, 264, 447, ..."
2,ItemCF(existing),U0002,"[305, 273, 180, 281, 301, 96, 224, 212, 83, 133]"
3,UserCF(K=20),U0000,"[44, 195, 667, 115, 600, 452, 561, 863, 174, 6]"
4,UserCF(K=20),U0001,"[826, 419, 792, 422, 484, 252, 276, 220, 473, ..."
5,UserCF(K=20),U0002,"[276, 734, 248, 437, 983, 162, 799, 111, 952, ..."
6,"SVD(MF,k=40)",U0000,"[174, 550, 355, 273, 600, 369, 667, 276, 423, ..."
7,"SVD(MF,k=40)",U0001,"[562, 264, 209, 619, 252, 71, 840, 575, 538, 661]"
8,"SVD(MF,k=40)",U0002,"[423, 270, 355, 510, 222, 273, 619, 895, 276, ..."
9,Hybrid(0.5*MF+0.5*ItemCF),U0000,"[273.0, 423.0, 174.0, 609.0, 471.0, 550.0, 133..."
