Combine content-based filtering and item-based collaborative filtering scores into a single hybrid score

Outputs the top-10 trainer ranking for each user, ordered by hybrid score

In [1]:
from pathlib import Path

import pandas as pd

# Load both score sheets from the workbook in a single read_excel call;
# passing a list of sheet names returns a dict keyed by sheet name.
excel_path = 'hybrid.xlsx'
score_sheets = pd.read_excel(excel_path, sheet_name=['Content', 'Item-CF'])
content_df = score_sheets['Content']
cf_df = score_sheets['Item-CF']

def guess_field(df, like):
    """Return the first column label of ``df`` whose name contains ``like``.

    The match is a case-insensitive substring test. Labels are compared
    through ``str()`` so that non-string headers (e.g. numeric labels) do not
    raise ``AttributeError``.

    Parameters
    ----------
    df : object exposing a ``columns`` sequence (e.g. a pandas DataFrame)
    like : str
        Substring to look for in the column names.

    Returns
    -------
    The original (unconverted) label of the first matching column, or the
    first column of ``df`` when nothing matches.
    """
    needle = str(like).lower()
    for col in df.columns:
        if needle in str(col).lower():
            return col
    # No match: keep the original best-effort fallback to the first column.
    return df.columns[0]

# Tunable parameters for the hybrid ranking.
CONTENT_WEIGHT = 0.5  # share of the hybrid score taken from the content score (0.5 = equal weighting)
TOP_N = 10            # number of recommendations kept per user
OUTPUT_PATH = Path('user_top10_hybrid.xlsx')
OUTPUT_SHEET = 'hybridTop10'  # sheet name the evaluation cell reads back

# Guess the column names separately for each sheet: the two sheets are not
# guaranteed to use identical headers, and renaming cf_df with names taken
# from content_df would silently leave its columns untouched.
user_field = guess_field(content_df, 'user')
trainer_field = guess_field(content_df, 'trainer')
content_score_field = guess_field(content_df, 'score')
cf_user_field = guess_field(cf_df, 'user')
cf_trainer_field = guess_field(cf_df, 'trainer')
cf_score_field = guess_field(cf_df, 'score')

content_df = content_df.rename(columns={
    user_field: 'user_id',
    trainer_field: 'trainer_id',
    content_score_field: 'content_score',
})
cf_df = cf_df.rename(columns={
    cf_user_field: 'user_id',
    cf_trainer_field: 'trainer_id',
    cf_score_field: 'sim_score',
})

# guess_field falls back to the first column when nothing matches, which can
# map two roles onto the same column. Fail early with a clear message instead
# of a confusing KeyError inside the merge.
for sheet_label, frame, score_col in (
    ('Content', content_df, 'content_score'),
    ('Item-CF', cf_df, 'sim_score'),
):
    missing_cols = {'user_id', 'trainer_id', score_col} - set(frame.columns)
    if missing_cols:
        raise KeyError(
            f"Sheet '{sheet_label}' is missing expected column(s) after renaming: {sorted(missing_cols)}"
        )

# Outer join keeps pairs scored by only one of the two models; the missing
# score is treated as 0.
merge_df = pd.merge(
    content_df, cf_df,
    on=['user_id', 'trainer_id'],
    how='outer'
)
merge_df['content_score'] = merge_df['content_score'].fillna(0)
merge_df['sim_score'] = merge_df['sim_score'].fillna(0)

# Weighted average of the two scores (equal weights by default).
merge_df['hybrid_score'] = (
    CONTENT_WEIGHT * merge_df['content_score']
    + (1 - CONTENT_WEIGHT) * merge_df['sim_score']
)

# Per user, keep the TOP_N rows with the highest hybrid score.
topn_df = (
    merge_df
    .sort_values(['user_id', 'hybrid_score'], ascending=[True, False])
    .groupby('user_id')
    .head(TOP_N)
)

# Write to the sheet the evaluation cell expects. When the workbook already
# exists, append and replace only that sheet so other sheets in the file are
# preserved (if_sheet_exists requires pandas >= 1.3; append mode needs openpyxl).
if OUTPUT_PATH.exists():
    writer_options = {'mode': 'a', 'if_sheet_exists': 'replace'}
else:
    writer_options = {'mode': 'w'}
with pd.ExcelWriter(OUTPUT_PATH, engine='openpyxl', **writer_options) as writer:
    topn_df.to_excel(writer, sheet_name=OUTPUT_SHEET, index=False)

topn_df.head()


Unnamed: 0,user_id,trainer_id,content_score,sim_score,hybrid_score
61,U0000,444,0.571429,0.512319,0.541874
13,U0000,70,0.857143,0.0,0.428571
51,U0000,394,0.571429,0.194508,0.382968
66,U0000,471,0.0,0.72654,0.36327
76,U0000,609,0.0,0.715403,0.357702


# Evaluation

In [1]:
import pandas as pd
import numpy as np

# Read the behaviour log and the recommendation list from the same workbook
# in one call; a list of sheet names yields a dict keyed by sheet name.
evaluation_sheets = pd.read_excel(
    'user_top10_hybrid.xlsx', sheet_name=['Evaluation', 'hybridTop10']
)
behavior_enhanced = evaluation_sheets['Evaluation']
trainer_reco_df = evaluation_sheets['hybridTop10']

# Ideal set per user: trainers the user watched or clicked (positive samples).
is_positive = behavior_enhanced['action_type'].isin(['watch', 'click'])
positive_events = behavior_enhanced[is_positive]
user_ideals = positive_events.groupby('user_id')['trainer_id'].apply(set).to_dict()

# Distribution of ideal-set sizes across users.
ideal_lens = [len(trainers) for trainers in user_ideals.values()]
ideal_stats = {
    label: reducer(ideal_lens)
    for label, reducer in (
        ('mean', np.mean),
        ('median', np.median),
        ('min', np.min),
        ('max', np.max),
    )
}

# Per-user accumulators for the ranking metrics
# (Precision@10, Recall@10, NDCG@10, MAP@10); averaged after the user loop.
precision_list = []
recall_list = []
ndcg_list = []
ap_list = []

def dcg_score(recommended, ideal_set):
    """Discounted cumulative gain of a ranked list with binary relevance.

    Every item of ``recommended`` that is in ``ideal_set`` contributes
    ``1 / log2(rank + 1)``, where ``rank`` is its 1-based position.
    Returns 0 when there are no hits.
    """
    total = 0
    for rank, item in enumerate(recommended, start=1):
        if item in ideal_set:
            total += 1 / np.log2(rank + 1)
    return total

def ndcg_score(recommended, ideal_set, k):
    """Normalised DCG@k of a ranked list with binary relevance.

    Parameters
    ----------
    recommended : iterable
        Recommended items, best first.
    ideal_set : set
        Items considered relevant for the user.
    k : int
        Cut-off rank. Only the first ``k`` recommendations are scored, and the
        ideal DCG assumes ``min(len(ideal_set), k)`` hits at the top ranks.

    Returns
    -------
    float
        DCG@k / IDCG@k, or 0.0 when the ideal DCG is zero (empty ideal set or
        non-positive ``k``).
    """
    k = max(k, 0)
    idcg = sum(1 / np.log2(i + 2) for i in range(min(len(ideal_set), k)))
    if idcg == 0:
        return 0.0
    # Truncate to k so DCG and IDCG cover the same number of ranks; without
    # this the ratio can exceed 1 when more than k items are passed in.
    dcg = dcg_score(list(recommended)[:k], ideal_set)
    return dcg / idcg

TOP_K = 10  # cut-off shared by every "@10" metric below

# Rank each user's recommendations once (best hybrid score first) instead of
# filtering and sorting the whole frame again for every user. A stable sort
# keeps tied scores in their original row order.
ranked_recos = (
    trainer_reco_df
    .sort_values('hybrid_score', ascending=False, kind='mergesort')
    .groupby('user_id', sort=False)['trainer_id']
    .apply(list)
    .to_dict()
)

for user_id, ideals in user_ideals.items():
    # Enforce the cut-off even if the sheet holds more than TOP_K rows per user.
    recos = ranked_recos.get(user_id, [])[:TOP_K]
    if not recos or not ideals:
        # Users without recommendations or without positives are left out of the averages.
        continue

    hit_flags = [tid in ideals for tid in recos]
    n_hits = sum(hit_flags)

    precision = n_hits / len(recos)
    recall = n_hits / len(ideals)
    ndcg = ndcg_score(recos, ideals, k=len(recos))

    # Average precision: mean of precision@rank over the ranks that are hits,
    # normalised by the largest number of hits achievable in this list.
    running_hits, sum_precisions = 0, 0.0
    for rank, is_hit in enumerate(hit_flags, start=1):
        if is_hit:
            running_hits += 1
            sum_precisions += running_hits / rank
    ap = sum_precisions / min(len(ideals), len(recos))

    precision_list.append(precision)
    recall_list.append(recall)
    ndcg_list.append(ndcg)
    ap_list.append(ap)

# Macro averages over the evaluated users.
precision_at10 = np.mean(precision_list)
recall_at10 = np.mean(recall_list)
ndcg_at10 = np.mean(ndcg_list)
map_at10 = np.mean(ap_list)

# Novelty: per user, the share of recommended trainers that are NOT in that
# user's ideal (click/watch) set. Recommendations are grouped once instead of
# masking the whole frame for every user; sort=False keeps users in order of
# first appearance.
recommended_sets = trainer_reco_df.groupby('user_id', sort=False)['trainer_id'].apply(set)
novelty_scores = [
    len(rec_trainers - user_ideals.get(uid, set())) / len(rec_trainers)
    for uid, rec_trainers in recommended_sets.items()
]
novelty_stats = {
    'mean': np.mean(novelty_scores),
    'median': np.median(novelty_scores)
}

# Coverage: distinct users / trainers present in the recommendations relative
# to the distinct users / trainers present in the behaviour data.
user_coverage = trainer_reco_df['user_id'].nunique() / behavior_enhanced['user_id'].nunique()
item_coverage = trainer_reco_df['trainer_id'].nunique() / behavior_enhanced['trainer_id'].nunique()


In [2]:
# Collect every evaluation figure into one report dict, in display order.
report_entries = [
    ('Ideal set sample distribution', ideal_stats),
    ('Precision@10', precision_at10),
    ('Recall@10', recall_at10),
    ('NDCG@10', ndcg_at10),
    ('MAP@10', map_at10),
    ('novelty', novelty_stats),
    ('User Coverage', user_coverage),
    ('Trainer Coverage', item_coverage),
]
results_trainer_enhanced = dict(report_entries)

display(results_trainer_enhanced)

{'Ideal set sample distribution': {'mean': 56.718,
  'median': 57.0,
  'min': 34,
  'max': 79},
 'Precision@10': 0.2097,
 'Recall@10': 0.03747740388550982,
 'NDCG@10': 0.20619795825271678,
 'MAP@10': 0.0825072619047619,
 'novelty': {'mean': 0.7903000000000001, 'median': 0.8},
 'User Coverage': 1.0,
 'Trainer Coverage': 0.816}