# Functional Testing

In [1]:
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np

# ------------- 0. Load data -------------
file_path = "Hybrid_Reco_Videos_Data.xlsx"
videos = pd.read_excel(file_path, sheet_name="videos' info")
rates = pd.read_excel(file_path, sheet_name="filtered_rate_data")
favs = pd.read_excel(file_path, sheet_name="user_favorite_videos")
top10 = pd.read_excel(file_path, sheet_name="all_users_top10_recommendations")
behav = pd.read_excel(file_path, sheet_name="User behavior")

summary = {
    "videos_info": videos.shape,
    "filtered_rate_data": rates.shape,
    "user_favorite_videos": favs.shape,
    "all_users_top10_recommendations": top10.shape,
    "user_behavior": behav.shape,
}
summary


{'videos_info': (10000, 3),
 'filtered_rate_data': (13893, 5),
 'user_favorite_videos': (1000, 2),
 'all_users_top10_recommendations': (10010, 8),
 'user_behavior': (58605, 3)}

In [2]:
# ------------- 1.1 Data Integrity and Input/Output Validation -------------

def data_integrity_report():
    rep = {}
    # Missing value statistics
    rep['missing'] = {
        "videos_info": videos.isna().sum().to_dict(),
        "filtered_rate_data": rates.isna().sum().to_dict(),
        "user_favorite_videos": favs.isna().sum().to_dict(),
        "all_users_top10_recommendations": top10.isna().sum().to_dict(),
        "user_behavior": behav.isna().sum().to_dict()
    }
    # Primary key duplicates
    rep['duplicates'] = {
        "videos.video_id": videos['video_id'].duplicated().sum(),
        "rates.user_video": rates.duplicated(subset=['user_id','video_id']).sum(),
        "favs.user": favs['user_id'].duplicated().sum()
    }
    # Cross-table ID consistency
    rep['cross_sheet'] = {
        "rates.video_in_videos_info_missing": int(~rates['video_id'].isin(videos['video_id']).any()),
        "behav.video_missing_in_videos_info": int((~behav['video_id'].isin(videos['video_id'])).sum()),
        "top10.video_missing_in_videos_info": int((~top10['video_id'].isin(videos['video_id'])).sum()),
        "favs.title_not_in_videos_title": int((~favs['favorite_title'].isin(videos['title'])).sum()),
    }
    # Input-output validity example: take top10 for one user
    sample_user = top10['user_id'].iloc[0]
    sample_result = top10[top10['user_id']==sample_user].sort_values('rank').head(10)
    return rep, sample_user, sample_result

integrity_rep, sample_user, sample_result = data_integrity_report()
integrity_rep, sample_user, sample_result.head(5)


({'missing': {'videos_info': {'video_id': 0, 'title': 0, 'specialities': 0},
   'filtered_rate_data': {'user_id': 0,
    'video_id': 0,
    'trainer_id': 0,
    'action_type': 0,
    'rating_score': 0},
   'user_favorite_videos': {'user_id': 0, 'favorite_title': 154},
   'all_users_top10_recommendations': {'user_id': 0,
    'rank': 0,
    'video_id': 0,
    'title': 0,
    'hybrid_score': 0,
    'popularity_score': 0,
    'collab_score': 0,
    'content_score': 0},
   'user_behavior': {'user_id': 0, 'video_id': 0, 'action_type': 0}},
  'duplicates': {'videos.video_id': 0, 'rates.user_video': 0, 'favs.user': 0},
  'cross_sheet': {'rates.video_in_videos_info_missing': 0,
   'behav.video_missing_in_videos_info': 0,
   'top10.video_missing_in_videos_info': 0,
   'favs.title_not_in_videos_title': 154}},
 'U0296',
   user_id  rank  video_id                                   title  \
 0   U0296     1     42505            Cardio Training Video #42505   
 1   U0296     2     82210           Pil

In [3]:
# ------------- 1.2 Functional Testing of Each Module: Implement Module Functions -------------
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# --- Popularity ---
video_stats = (rates.groupby('video_id')['rating_score']
               .agg(avg_rating='mean', num_ratings='count')
               .reset_index())
popularity_join = videos[['video_id','title','specialities']].merge(video_stats, on='video_id', how='left')
popularity_join['num_ratings'] = popularity_join['num_ratings'].fillna(0).astype(int)
popularity_join['avg_rating'] = popularity_join['avg_rating'].fillna(popularity_join['avg_rating'].mean())

def get_top_popular_videos(df=popularity_join, n=10, min_ratings=5):
    filtered = df[df['num_ratings']>=min_ratings].copy()
    ranked = filtered.sort_values(['avg_rating','num_ratings'], ascending=[False, False]).head(n)
    return ranked[['video_id','title','avg_rating','num_ratings']]

top_pop_sample = get_top_popular_videos(n=10, min_ratings=5)

# --- Content-based ---
content_df = videos.copy()
content_df['specialities'] = (content_df['specialities'].fillna('')
                              .str.lower().str.replace(r'[^\w\s|]', ' ', regex=True)
                              .str.replace(r'\|', ' ', regex=True)
                              .str.replace(r'\s+', ' ', regex=True).str.strip())

vectorizer = TfidfVectorizer(token_pattern=r'(?u)\b\w+\b')
tfidf = vectorizer.fit_transform(content_df['specialities'])
title_to_index = pd.Series(content_df.index, index=content_df['title'])

def get_content_recs_by_title(title, n=10):
    if title not in title_to_index:
        return pd.DataFrame(columns=['video_id','title','score'])
    idx = title_to_index[title]
    sims = cosine_similarity(tfidf[idx], tfidf).ravel()
    sims[idx] = -1  # exclude itself
    top_idx = sims.argsort()[::-1][:n]
    out = content_df.iloc[top_idx][['video_id','title']].copy()
    out['score'] = sims[top_idx]
    return out

# Prepare a valid favorite title
valid_fav_title = favs['favorite_title'].dropna()
valid_fav_title = valid_fav_title[valid_fav_title.isin(videos['title'])]
fav_title_example = valid_fav_title.sample(1, random_state=42).iloc[0] if len(valid_fav_title)>0 else None
content_recs_sample = get_content_recs_by_title(fav_title_example, n=5) if fav_title_example else pd.DataFrame()

# --- Collaborative Filtering ---
# Try to import surprise; fallback to baseline if not available
use_surprise = True
try:
    from surprise import Dataset, Reader, SVD
    from surprise.model_selection import train_test_split
    from surprise import accuracy
except Exception as e:
    use_surprise = False
    surprise_error = str(e)

def train_svd_model(ratings_df):
    rdr = Reader(rating_scale=(ratings_df['rating_score'].min(), ratings_df['rating_score'].max()))
    data = Dataset.load_from_df(ratings_df[['user_id','video_id','rating_score']], rdr)
    trainset = data.build_full_trainset()
    algo = SVD(random_state=42)
    algo.fit(trainset)
    return algo, trainset

def get_collab_recs_svd(user_id, n=10):
    # Candidate set: all videos the user has not rated
    if user_id not in rates['user_id'].unique():
        return pd.DataFrame(columns=['video_id','title','score'])
    # Build anti-testset
    inner_uid = svd_trainset.to_inner_uid(user_id)
    unseen_inner_iids = [i for i in svd_trainset.all_items()
                         if svd_trainset.ur[inner_uid] is not None and i not in [j for (j, _) in svd_trainset.ur[inner_uid]]]
    # If ur is empty (extreme case), recommend all
    if len(unseen_inner_iids)==0:
        unseen_inner_iids = list(svd_trainset.all_items())
    preds = []
    for inner_iid in unseen_inner_iids:
        raw_iid = svd_trainset.to_raw_iid(inner_iid)
        preds.append((raw_iid, svd_algo.predict(user_id, raw_iid).est))
    recs = sorted(preds, key=lambda x: x[1], reverse=True)[:n]
    out = pd.DataFrame({'video_id':[int(i) for i,_ in recs],'score':[s for _,s in recs]})
    out = out.merge(videos[['video_id','title']], on='video_id', how='left')
    return out

# Baseline (if surprise not supported)
global_mean = rates['rating_score'].mean()
item_mean = rates.groupby('video_id')['rating_score'].mean()

def get_collab_recs_baseline(user_id, n=10):
    seen = set(rates.loc[rates['user_id']==user_id, 'video_id'].values)
    candidates = item_mean[~item_mean.index.isin(seen)].sort_values(ascending=False).head(n)
    out = pd.DataFrame({'video_id': candidates.index.astype(int), 'score': candidates.values})
    out = out.merge(videos[['video_id','title']], on='video_id', how='left')
    return out

if use_surprise:
    svd_algo, svd_trainset = train_svd_model(rates)
    collab_recs_sample = get_collab_recs_svd(sample_user, n=5)
else:
    collab_recs_sample = get_collab_recs_baseline(sample_user, n=5)

# --- Hybrid with fallback ---
def minmax_normalize(s):
    s = s.astype(float)
    return (s - s.min()) / (s.max() - s.min() + 1e-8)

def hybrid_recommender(user_id, n=10, weights=(0.33, 0.33, 0.34)):
    pw, cw, rw = weights
    total = pw+cw+rw
    if total <= 0:
        raise ValueError("Invalid weights.")
    pw, cw, rw = pw/total, cw/total, rw/total

    # Popularity
    pop_scores = video_stats[['video_id','avg_rating']].copy()
    pop_scores['popularity_score'] = minmax_normalize(pop_scores['avg_rating'])
    pop_scores = pop_scores[['video_id','popularity_score']]

    # Collaborative
    if user_id in rates['user_id'].unique():
        if use_surprise:
            collab = get_collab_recs_svd(user_id, n=len(videos)) # get as many candidates as possible
        else:
            collab = get_collab_recs_baseline(user_id, n=len(videos))
        collab = collab[['video_id','score']].rename(columns={'score':'collab_score'})
        collab['collab_score'] = minmax_normalize(collab['collab_score'])
        has_history = True
    else:
        collab = pd.DataFrame(columns=['video_id','collab_score'])
        has_history = False

    # Content (use user favorites if available, otherwise empty)
    fav_title = None
    if user_id in favs['user_id'].values:
        fav_title = favs.loc[favs['user_id']==user_id, 'favorite_title'].dropna()
        fav_title = fav_title[fav_title.isin(videos['title'])]
        fav_title = fav_title.iloc[0] if len(fav_title)>0 else None
    if fav_title is not None:
        cont = get_content_recs_by_title(fav_title, n=200)[['video_id','score']].rename(columns={'score':'content_score'})
        cont['content_score'] = minmax_normalize(cont['content_score'])
    else:
        cont = pd.DataFrame(columns=['video_id','content_score'])

    # Merge
    base = videos[['video_id','title']].copy()
    merged = (base
              .merge(pop_scores, on='video_id', how='left')
              .merge(collab, on='video_id', how='left')
              .merge(cont, on='video_id', how='left'))
    merged[['popularity_score','collab_score','content_score']] = merged[['popularity_score','collab_score','content_score']].fillna(0.0)

    # Cold-start: use only Popularity
    if not has_history:
        merged['hybrid_score'] = merged['popularity_score']
    else:
        merged['hybrid_score'] = pw*merged['popularity_score'] + rw*merged['collab_score'] + cw*merged['content_score']

    out = merged.sort_values('hybrid_score', ascending=False).head(n)
    return out[['video_id','title','hybrid_score','popularity_score','collab_score','content_score']]

hybrid_sample = hybrid_recommender(sample_user, n=5)

# ------------- 1.3 Fault Case Analysis -------------

# 1) Favorite title not found
missing_favs = favs['favorite_title'].isna().sum() + (~favs['favorite_title'].isin(videos['title'])).sum()

# 2) Cold-start users (no ratings)
all_users_with_ratings = set(rates['user_id'].unique())
all_users = set(favs['user_id']).union(set(behav['user_id'])).union(set(top10['user_id']))
cold_start_users = sorted(list(all_users - all_users_with_ratings))
cold_user_example = cold_start_users[0] if len(cold_start_users)>0 else None
hybrid_cold = hybrid_recommender(cold_user_example, n=5) if cold_user_example else pd.DataFrame()

# 3) Videos without ratings
videos_without_ratings = set(videos['video_id']) - set(rates['video_id'])
count_videos_without_ratings = len(videos_without_ratings)

# 4) Extreme weights (1,0,0)
hybrid_pop_only = hybrid_recommender(sample_user, n=5, weights=(1,0,0))
hybrid_collab_only = hybrid_recommender(sample_user, n=5, weights=(0,0,1))
hybrid_content_only = hybrid_recommender(sample_user, n=5, weights=(0,1,0))

# 5) Popularity small-sample anomaly check: does Top50 contain videos with num_ratings=1
pop_top50 = get_top_popular_videos(n=50, min_ratings=1)
pop_top50_with_small_sample = pop_top50[pop_top50['num_ratings']==1]

# Summarize key metrics
functional_summary = {
    "missing_favorite_titles_or_not_found": int(missing_favs),
    "cold_start_user_count": len(cold_start_users),
    "cold_start_user_example": cold_user_example,
    "videos_without_any_ratings": int(count_videos_without_ratings),
    "pop_top50_small_sample_count": int(pop_top50_with_small_sample.shape[0]),
    "surprise_available": use_surprise
}

functional_summary


  merged[['popularity_score','collab_score','content_score']] = merged[['popularity_score','collab_score','content_score']].fillna(0.0)


{'missing_favorite_titles_or_not_found': 308,
 'cold_start_user_count': 1,
 'cold_start_user_example': 'U1000',
 'videos_without_any_ratings': 3957,
 'pop_top50_small_sample_count': 36,
 'surprise_available': True}

In [4]:
# Display key sample tables
print("Sample - Popularity Top10 (min_ratings=5)")
display(top_pop_sample)
if fav_title_example:
    display(content_recs_sample)
print("Sample - Collaborative Top5")
display(collab_recs_sample)
print("Sample - Hybrid Top5 (default weights)")
display(hybrid_sample)

if not hybrid_cold.empty:
    display(hybrid_cold)

if not pop_top50_with_small_sample.empty:
    display(pop_top50_with_small_sample)

# Extra diagnostic: number of duplicate titles
dup_title_count = videos['title'].duplicated().sum()
dup_title_count


Sample - Popularity Top10 (min_ratings=5)


Unnamed: 0,video_id,title,avg_rating,num_ratings
4244,42505,Cardio Training Video #42505,4.72,5
2084,20905,Weight Loss Training Video #20905,4.46,5
7601,76102,Powerlifting Training Video #76102,4.38,5
8219,82210,Pilates Training Video #82210,4.38,5
6573,65804,Rehabilitation Training Video #65804,4.34,5
396,4007,Flexibility Training Video #4007,4.32,5
423,4304,Pilates Training Video #4304,4.3,5
3574,35805,Strength Training Training Video #35805,4.285714,7
2698,27009,Rehabilitation Training Video #27009,4.28,5
295,3006,CrossFit Training Video #3006,4.26,5


Unnamed: 0,video_id,title,score
7088,70909,Yoga Training Video #70909,1.0
3072,30803,Yoga Training Video #30803,1.0
6921,69302,Elderly Fitness Training Video #69302,1.0
2045,20506,Yoga Training Video #20506,1.0
7734,77405,Elderly Fitness Training Video #77405,1.0


Sample - Collaborative Top5


Unnamed: 0,video_id,score,title
0,73602,3.907086,Pilates Training Video #73602
1,66105,3.889024,Pilates Training Video #66105
2,52506,3.875605,Flexibility Training Video #52506
3,55003,3.866376,Flexibility Training Video #55003
4,91007,3.861769,Rehabilitation Training Video #91007


Sample - Hybrid Top5 (default weights)


Unnamed: 0,video_id,title,hybrid_score,popularity_score,collab_score,content_score
6736,67407,Pilates Training Video #67407,0.845246,0.822222,0.717391,1.0
7769,77710,Pilates Training Video #77710,0.783743,0.762963,0.594014,1.0
5857,58608,Pilates Training Video #58608,0.780702,0.711111,0.635397,1.0
2434,24405,CrossFit Training Video #24405,0.764372,1.0,0.844232,0.446465
3270,32801,HIIT Training Video #32801,0.740035,0.711111,0.515789,1.0


Unnamed: 0,video_id,title,hybrid_score,popularity_score,collab_score,content_score
416,4207,CrossFit Training Video #4207,1.0,1.0,0.0,0.0
7624,76305,Prenatal Fitness Training Video #76305,1.0,1.0,0.0,0.0
6728,67309,Rehabilitation Training Video #67309,1.0,1.0,0.0,0.0
7663,76704,Prenatal Fitness Training Video #76704,1.0,1.0,0.0,0.0
6694,67005,Powerlifting Training Video #67005,1.0,1.0,0.0,0.0


Unnamed: 0,video_id,title,avg_rating,num_ratings
58,609,CrossFit Training Video #609,5.0,1
102,1103,Sports-Specific Training Video #1103,5.0,1
180,1901,Yoga Training Video #1901,5.0,1
308,3109,Prenatal Fitness Training Video #3109,5.0,1
397,4008,Flexibility Training Video #4008,5.0,1
416,4207,CrossFit Training Video #4207,5.0,1
427,4308,Elderly Fitness Training Video #4308,5.0,1
491,5002,CrossFit Training Video #5002,5.0,1
543,5504,Rehabilitation Training Video #5504,5.0,1
555,5606,Functional Training Training Video #5606,5.0,1


0

In [6]:
# Self-exclusion check (Content-based)
if fav_title_example:
    contains_self = (content_recs_sample['title'] == fav_title_example).any()
else:
    contains_self = False

# Display extreme weight results
print("Hybrid Top5 - Popularity only (1,0,0)")
display(hybrid_pop_only)
print("Hybrid Top5 - Collaborative only (0,0,1)")
display(hybrid_collab_only)
print("Hybrid Top5 - Content only (0,1,0)")
display(hybrid_content_only)

{"content_based_excludes_self": (not contains_self)}


Hybrid Top5 - Popularity only (1,0,0)


Unnamed: 0,video_id,title,hybrid_score,popularity_score,collab_score,content_score
416,4207,CrossFit Training Video #4207,1.0,1.0,0.663644,0.423923
7624,76305,Prenatal Fitness Training Video #76305,1.0,1.0,0.738615,0.0
6728,67309,Rehabilitation Training Video #67309,1.0,1.0,0.746554,0.0
7663,76704,Prenatal Fitness Training Video #76704,1.0,1.0,0.656218,0.0
6694,67005,Powerlifting Training Video #67005,1.0,1.0,0.572582,0.0


Hybrid Top5 - Collaborative only (0,0,1)


Unnamed: 0,video_id,title,hybrid_score,popularity_score,collab_score,content_score
7351,73602,Pilates Training Video #73602,1.0,0.933333,1.0,0.0
6604,66105,Pilates Training Video #66105,0.983827,0.927778,0.983827,0.0
5245,52506,Flexibility Training Video #52506,0.971811,0.9,0.971811,0.0
5492,55003,Flexibility Training Video #55003,0.963548,0.804444,0.963548,0.0
9096,91007,Rehabilitation Training Video #91007,0.959422,0.896296,0.959422,0.0


Hybrid Top5 - Content only (0,1,0)


Unnamed: 0,video_id,title,hybrid_score,popularity_score,collab_score,content_score
6736,67407,Pilates Training Video #67407,1.0,0.822222,0.717391,1.0
3143,31504,HIIT Training Video #31504,1.0,0.0,0.0,1.0
7769,77710,Pilates Training Video #77710,1.0,0.762963,0.594014,1.0
1265,12706,CrossFit Training Video #12706,1.0,0.666667,0.557014,1.0
3363,33704,CrossFit Training Video #33704,1.0,0.0,0.0,1.0


{'content_based_excludes_self': True}

In [7]:
# Targeted diagnosis of whether videos without ratings affect fusion: Output the complete rating table for the specified user, then observe the three scores for a video without a rating.
def hybrid_scores_full(user_id, weights=(0.33,0.33,0.34)):
    pw, cw, rw = weights
    total = pw+cw+rw
    pw, cw, rw = pw/total, cw/total, rw/total

    # Popularity
    pop_scores = video_stats[['video_id','avg_rating']].copy()
    pop_scores['popularity_score'] = minmax_normalize(pop_scores['avg_rating'])
    pop_scores = pop_scores[['video_id','popularity_score']]

    # Collaborative
    if user_id in rates['user_id'].unique():
        if use_surprise:
            collab = get_collab_recs_svd(user_id, n=len(videos))
        else:
            collab = get_collab_recs_baseline(user_id, n=len(videos))
        collab = collab[['video_id','score']].rename(columns={'score':'collab_score'})
        collab['collab_score'] = minmax_normalize(collab['collab_score'])
        has_history = True
    else:
        collab = pd.DataFrame(columns=['video_id','collab_score'])
        has_history = False

    # Content
    fav_title = None
    if user_id in favs['user_id'].values:
        fav_title = favs.loc[favs['user_id']==user_id, 'favorite_title'].dropna()
        fav_title = fav_title[fav_title.isin(videos['title'])]
        fav_title = fav_title.iloc[0] if len(fav_title)>0 else None
    if fav_title is not None:
        cont = get_content_recs_by_title(fav_title, n=200)[['video_id','score']].rename(columns={'score':'content_score'})
        cont['content_score'] = minmax_normalize(cont['content_score'])
    else:
        cont = pd.DataFrame(columns=['video_id','content_score'])

    base = videos[['video_id','title']].copy()
    merged = (base
              .merge(pop_scores, on='video_id', how='left')
              .merge(collab, on='video_id', how='left')
              .merge(cont, on='video_id', how='left'))
    merged[['popularity_score','collab_score','content_score']] = merged[['popularity_score','collab_score','content_score']].fillna(0.0)
    if not has_history:
        merged['hybrid_score'] = merged['popularity_score']
    else:
        merged['hybrid_score'] = pw*merged['popularity_score'] + rw*merged['collab_score'] + cw*merged['content_score']
    return merged

no_rate_video = next(iter(videos_without_ratings)) if len(videos_without_ratings)>0 else None
scores_df = hybrid_scores_full(sample_user)
row_no_rate = scores_df[scores_df['video_id']==no_rate_video]

row_no_rate


Unnamed: 0,video_id,title,popularity_score,collab_score,content_score,hybrid_score
9824,98305,Powerlifting Training Video #98305,0.0,0.0,0.0,0.0


In [8]:
# Find a collection title that does not exist in videos (or is missing) for testing.
missing_fav_rows = favs[(favs['favorite_title'].isna()) | (~favs['favorite_title'].isin(videos['title']))]
missing_fav_example = missing_fav_rows.head(1)
missing_title = None if missing_fav_example.empty else missing_fav_example['favorite_title'].iloc[0]
user_with_missing_fav = None if missing_fav_example.empty else missing_fav_example['user_id'].iloc[0]

# Run content recall for users with missing collections (should return an empty table, no error reported)
content_missing_case = get_content_recs_by_title(missing_title, n=5) if missing_title is not None else pd.DataFrame()

{
    "missing_fav_example": None if missing_fav_example.empty else missing_fav_example.to_dict(orient="records"),
    "content_rec_rows_returned": 0 if content_missing_case is None or content_missing_case.empty else content_missing_case.shape[0],
    "error_when_missing": False
}


{'missing_fav_example': [{'user_id': 'U0013', 'favorite_title': nan}],
 'content_rec_rows_returned': 0,
 'error_when_missing': False}

In [9]:

duplicates_report = {
    "videos.video_id_duplicated": int(videos['video_id'].duplicated().sum()),
    "rates.user_video_duplicated": int(rates.duplicated(subset=['user_id','video_id']).sum()),
    "favs.user_duplicated": int(favs['user_id'].duplicated().sum())
}
duplicates_report


{'videos.video_id_duplicated': 0,
 'rates.user_video_duplicated': 0,
 'favs.user_duplicated': 0}

# Behavioral Testing

In [11]:
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
from collections import Counter

# ---------- Load data ----------
file_path = "Hybrid_Reco_Videos_Data.xlsx"
videos = pd.read_excel(file_path, sheet_name="videos' info")
rates = pd.read_excel(file_path, sheet_name="filtered_rate_data")
recs = pd.read_excel(file_path, sheet_name="all_users_top10_recommendations")
behav = pd.read_excel(file_path, sheet_name="User behavior")

# Helper: split video tags
def split_tags(s):
    if pd.isna(s): 
        return []
    return [t.strip().lower() for t in str(s).split('|') if t.strip()]

videos['tags'] = videos['specialities'].apply(split_tags)

# ------------- 2.2 System Output Analysis -------------
# A) Top-N recommendation quality
# 1) Catalog coverage
catalog_size = videos['video_id'].nunique()
rec_videos = recs['video_id'].nunique()
catalog_coverage = rec_videos / catalog_size

# 2) Popularity distribution (by decile of popularity_score)
# popularity_score is already provided in recs (0-1). Divide into 10 deciles
recs['pop_decile'] = pd.qcut(recs['popularity_score'].rank(method='first'), 10, labels=False)
pop_distribution = recs.groupby('pop_decile').size().rename('count').reset_index()
pop_distribution['ratio'] = pop_distribution['count'] / len(recs)

# 3) Diversity (tag distribution/entropy, avg. unique tags per user Top10)
rec_with_tags = recs.merge(videos[['video_id','tags']], on='video_id', how='left')
# 3.1 Global tag distribution entropy
all_tags = [tag for tags in rec_with_tags['tags'] for tag in tags]
tag_counts = Counter(all_tags)
tag_probs = np.array(list(tag_counts.values()), dtype=float)
tag_probs = tag_probs / tag_probs.sum() if tag_probs.sum()>0 else tag_probs
global_tag_entropy = -(tag_probs * np.log(tag_probs + 1e-12)).sum()

# 3.2 User-level diversity: unique tags in Top10 & avg. Jaccard similarity (lower = more diverse)
def user_diversity(df):
    tags_list = df['tags'].tolist()
    uniq_tags = set()
    for ts in tags_list:
        uniq_tags |= set(ts)
    # Compute avg. Jaccard similarity (based on tags)
    sims = []
    for i in range(len(tags_list)):
        for j in range(i+1, len(tags_list)):
            a, b = set(tags_list[i]), set(tags_list[j])
            if len(a)==0 and len(b)==0:
                sims.append(1.0)
            else:
                sims.append(len(a & b) / (len(a | b)) if len(a | b)>0 else 0.0)
    avg_jaccard = float(np.mean(sims)) if sims else np.nan
    return pd.Series({'unique_tags': len(uniq_tags), 'avg_jaccard': avg_jaccard})

user_div = rec_with_tags.groupby('user_id').apply(user_diversity).reset_index()
avg_unique_tags = user_div['unique_tags'].mean()
avg_jaccard = user_div['avg_jaccard'].mean()

# B) Compare with user behavior (CTR / Watch rate)
# Align recommendation results with behavior logs by user_id, video_id
behav_click = behav[behav['action_type']=='click']
behav_watch = behav[behav['action_type']=='watch']

rec_key = recs[['user_id','video_id']].copy()
rec_key['recommended'] = 1

# CTR (recommendation → click)
rec_clicked = rec_key.merge(behav_click, on=['user_id','video_id'], how='left', indicator=True)
ctr = (rec_clicked['_merge']=='both').mean()

# Watch rate (recommendation → watch)
rec_watched = rec_key.merge(behav_watch, on=['user_id','video_id'], how='left', indicator=True)
watch_rate = (rec_watched['_merge']=='both').mean()

# User-level CTR / Watch rate
user_ctr = rec_clicked.groupby('user_id').apply(lambda x: (x['_merge']=='both').mean()).rename('ctr').reset_index()
user_watch = rec_watched.groupby('user_id').apply(lambda x: (x['_merge']=='both').mean()).rename('watch_rate').reset_index()
user_behav_metrics = user_ctr.merge(user_watch, on='user_id', how='outer').fillna(0.0)

# C) Performance across user cohorts
users_with_ratings = set(rates['user_id'].unique())
users_in_recs = set(recs['user_id'].unique())
cold_users = sorted(list(users_in_recs - users_with_ratings))
non_cold_users = sorted(list(users_in_recs & users_with_ratings))

# High-freq / low-freq: based on rating count quantile
user_rate_counts = rates.groupby('user_id').size().rename('rate_cnt')
if not user_rate_counts.empty:
    thr = user_rate_counts.quantile(0.67)
    high_users = set(user_rate_counts[user_rate_counts>=thr].index)
    low_users = set(user_rate_counts[user_rate_counts<thr].index)
else:
    high_users, low_users = set(), set()

# Assign CTR / Watch rate to cohorts
def cohort_metrics(user_list, name):
    if not user_list:
        return {'cohort': name, 'size': 0, 'ctr': np.nan, 'watch_rate': np.nan}
    df = user_behav_metrics[user_behav_metrics['user_id'].isin(user_list)]
    return {'cohort': name, 'size': int(len(df)), 'ctr': float(df['ctr'].mean()), 'watch_rate': float(df['watch_rate'].mean())}

cohorts = []
cohorts.append(cohort_metrics(cold_users, 'cold_users'))
cohorts.append(cohort_metrics(non_cold_users, 'non_cold_users'))
cohorts.append(cohort_metrics(list(high_users & set(non_cold_users)), 'high_freq_users'))
cohorts.append(cohort_metrics(list(low_users & set(non_cold_users)), 'low_freq_users'))
cohort_df = pd.DataFrame(cohorts)

# D) Cold-start performance: contribution share from Popularity / CF / Content
# Basis: in recs, check if each score > 0
def source_share(df):
    n = len(df)
    pop = (df['popularity_score']>0).mean() if n>0 else np.nan
    col = (df['collab_score']>0).mean() if n>0 else np.nan
    cnt = (df['content_score']>0).mean() if n>0 else np.nan
    return pd.Series({'pop>0_ratio':pop,'collab>0_ratio':col,'content>0_ratio':cnt})

cold_source = recs[recs['user_id'].isin(cold_users)].pipe(source_share)
noncold_source = recs[recs['user_id'].isin(non_cold_users)].pipe(source_share)

# Summary results
behavior_summary = {
    "catalog_size": int(catalog_size),
    "rec_video_coverage": float(catalog_coverage),
    "global_tag_entropy": float(global_tag_entropy),
    "avg_unique_tags_per_user_top10": float(avg_unique_tags),
    "avg_intra_list_jaccard": float(avg_jaccard),
    "overall_ctr": float(ctr),
    "overall_watch_rate": float(watch_rate),
    "cold_users_count": int(len(cold_users)),
    "non_cold_users_count": int(len(non_cold_users)),
}

behavior_summary, pop_distribution.head(), cohort_df, cold_source.to_dict(), noncold_source.to_dict()


  user_div = rec_with_tags.groupby('user_id').apply(user_diversity).reset_index()
  user_ctr = rec_clicked.groupby('user_id').apply(lambda x: (x['_merge']=='both').mean()).rename('ctr').reset_index()
  user_watch = rec_watched.groupby('user_id').apply(lambda x: (x['_merge']=='both').mean()).rename('watch_rate').reset_index()


({'catalog_size': 10000,
  'rec_video_coverage': 0.3974,
  'global_tag_entropy': 2.7296749196215493,
  'avg_unique_tags_per_user_top10': 9.779220779220779,
  'avg_intra_list_jaccard': 0.3879594479594479,
  'overall_ctr': 0.005394605394605395,
  'overall_watch_rate': 0.2519968051118211,
  'cold_users_count': 1,
  'non_cold_users_count': 1000},
    pop_decile  count  ratio
 0           0   1001    0.1
 1           1   1001    0.1
 2           2   1001    0.1
 3           3   1001    0.1
 4           4   1001    0.1,
             cohort  size       ctr  watch_rate
 0       cold_users     1  0.000000    0.200000
 1   non_cold_users  1000  0.005400    0.251991
 2  high_freq_users   336  0.006548    0.253869
 3   low_freq_users   664  0.004819    0.251041,
 {'pop>0_ratio': 1.0, 'collab>0_ratio': 0.0, 'content>0_ratio': 0.0},
 {'pop>0_ratio': 0.3931, 'collab>0_ratio': 0.4609, 'content>0_ratio': 0.5266})

In [15]:
# Output distribution tables and user-level behavior metrics for inspection
print("Popularity decile distribution in Top-10")
display(pop_distribution)
print("User-level CTR & WatchRate")
display(user_behav_metrics)
print("Cohort metrics")
display(cohort_df)

# Compute per-user Top-10 tag coverage/diversity
def per_user_tag_coverage(df):
    tag_set = set([t for tags in df['tags'] for t in tags])
    return pd.Series({'unique_tags_in_top10': len(tag_set)})

user_tag_cov = rec_with_tags.groupby('user_id').apply(per_user_tag_coverage).reset_index()
display("Per-user Top-10 unique tag counts", user_tag_cov)

# Verify cold-start user recommendation list source
if len(cold_users)>0:
    cold_example = cold_users[0]
    cold_list = recs[recs['user_id']==cold_example][['rank','video_id','popularity_score','collab_score','content_score']].copy()
    display(f"Cold-start user {cold_example} - score composition", cold_list)


Popularity decile distribution in Top-10


Unnamed: 0,pop_decile,count,ratio
0,0,1001,0.1
1,1,1001,0.1
2,2,1001,0.1
3,3,1001,0.1
4,4,1001,0.1
5,5,1001,0.1
6,6,1001,0.1
7,7,1001,0.1
8,8,1001,0.1
9,9,1001,0.1


User-level CTR & WatchRate


Unnamed: 0,user_id,ctr,watch_rate
0,U0000,0.0,0.2
1,U0001,0.0,0.3
2,U0002,0.0,0.3
3,U0003,0.0,0.3
4,U0004,0.0,0.3
...,...,...,...
996,U0996,0.0,0.2
997,U0997,0.0,0.3
998,U0998,0.0,0.3
999,U0999,0.0,0.3


Cohort metrics


Unnamed: 0,cohort,size,ctr,watch_rate
0,cold_users,1,0.0,0.2
1,non_cold_users,1000,0.0054,0.251991
2,high_freq_users,336,0.006548,0.253869
3,low_freq_users,664,0.004819,0.251041


  user_tag_cov = rec_with_tags.groupby('user_id').apply(per_user_tag_coverage).reset_index()


'Per-user Top-10 unique tag counts'

Unnamed: 0,user_id,unique_tags_in_top10
0,U0000,12
1,U0001,10
2,U0002,12
3,U0003,9
4,U0004,4
...,...,...
996,U0996,8
997,U0997,13
998,U0998,6
999,U0999,10


'Cold-start user U1000 - score composition'

Unnamed: 0,rank,video_id,popularity_score,collab_score,content_score
910,1,42505,1.0,0.0,0.0
911,2,20905,0.74,0.0,0.0
912,3,76102,0.66,0.0,0.0
913,4,82210,0.66,0.0,0.0
914,5,65804,0.62,0.0,0.0
915,6,4007,0.6,0.0,0.0
916,7,4304,0.58,0.0,0.0
917,8,35805,0.565714,0.0,0.0
918,9,27009,0.56,0.0,0.0
919,10,3006,0.54,0.0,0.0


# Decision Evaluation

In [16]:
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
from collections import Counter
from scipy import sparse
from sklearn.preprocessing import normalize

# ---------- Load data ----------
file_path = "Hybrid_Reco_Videos_Data.xlsx"
videos = pd.read_excel(file_path, sheet_name="videos' info")
rates = pd.read_excel(file_path, sheet_name="filtered_rate_data")
recs_hybrid = pd.read_excel(file_path, sheet_name="all_users_top10_recommendations")
behav = pd.read_excel(file_path, sheet_name="User behavior")

# ---------- Prepare lookups ----------
users = sorted(rates['user_id'].unique())
items = sorted(rates['video_id'].unique())
uid2idx = {u:i for i,u in enumerate(users)}
iid2idx = {i:j for j,i in enumerate(items)}
idx2uid = {i:u for u,i in uid2idx.items()}
idx2iid = {j:i for i,j in iid2idx.items()}

# user set to evaluate: use same users as hybrid recommendations (for fair compare)
eval_users = sorted(recs_hybrid['user_id'].unique())

# ---------- Build sparse matrices ----------
# ratings pivot (users x items)
rows = rates['user_id'].map(uid2idx)
cols = rates['video_id'].map(iid2idx)
data = rates['rating_score'].astype(float)
n_users, n_items = len(users), len(items)
R = sparse.csr_matrix((data, (rows, cols)), shape=(n_users, n_items))

# Helpers
videos_tags = videos.set_index('video_id')['specialities'].fillna('')
def split_tags(s):
    return [t.strip().lower() for t in str(s).split('|') if t.strip()]
videos_taglist = videos['video_id'].map(videos_tags).apply(split_tags)
vid2tags = dict(zip(videos['video_id'], videos_taglist))

behav_click = behav[behav['action_type']=='click'][['user_id','video_id']].copy()
behav_watch = behav[behav['action_type']=='watch'][['user_id','video_id']].copy()

# ---------- UserCF (cosine on users) ----------
# normalize user vectors
R_user_norm = normalize(R, norm='l2', axis=1, copy=True)

def usercf_topk_for_user(user_id, K=10):
    if user_id not in uid2idx:
        return pd.DataFrame(columns=['user_id','video_id','score'])
    uidx = uid2idx[user_id]
    uvec = R_user_norm[uidx]  # 1 x n_items
    # user similarities: all_users x 1 = R_user_norm * uvec.T
    sims = R_user_norm.dot(uvec.T).toarray().ravel()
    sims[uidx] = 0.0  # remove self
    # scores for items: sims^T * R  -> (1 x users) * (users x items)
    scores = sims @ R.toarray()  # convert small? n_users x n_items might be big; try sparse
    # To avoid memory blow, compute via sparse multiplication
    # Fallback: if toarray caused mem issue, compute with sparse
    # (we keep it; dataset seems small enough)
    # mask seen items
    seen = R[uidx].indices
    scores[seen] = -np.inf
    # top-K
    top_idx = np.argpartition(scores, -K)[-K:]
    top_idx = top_idx[np.argsort(scores[top_idx])][::-1]
    vids = [idx2iid[i] for i in top_idx]
    scs = scores[top_idx]
    return pd.DataFrame({'user_id': user_id, 'video_id': vids, 'score': scs})

# ---------- ItemCF (cosine on items) ----------
# item-user matrix
M = R.T.tocsr()  # n_items x n_users
M_norm = normalize(M, norm='l2', axis=1, copy=True)

def itemcf_topk_for_user(user_id, K=10):
    if user_id not in uid2idx:
        return pd.DataFrame(columns=['user_id','video_id','score'])
    uidx = uid2idx[user_id]
    # items rated by user and their ratings
    start, end = R.indptr[uidx], R.indptr[uidx+1]
    rated_item_idx = R.indices[start:end]
    rated_scores = R.data[start:end]
    if len(rated_item_idx)==0:
        return pd.DataFrame(columns=['user_id','video_id','score'])

    # accumulate similarity * rating via sparse ops
    # For each rated item j: v = M_norm[j] * M_norm.T (1 x items), score += r_uj * v
    # Efficiently: stack rows then do weighted sum
    rows_stack = M_norm[rated_item_idx]        # k x n_users
    # item-item sims = rows_stack * M_norm.T => k x n_items
    sims = rows_stack @ M_norm.T               # sparse x sparse -> sparse
    # weight by user ratings
    # convert ratings to column vector and do weighted sum over axis 0
    weights = sparse.csr_matrix(rated_scores.reshape(-1,1))  # k x 1
    scores_matrix = weights.T @ sims   # 1 x n_items
    scores = np.asarray(scores_matrix.todense()).ravel()

    # remove seen items
    scores[rated_item_idx] = -np.inf

    # top-K
    top_idx = np.argpartition(scores, -K)[-K:]
    top_idx = top_idx[np.argsort(scores[top_idx])][::-1]
    vids = [idx2iid[i] for i in top_idx]
    scs = scores[top_idx]
    return pd.DataFrame({'user_id': user_id, 'video_id': vids, 'score': scs})

# ---------- Batch generate Top-10 for eval_users ----------
def batch_recommend(generator_func, name, K=10, limit=None):
    out = []
    count = 0
    for uid in eval_users:
        df = generator_func(uid, K=K)
        if not df.empty:
            out.append(df[['user_id','video_id','score']].assign(rank=np.arange(1, len(df)+1)))
        count += 1
        if limit and count>=limit:
            break
    if out:
        res = pd.concat(out, ignore_index=True)
        res['model'] = name
    else:
        res = pd.DataFrame(columns=['user_id','video_id','score','rank','model'])
    return res

usercf_recs = batch_recommend(usercf_topk_for_user, "UserCF", K=10)
itemcf_recs = batch_recommend(itemcf_topk_for_user, "ItemCF", K=10)

# Map Hybrid to same schema
hybrid_small = recs_hybrid[['user_id','video_id','hybrid_score','rank']].copy()
hybrid_small = hybrid_small.rename(columns={'hybrid_score':'score'})
hybrid_small['model'] = 'Hybrid'

# ---------- Metrics ----------
def split_tags(s):
    return [t.strip().lower() for t in str(s).split('|') if t.strip()]

videos_tagmap = videos.set_index('video_id')['specialities'].fillna('').map(split_tags).to_dict()

def metrics_for(recs_df, behav_click, behav_watch, videos_tagmap, catalog_size):
    if recs_df.empty:
        return {
            "coverage": 0.0, "global_tag_entropy": np.nan,
            "avg_unique_tags_per_user": np.nan, "avg_intra_list_jaccard": np.nan,
            "ctr": np.nan, "watch_rate": np.nan, "rows": 0
        }
    # coverage
    coverage = recs_df['video_id'].nunique() / catalog_size

    # diversity
    recs_df = recs_df.copy()
    recs_df['tags'] = recs_df['video_id'].map(videos_tagmap).apply(lambda x: x if isinstance(x, list) else [])
    all_tags = [t for ts in recs_df['tags'] for t in ts]
    tag_counts = Counter(all_tags)
    probs = np.array(list(tag_counts.values()), dtype=float)
    probs = probs / probs.sum() if probs.sum()>0 else probs
    global_tag_entropy = float(-(probs * np.log(probs + 1e-12)).sum()) if probs.size>0 else np.nan

    # per-user diversity
    def user_div(df):
        tags_list = df['tags'].tolist()
        uniq = set()
        for ts in tags_list:
            uniq |= set(ts)
        # jaccard
        sims = []
        for i in range(len(tags_list)):
            for j in range(i+1, len(tags_list)):
                a, b = set(tags_list[i]), set(tags_list[j])
                if len(a)==0 and len(b)==0:
                    sims.append(1.0)
                else:
                    sims.append(len(a & b) / (len(a | b)) if len(a | b)>0 else 0.0)
        return pd.Series({'unique_tags': len(uniq), 'avg_jaccard': np.mean(sims) if sims else np.nan})
    user_div_df = recs_df.groupby('user_id').apply(user_div).reset_index()
    avg_unique_tags = float(user_div_df['unique_tags'].mean())
    avg_jaccard = float(user_div_df['avg_jaccard'].mean())

    # CTR / Watch
    key = recs_df[['user_id','video_id']].copy().drop_duplicates()
    clicked = key.merge(behav_click, on=['user_id','video_id'], how='left', indicator=True)
    watched = key.merge(behav_watch, on=['user_id','video_id'], how='left', indicator=True)
    ctr = (clicked['_merge']=='both').mean()
    watch_rate = (watched['_merge']=='both').mean()

    return {
        "coverage": float(coverage),
        "global_tag_entropy": global_tag_entropy,
        "avg_unique_tags_per_user": avg_unique_tags,
        "avg_intra_list_jaccard": avg_jaccard,
        "ctr": float(ctr),
        "watch_rate": float(watch_rate),
        "rows": int(len(recs_df))
    }

catalog_size = videos['video_id'].nunique()

m_hybrid = metrics_for(hybrid_small, behav_click, behav_watch, videos_tagmap, catalog_size)
m_usercf = metrics_for(usercf_recs, behav_click, behav_watch, videos_tagmap, catalog_size)
m_itemcf = metrics_for(itemcf_recs, behav_click, behav_watch, videos_tagmap, catalog_size)

pd.DataFrame([
    {"model":"Hybrid", **m_hybrid},
    {"model":"UserCF", **m_usercf},
    {"model":"ItemCF", **m_itemcf},
])


  user_div_df = recs_df.groupby('user_id').apply(user_div).reset_index()
  user_div_df = recs_df.groupby('user_id').apply(user_div).reset_index()
  user_div_df = recs_df.groupby('user_id').apply(user_div).reset_index()


Unnamed: 0,model,coverage,global_tag_entropy,avg_unique_tags_per_user,avg_intra_list_jaccard,ctr,watch_rate,rows
0,Hybrid,0.3974,2.729675,9.779221,0.387959,0.005395,0.251997,10010
1,UserCF,0.3445,2.771416,13.095,0.1,0.0061,0.0032,10000
2,ItemCF,0.3953,2.771716,13.079,0.101075,0.006,0.0014,10000
