# Import necessary libraries

In [1]:
from tensorflow.keras.models import load_model
import numpy as np
import pandas as pd
from sklearn.metrics import ndcg_score
from tqdm import tqdm

2025-05-15 03:09:30.697383: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Prepare environment

In [2]:
FEATURES_PATH = "data/features/"

# Load Data

In [3]:
model = load_model(FEATURES_PATH + "ai_model.keras", safe_mode=False)
small_matrix_final = pd.read_parquet(FEATURES_PATH + "small_matrix_final.parquet")
small_matrix_final = small_matrix_final.drop(columns=["like"])

In [4]:
small_matrix_final

Unnamed: 0,user_id,video_id,watch_ratio,video_length,user_watch_ratio_mean,video_watch_ratio_mean,user_feat_watch_ratio_mean,user_category_watch_ratio_mean
0,14,148,0.722103,-1,1.051964,1.274520,1.169903,1.515494
1,14,183,1.907377,-1,1.051964,1.318898,0.936984,0.962073
2,14,3649,2.063311,0,1.051964,0.784641,0.715448,0.962073
3,14,5262,0.566388,0,1.051964,0.992569,0.840792,1.245391
4,14,8234,0.418364,0,1.051964,0.781283,0.921278,1.204511
...,...,...,...,...,...,...,...,...
3857008,7162,9177,0.142857,1,1.106798,0.219228,1.347908,1.347908
3857009,7162,4987,1.234848,0,1.106798,1.109515,0.972297,0.912121
3857010,7162,7988,1.024412,1,1.106798,0.202936,1.347908,1.347908
3857011,7162,6533,0.273750,0,1.106798,1.062392,0.972297,0.912121


# Recommender system

### Define function to get recomendations of a given user

In [5]:
def get_user_recommendation_data(user: int):
    preds = model.predict(small_matrix_final[small_matrix_final["user_id"] == user][[col for col in small_matrix_final.columns if col not in ["watch_ratio", "user_id", "video_id"]]], batch_size=2048, verbose=None).flatten()
    user_mat = small_matrix_final[small_matrix_final["user_id"] == user].reset_index(drop=True)
    to_recommend_index = np.flip(preds.argsort())
    to_recommend = user_mat["video_id"][to_recommend_index].values

    ground_truth_index = np.flip(user_mat["watch_ratio"].values.argsort())
    ground_truth = user_mat["video_id"][ground_truth_index].values

    return to_recommend, ground_truth

### Predict top 10 videos to recommend to some user

In [6]:
sample_user = 6190
preds, ground_truth = get_user_recommendation_data(sample_user)
print(f"predictions: {preds[:10]}")
print(f"ground_truth: {ground_truth[:10]}")

predictions: [ 4040   600  8366  2263  2130  9821  1305  5525 10206  6868]
ground_truth: [ 2607  5222  7079  2436  1288  7181  4646 10091  9569  1182]


# Evaluating metrics

### Define metrics function

In [7]:
def precision_at_k(y_true, y_pred, k):
    y_true_k = y_true[:k]
    y_pred_k = y_pred[:k]
    return len(set(y_true_k) & set(y_pred_k)) / k

def recall_at_k(y_true, y_pred, k):
    y_pred_k = y_pred[:k]
    return len(set(y_true) & set(y_pred_k)) / len(y_true) if y_true is not None else 0

def average_precision_at_k_continuous(actual_scores, predicted_scores, k, top_n_relevant=5):
    """
    Compute AP@k based on ranking from predicted scores.
    Top-N videos with highest actual scores are considered relevant.
    """
    if len(actual_scores) == 0:
        return 0.0

    # Get the indices of top-N most relevant items based on actual scores
    actual_top_indices = np.argsort(actual_scores)[::-1][:top_n_relevant]
    relevant_set = set(actual_top_indices)

    # Get the indices of top-k predicted scores
    pred_top_indices = np.argsort(predicted_scores)[::-1][:k]

    hits = 0
    score = 0.0
    for i, idx in enumerate(pred_top_indices):
        if idx in relevant_set:
            hits += 1
            score += hits / (i + 1)

    return score / min(len(relevant_set), k)

### Evaluate mean NDCG metric on all users
We are using NDCG because it is a metrics that look into the ranking of the predictions

In [8]:
from tqdm import tqdm
ndcg_5_list = []
for user_id in tqdm(small_matrix_final["user_id"].unique()):
    user_df = small_matrix_final[small_matrix_final["user_id"] == user_id]
    recommendations, ground_truth = get_user_recommendation_data(user_id)
    
    if len(ground_truth) >= 5:
        ndcg_5_list.append(ndcg_score([ground_truth], [recommendations], k=5))

avg_ndcg_5 = np.mean(ndcg_5_list)
print(f"Average NDCG@5 over users: {avg_ndcg_5:.4f}")

100%|██████████| 1354/1354 [01:12<00:00, 18.65it/s]

Average NDCG@5 over users: 0.4741





In [9]:
small_matrix_final.shape

(3857013, 8)

In [10]:
def get_user_recommendation_data_baseline(user: int):
    preds = np.array(small_matrix_final[small_matrix_final["user_id"] == user]["video_watch_ratio_mean"].to_list())
    user_mat = small_matrix_final[small_matrix_final["user_id"] == user].reset_index(drop=True)
    to_recommend_index = np.flip(preds.argsort())
    to_recommend = user_mat["video_id"][to_recommend_index].values

    ground_truth_index = np.flip(user_mat["watch_ratio"].values.argsort())
    ground_truth = user_mat["video_id"][ground_truth_index].values

    return to_recommend, ground_truth

In [11]:
ndcg_5_list = []
for user_id in tqdm(small_matrix_final["user_id"].unique()):
    user_df = small_matrix_final[small_matrix_final["user_id"] == user_id]
    recommendations, ground_truth = get_user_recommendation_data_baseline(user_id)
    
    if len(ground_truth) >= 5:
        ndcg_5_list.append(ndcg_score([ground_truth], [recommendations], k=5))

avg_ndcg_5 = np.mean(ndcg_5_list)
print(f"Average NDCG@5 over users: {avg_ndcg_5:.4f}")

  0%|          | 0/1354 [00:00<?, ?it/s]

100%|██████████| 1354/1354 [00:11<00:00, 115.54it/s]

Average NDCG@5 over users: 0.4728





In [13]:
small_matrix_final["video_watch_ratio_mean"].to_list()

[1.274520001801421,
 1.3188978255149304,
 0.7846410164013661,
 0.9925691250912209,
 0.7812832148726736,
 0.6529496043378484,
 0.977059994449473,
 0.19273905683389728,
 1.0642026359953094,
 1.438426002360062,
 1.2356270633411266,
 0.7842695017623357,
 1.8630375603577478,
 0.8138044118001272,
 1.2268634145210533,
 1.1292787075834294,
 0.7886635664621502,
 0.871354848766887,
 0.578444768062364,
 1.295012045147197,
 1.1209628707726949,
 0.6771345191391295,
 0.767749152707678,
 0.9414299375302152,
 1.0824613052047911,
 0.7992592534515321,
 0.9110211770742874,
 0.8979425450361856,
 1.5284373243903737,
 1.1277948570863334,
 0.9330323736919851,
 0.8362922380255178,
 1.278597078080528,
 0.784182884964128,
 1.0623292399172506,
 1.4419973449919148,
 1.0328285059363913,
 0.8464374389446042,
 0.7772501143641354,
 0.1629152522650488,
 1.365771302917687,
 0.16728299694208248,
 0.2050695772473206,
 0.6873984154666016,
 0.8140883991513244,
 1.3073943313542549,
 1.1204418028727092,
 0.8145459957518592,
