# Import necessary libraries

In [1]:
from tensorflow.keras.models import load_model
import numpy as np
import pandas as pd
from sklearn.metrics import ndcg_score
from tqdm import tqdm

2025-05-17 22:09:57.856061: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Prepare environment

In [2]:
FEATURES_PATH = "data/features/"

# Load Data

In [3]:
model = load_model(FEATURES_PATH + "ai_model.keras", safe_mode=False)
small_matrix_final = pd.read_parquet(FEATURES_PATH + "small_matrix_scaled.parquet")

# Check Data

In [4]:
small_matrix_final

Unnamed: 0,user_id,video_id,watch_ratio,video_duration,user_watch_ratio_mean,video_watch_ratio_mean,user_feat_watch_ratio_mean,user_category_watch_ratio_mean
0,14,148,0.722103,-0.787354,0.558826,1.061715,0.735789,1.662743
1,14,183,1.907377,-0.779625,0.558826,1.285246,-0.122518,-0.073836
2,14,3649,2.063311,0.336768,0.558826,-0.196588,-0.765165,-0.073836
3,14,5262,0.566388,-0.356206,0.558826,0.452844,-0.270903,1.140845
4,14,8234,0.418364,0.367916,0.558826,-0.221446,-0.175920,0.686910
...,...,...,...,...,...,...,...,...
3830597,7162,9177,0.142857,6.504918,0.716789,-1.796737,1.274611,1.136875
3830598,7162,4987,1.234848,-0.295550,0.716789,0.446340,-0.002456,-0.230581
3830599,7162,7988,1.024412,9.341920,0.716789,-1.840603,1.274611,1.136875
3830600,7162,6533,0.273750,-0.334660,0.716789,0.344813,-0.002456,-0.230581


# Recommender system

### Define function to get recomendations of a given user

In [5]:
def get_user_recommendation_data(user: int):
    preds = model.predict(small_matrix_final[small_matrix_final["user_id"] == user][[col for col in small_matrix_final.columns if col not in ["watch_ratio", "user_id", "video_id"]]], batch_size=2048, verbose=None).flatten()
    user_mat = small_matrix_final[small_matrix_final["user_id"] == user].reset_index(drop=True)
    to_recommend_index = np.flip(preds.argsort())
    to_recommend = user_mat["video_id"][to_recommend_index].values

    ground_truth_index = np.flip(user_mat["watch_ratio"].values.argsort())
    ground_truth = user_mat["video_id"][ground_truth_index].values

    return to_recommend, ground_truth

### Predict top 10 videos to recommend to some user

In [6]:
sample_user = 946
preds, ground_truth = get_user_recommendation_data(sample_user)
print(f"predictions: {preds[:10]}")
print(f"ground_truth: {ground_truth[:10]}")

predictions: [4040  600  314 2263 8298 3713 3686 9013 6868 2130]
ground_truth: [6282 9178 8248 5638 7799 9683 6970 1314 7253  557]


### Calculate ndcg score for this user

In [7]:
ndcg_score([ground_truth], [preds], k=10)

np.float64(0.4684393248800202)

# Evaluating metrics

### Evaluate mean NDCG metric on all users
We are using NDCG@K because it is a metrics that looks into the ranking of the predictions

In [8]:
ndcg_5_list = []
for user_id in tqdm(small_matrix_final["user_id"].unique()):
    user_df = small_matrix_final[small_matrix_final["user_id"] == user_id]
    recommendations, ground_truth = get_user_recommendation_data(user_id)
    
    if len(ground_truth) >= 5:
        ndcg_5_list.append(ndcg_score([ground_truth], [recommendations], k=10))

avg_ndcg_5 = np.mean(ndcg_5_list)
print(f"Average NDCG@5 over users: {avg_ndcg_5:.4f}")

  0%|          | 0/1354 [00:00<?, ?it/s]

100%|██████████| 1354/1354 [01:10<00:00, 19.17it/s]

Average NDCG@5 over users: 0.4717





### Create function to predict recommendations using the video's average watch_ratio as baseline

In [9]:
def get_user_recommendation_data_baseline(user: int):
    preds = np.array(small_matrix_final[small_matrix_final["user_id"] == user]["video_watch_ratio_mean"].to_list())
    user_mat = small_matrix_final[small_matrix_final["user_id"] == user].reset_index(drop=True)
    to_recommend_index = np.flip(preds.argsort())
    to_recommend = user_mat["video_id"][to_recommend_index].values

    ground_truth_index = np.flip(user_mat["watch_ratio"].values.argsort())
    ground_truth = user_mat["video_id"][ground_truth_index].values

    return to_recommend, ground_truth

### Check the NDCG score using our baseline

In [10]:
ndcg_5_list = []
for user_id in tqdm(small_matrix_final["user_id"].unique()):
    user_df = small_matrix_final[small_matrix_final["user_id"] == user_id]
    recommendations, ground_truth = get_user_recommendation_data_baseline(user_id)
    
    if len(ground_truth) >= 5:
        ndcg_5_list.append(ndcg_score([ground_truth], [recommendations], k=10))

avg_ndcg_5 = np.mean(ndcg_5_list)
print(f"Average NDCG@5 over users: {avg_ndcg_5:.4f}")

  0%|          | 0/1354 [00:00<?, ?it/s]

100%|██████████| 1354/1354 [00:10<00:00, 130.02it/s]

Average NDCG@5 over users: 0.4751





As we can see, our ai model is approximately on the same level as our baseline, which means that it didn't succeed in learning complex correlations between the different data.\
Unfortunately, because of my computer's limitations, I wasn't able to include all desired features into this model. Maybe further research and data analysis could have lead to a more optimized approach.

# Looking into other acceptable metrics

As we saw before, our model isn't really efficient.\
However, the ranking of every single video isn't really the most important part of a video recommender system such as Kuaishou or Tiktok.\
In fact, the important part is to be able to recommend good videos, and avoid bad ones.

Having this in mind, we can group the videos with similar watch ratios and ignore the exact ranks between videos of similar rank.\
To achieve this, we will round the predicted watch ratio, leaving us with a watch ratio belonging to this set: {0, 1, 2, 3, 4, 5}

While I know that this has limitations (like the fact that a watch ratio of 1.49 and 1.51 will be considered very differently) and that this method doesn't improve our ai model, it is still an acceptable metric.

In the following code, the videos will be sorted by rounded watch ratio and then by their video id. This last part is to be able to evaluate its ranking with our ground_truth.\
In reality, we could recommend any video within the best watch ratio predicted

In [11]:
def get_user_recommendation_data_rounded(user: int):
    user_mat = small_matrix_final[small_matrix_final["user_id"] == user].reset_index(drop=True)
    preds = model.predict(user_mat[[col for col in small_matrix_final.columns if col not in ["watch_ratio", "user_id", "video_id"]]], batch_size=2048, verbose=None).flatten()

    user_mat["preds_rounded"] = pd.Series(np.around(preds))
    user_mat["watch_ratio_rounded"] = pd.Series(np.around(user_mat["watch_ratio"].to_numpy()))

    to_recommend = user_mat.sort_values(["preds_rounded", "video_id"])["video_id"]
    ground_truth = user_mat.sort_values(["watch_ratio_rounded", "video_id"])["video_id"]

    return to_recommend, ground_truth

### Evaluate mean NDCG metric on all users

In [12]:
ndcg_5_list = []
for user_id in tqdm(small_matrix_final["user_id"].unique()):
    user_df = small_matrix_final[small_matrix_final["user_id"] == user_id]
    recommendations, ground_truth = get_user_recommendation_data_rounded(user_id)
    
    if len(ground_truth) >= 5:
        ndcg_5_list.append(ndcg_score([ground_truth], [recommendations], k=10))

avg_ndcg_5 = np.mean(ndcg_5_list)
print(f"Average NDCG@5 over users: {avg_ndcg_5:.4f}")

  0%|          | 0/1354 [00:00<?, ?it/s]

100%|██████████| 1354/1354 [01:11<00:00, 18.90it/s]

Average NDCG@5 over users: 0.6127





This improved result tells us that our ai model, while not being able to rank every single video efficiently enough, is able to somewhat tell apart good videos from bad videos.

In [13]:
def get_user_recommendation_data_baseline_bis(user: int):
    user_mat = small_matrix_final[small_matrix_final["user_id"] == user].reset_index(drop=True)
    preds = np.array(user_mat["video_watch_ratio_mean"].to_list())
    
    user_mat["preds_rounded"] = pd.Series(np.around(preds))
    user_mat["watch_ratio_rounded"] = pd.Series(np.around(user_mat["watch_ratio"].to_numpy()))
    
    to_recommend = user_mat.sort_values(["preds_rounded", "video_id"])["video_id"]
    ground_truth = user_mat.sort_values(["watch_ratio_rounded", "video_id"])["video_id"]

    return to_recommend, ground_truth

### Check the NDCG score using our baseline

In [14]:
ndcg_5_list = []
for user_id in tqdm(small_matrix_final["user_id"].unique()):
    user_df = small_matrix_final[small_matrix_final["user_id"] == user_id]
    recommendations, ground_truth = get_user_recommendation_data_baseline_bis(user_id)
    
    if len(ground_truth) >= 5:
        ndcg_5_list.append(ndcg_score([ground_truth], [recommendations], k=10))

avg_ndcg_5 = np.mean(ndcg_5_list)
print(f"Average NDCG@5 over users: {avg_ndcg_5:.4f}")

  0%|          | 0/1354 [00:00<?, ?it/s]

100%|██████████| 1354/1354 [00:10<00:00, 131.45it/s]

Average NDCG@5 over users: 0.6073





Seing as our baseline (using the video's global average watch_ratio) is still quite close to our model's metrics, we can conclude that ou ai model wasn't able to get a good grasp of a user's specific feature and is only based on the video's global popularity.