# Recommender Sytem

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

We first load all the data we need for the system (check **data_analysis.ipynb** for more details).

In [None]:
DATASET_PATH = "dataset"

def user_feats():
    user_columns = [
        "user_id", "onehot_feat0", "onehot_feat1", "onehot_feat2", "onehot_feat3", "onehot_feat4", "onehot_feat5",
        "onehot_feat6", "onehot_feat7", "onehot_feat8", "onehot_feat9", "onehot_feat10", "onehot_feat11",
        "onehot_feat12", "onehot_feat13", "onehot_feat14", "onehot_feat15", "onehot_feat16", "onehot_feat17"
    ]

    users = pd.read_csv(f"{DATASET_PATH}/user_features.csv", usecols=user_columns)

    users = users.fillna(0).astype(float)
    users['user_id'] = users['user_id'].astype(int)
    return users

def item_feats():
    item_cols = [
        'video_id', 'music_id', 'video_tag_id', 'play_cnt',
        'play_duration', 'long_time_play_cnt', 'short_time_play_cnt',
        'play_progress', 'comment_stay_duration', 'like_cnt',
        'cancel_like_cnt', 'comment_cnt', 'delete_comment_cnt',
        'comment_like_cnt', 'follow_cnt', 'cancel_follow_cnt',
        'share_cnt', 'download_cnt'
    ]
    
    cols_to_aggregate = [
        'play_cnt', 'play_duration', 'long_time_play_cnt',
        'short_time_play_cnt', 'play_progress', 'comment_stay_duration',
        'like_cnt', 'cancel_like_cnt', 'comment_cnt',
        'delete_comment_cnt', 'comment_like_cnt', 'follow_cnt',
        'cancel_follow_cnt', 'share_cnt', 'download_cnt'
    ]
    
    items = pd.read_csv(f"{DATASET_PATH}/item_daily_features.csv", usecols=item_cols)
    

    agg_items = items.groupby('video_id').agg({
        col: 'mean' for col in cols_to_aggregate
    }).reset_index()
    
    last_items = items.sort_values('video_id').drop_duplicates(
        subset="video_id", keep="last"
    )[['video_id', 'music_id', 'video_tag_id']]
    
    result = pd.merge(last_items, agg_items, on='video_id')
    return result

def truth_data():
    y_cols = ['user_id', 'video_id', 'watch_ratio']
    y = pd.read_csv(f"{DATASET_PATH}/big_matrix.csv", usecols=y_cols)
    y['watch_ratio'] = y['watch_ratio'].clip(upper=5)
    return y

user_df = user_feats()
item_df = item_feats()
y_df = truth_data()

We then process the data to only keep the meaningful values in numpy arrays.

In [4]:
def load_data():
    merged_user = pd.merge(
        y_df, 
        user_df, 
        on='user_id', 
        how='left'
    )
    
    # Extract user features (all columns except the ones from y_df)
    user_cols = [col for col in merged_user.columns if col not in y_df.columns or col == 'user_id']

    # Merge item features
    final_df = pd.merge(
        merged_user,
        item_df,
        left_on='video_id',
        right_on='video_id',
        how='left'
    )
    
    # Get the result column
    y_data = final_df['watch_ratio'].values
    
    # Extract user features (excluding the ID columns and watch_ratio)
    user_data = final_df[user_cols].drop('user_id', axis=1).values
    
    # Extract item features (all columns from item_df except video_id)
    item_cols = [col for col in item_df.columns if col != 'video_id']
    item_data = final_df[item_cols].values
    
    return user_data, item_data, y_data

user_train, item_train, y_train = load_data()


Once the data is loaded, we can split it into **training** data and **testing** data.

In [5]:
split_size = 0.2
user_train, user_test, item_train, item_test, y_train, y_test  = train_test_split(user_train, item_train, y_train, test_size=split_size)

num_user_features = len(user_train[0])
num_item_features = len(item_train[0])

## Scaling the data

To scale the data, we will use two scaler:
- **StandardScaler** for the users and the items
- **MinMaxScaler** for the truth values

We first fit the scaler with the train data, then we transform both the train and test data with that scaler.

In [6]:
userScaler = StandardScaler()
user_train = userScaler.fit_transform(user_train)
user_test = userScaler.transform(user_test)

itemScaler = StandardScaler()
item_train = itemScaler.fit_transform(item_train)
item_test = itemScaler.transform(item_test)

yScaler = MinMaxScaler((-1, 1))
y_train = yScaler.fit_transform(y_train.reshape(-1, 1))
y_test = yScaler.transform(y_test.reshape(-1, 1))

## Recommender System: Content Based Filtering using Keras

For this project, I decided to use **Content Based Filtering** since we do not have relation between users. Moreover, this approach better handles the **cold start** for new users and new items.

I used a neural network with Keras as recommender system, as seen in 8th course.

Note: if the model was already trained, you can just load it using the following command.

In [6]:
# Load the model.
model = keras.models.load_model('mymodel.keras')

2025-05-17 19:16:10.873081: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-05-17 19:16:10.905407: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2211] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [7]:
num_outputs = 48
tf.random.set_seed(1)
user_NN = tf.keras.models.Sequential(
    [
        tf.keras.layers.Dense(256, activation='relu'),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dense(96, activation='relu'),
        tf.keras.layers.Dense(num_outputs)
    ]
)

item_NN = tf.keras.models.Sequential(
    [
        tf.keras.layers.Dense(256, activation='relu'),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dense(96, activation='relu'),
        tf.keras.layers.Dense(num_outputs)
    ]
)

# create the user input and point to the base network
input_user = tf.keras.layers.Input(shape=(num_user_features))
vu = user_NN(input_user)
vu = tf.linalg.l2_normalize(vu, axis=1)

# create the item input and point to the base network
input_item = tf.keras.layers.Input(shape=(num_item_features))
vm = item_NN(input_item)
vm = tf.linalg.l2_normalize(vm, axis=1)

# compute the dot product of the two vectors vu and vm
output = tf.keras.layers.Dot(axes=1)([vu, vm])

# specify the inputs and output of the model
model = tf.keras.Model([input_user, input_item], output)

2025-05-17 20:17:01.995634: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-05-17 20:17:02.157425: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2211] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


This neural network will perform **regression** to give a watch ratio to all user/video pair. For such models, the best metrics are:
- **Mean Absolute Error (MAE)**: express how much the predictions are off
- **Root Mean Squared Error (RMSE)**: express how much the predictions are off, but penalizes larger errors more

The goal is to minimize both of these metrics.

In [8]:
cost_fn = tf.keras.losses.MeanSquaredError()
opt = keras.optimizers.Adam(learning_rate=0.01)
model.compile(optimizer=opt, loss=cost_fn, metrics=[keras.metrics.MeanAbsoluteError(), keras.metrics.RootMeanSquaredError()])

model.fit(
    [user_train, item_train],
    y_train,
    epochs=2,
    batch_size=512,
    validation_data=([user_test, item_test], y_test)
)

# Save the model.
model.save("mymodel.keras")

Epoch 1/2
Epoch 2/2


In [11]:
loss, rae, rmse = model.evaluate([user_test, item_test], y_test)
print("Testing loss:", loss)
print("RAE:", rae)
print("RMSE:", rmse)

Testing loss: 0.08589141070842743
RAE: 0.1874883621931076
RMSE: 0.293072372674942


The testing loss is comparable to the training loss indicating the model has not substantially overfit the training data.

Moreover, the RAE and RMSE are both low, meaning that the system predicted pretty well the watch_ratio for each user/video pair.

## Predict videos to an user

For this task, we need as input parameter the id of our user and the number of videos we want to recommend. This function will then print to the terminal the top video ids recommended to our user.

In [9]:
# Choose a random user
user_id = 1234

def predict_existing_user(user_id, num_items=None, get_score=True):
    # Extract the user feature vector excluding the user_id column and multiply it by the number of items
    user_vecs = user_df.iloc[user_id, 1:].values.reshape(1, -1)
    user_vecs = np.tile(user_vecs, (len(item_df), 1))

    # Extract the items feature vector excluding the video_id column
    item_vecs = item_df.iloc[:, 1:].to_numpy()

    # Scale the data using our previous scaler
    user_scaled = userScaler.transform(user_vecs)
    item_scaled = itemScaler.transform(item_vecs)

    # Predict the videos
    y_p = model.predict([user_scaled, item_scaled])
    y_pu = yScaler.inverse_transform(y_p)

    # Sort them to get the most recommended videos
    sorted_index = (
        np.argsort(-y_pu, axis=0).reshape(-1).tolist()
    )

    sorted_items = item_vecs[sorted_index]
    if num_items != None:
        sorted_items = sorted_items[:num_items]
    video_ids = [np.where((item_vecs == el).all(axis=1))[0][0] for el in sorted_items]

    if get_score:
        sorted_ypu = y_pu[sorted_index]
        if num_items != None:
            sorted_ypu = sorted_ypu[:num_items]
        return video_ids, sorted_ypu
    else:
        return video_ids

video_ids, sorted_ypu = predict_existing_user(user_id=user_id, num_items=10)

# Display each video_id with its watch ratio
for i in range(len(video_ids)):
    print("Predict video id", video_ids[i], "with watch ratio", sorted_ypu[i][0])

Predict video id 4546 with watch ratio 2.1708658
Predict video id 314 with watch ratio 2.1316507
Predict video id 908 with watch ratio 2.0876496
Predict video id 10318 with watch ratio 2.0545914
Predict video id 5162 with watch ratio 2.053305
Predict video id 9314 with watch ratio 2.01308
Predict video id 10720 with watch ratio 1.9907583
Predict video id 1946 with watch ratio 1.9508303
Predict video id 498 with watch ratio 1.9384815
Predict video id 1305 with watch ratio 1.92878


We now can evaluate how well does the recommendation system perform. For this, we will calculate the following metrics:

### 🔹 Precision@K and Recall@K

$$
\text{Precision@K} = \frac{\text{number of relevant recommended items in top-K}}{K}
$$

$$
\text{Recall@K} = \frac{\text{number of relevant recommended items in top-K}}{\text{total number of relevant items}}
$$

### 🔹 Normalised discount cumulative gain (NDCG)

$$
\text{NDCG@K} = \frac{\text{DCG@K}}{\text{IDCG@K}}
$$

where, 

$$
\text{DCG@K} = \sum_{i=1}^{K} \frac{2^{rel_i} - 1}{\log_2(i+1)}
$$
where, $rel_i$ is the ground truth relevance score of the $i^{th}$ item in the top-K list.

In [10]:
def precision_at_k(ranked_items, ground_truth, k):
    hits = sum([1 for item in ranked_items[:k] if item in ground_truth])
    return hits / k

def recall_at_k(ranked_items, ground_truth, k):
    hits = sum([1 for item in ranked_items[:k] if item in ground_truth])
    return hits / len(ground_truth)

def ndcg_at_k(ranked_items, ground_truth, k):
    dcg = 0.0
    for i, item in enumerate(ranked_items[:k]):
        if item in ground_truth:
            dcg += 1.0 / np.log2(i + 2)
    idcg = sum([1.0 / np.log2(i + 2) for i in range(min(len(ground_truth), k))])
    return dcg / idcg if idcg > 0 else 0.0

We suppose that a video is relevant to a user if the watch_ratio is 1 or more.

In [18]:
# Another random user
user_id2 = 76

relevant_user = y_df[y_df['user_id'] == user_id2]
relevant_user = relevant_user[relevant_user['watch_ratio'] >= 1]
relevant_videos = list(relevant_user['video_id'])

recommended_videos = predict_existing_user(user_id2, get_score=False)

for i in range(4):
    i_pow = 10 ** i
    print(f"Precision@{i_pow} for user {user_id2} is", precision_at_k(recommended_videos, relevant_videos, i_pow))
    print(f"Recall@{i_pow} for user {user_id2} is", recall_at_k(recommended_videos, relevant_videos, i_pow))
    print(f"NDCG@{i_pow} for user {user_id2} is", ndcg_at_k(recommended_videos, relevant_videos, i_pow))
    print()

Precision@1 for user 76 is 1.0
Recall@1 for user 76 is 0.0008920606601248885
NDCG@1 for user 76 is 1.0

Precision@10 for user 76 is 0.3
Recall@10 for user 76 is 0.0026761819803746653
NDCG@10 for user 76 is 0.42257499837058643

Precision@100 for user 76 is 0.31
Recall@100 for user 76 is 0.027653880463871544
NDCG@100 for user 76 is 0.3338218276600255

Precision@1000 for user 76 is 0.136
Recall@1000 for user 76 is 0.12132024977698483
NDCG@1000 for user 76 is 0.15556792354785912



Precision@1 is 1 here, meaning that I have no problem with the cold start. It is however not the case for all users.

Globally those metrics are ok, and users get 1 relevant video each 3-4 recommended. The recommender system seem to not be perfect at ranking items, but the issue is also that there are a lot of items and each user has not interacted with many of them.