# Load data

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from ast import literal_eval
from sklearn.preprocessing import MultiLabelBinarizer

# Load the data
big_df    = pd.read_csv('data/big_matrix.csv')
small_df  = pd.read_csv('data/small_matrix.csv')
item_cat  = pd.read_csv('data/item_categories.csv')
user_feat = pd.read_csv('data/user_features.csv')
item_daily_features = pd.read_csv('data/item_daily_features.csv')


## Feature Engineering

In [3]:
from sklearn.linear_model import LinearRegression
pop = item_daily_features.groupby("video_id")[[
    'show_cnt', 'play_cnt', 'like_user_num', 'share_cnt', 'comment_cnt',
]].sum()

video_watch_ratio = big_df.groupby("video_id")['watch_ratio'].mean()
pop = pop.join(video_watch_ratio, on="video_id",how="right")
X = pop[['show_cnt', 'play_cnt', 'like_user_num', 'share_cnt', 'comment_cnt']].fillna(0)
y = pop['watch_ratio'].fillna(0)

model = LinearRegression()
model.fit(X, y)

coeffs = pd.Series(model.coef_, index=X.columns)
print("Learned weights:")
print(coeffs.sort_values(ascending=False))
pop['popularity_score'] = model.predict(X)
pop = pop['popularity_score']
pop_df = pop.reset_index()
pop_df.columns = ['video_id', 'popularity_score']
pop_df = pop_df.dropna(subset=['video_id'])
pop_df['video_id'] = pop_df['video_id'].astype('int32')
print(pop_df['popularity_score'].describe())

Learned weights:
play_cnt         3.653623e-08
show_cnt        -5.446159e-08
like_user_num   -9.252525e-08
comment_cnt     -1.097558e-07
share_cnt       -2.289058e-06
dtype: float64
count    10728.000000
mean         1.203716
std          0.175478
min         -2.678407
25%          1.235405
50%          1.269730
75%          1.272172
max          1.276259
Name: popularity_score, dtype: float64


In [None]:
# Encode tags (0-30) → multi-hot vector
item_cat['feat'] = item_cat['feat'].apply(literal_eval)
mlb = MultiLabelBinarizer(classes=list(range(31)))
tag_matrix = mlb.fit_transform(item_cat['feat'])
tag_df = pd.DataFrame(tag_matrix, columns=[f'tag_{i}' for i in mlb.classes_])
item_cat = pd.concat([item_cat[['video_id']], tag_df], axis=1)

# Encode any string user features to integer codes
for col in user_feat.columns:
    if col != 'user_id' and user_feat[col].dtype == 'object':
        user_feat[col], _ = pd.factorize(user_feat[col])

# Merge: train on big_matrix, test on small_matrix
df_train = big_df.merge(item_cat, on='video_id').merge(user_feat, on='user_id').merge(pop_df, on='video_id')
df_test = small_df.merge(item_cat, on='video_id').merge(user_feat, on='user_id').merge(pop_df, on='video_id')

# Number of videos watched by each user
user_inter_counts = df_train.groupby("user_id")["video_id"].count()
user_inter_counts.name = "User Interactions"

# Number of times each video has been watched
video_inter_counts = df_train.groupby("video_id")["user_id"].count()
video_inter_counts.name = "Video Interactions"
big_watch_ratio = df_train.watch_ratio

df_train = df_train[
    (df_train['user_id'].isin(user_inter_counts[user_inter_counts >= 248].index)) &
    (df_train['video_id'].isin(video_inter_counts[video_inter_counts >= 1].index)) &
    (df_train['watch_ratio'] > 0) &
    (df_train['watch_ratio'] <= big_watch_ratio.quantile(0.75))
]

df_train['watch_ratio_log'] = np.log1p(df_train['watch_ratio'])
df_test['watch_ratio_log'] = np.log1p(df_test['watch_ratio'])


# Build tag_multi_hot list column and cast types
tag_cols = [f'tag_{i}' for i in range(31)]
for df in (df_train, df_test):
    df['tag_multi_hot'] = df[tag_cols].values.tolist()
    df.fillna(0, inplace=True)
    df['user_id'] = df['user_id'].astype('int32')
    df['video_id'] = df['video_id'].astype('int32')
    for col in user_feat.columns:
        if col != 'user_id':
            df[col] = df[col].astype('int32')

# Modèles

In [14]:
global_mean = np.log1p(df_train['watch_ratio'].mean())
class GlobalMeanBaseline:
    def __init__(self):
        self.global_mean = global_mean

    def predict(self, inputs, verbose=0):
        n = len(inputs['user_id'])
        return np.full(shape=(n,), fill_value=self.global_mean)

In [5]:
class PopularityBaselineModel:
    def __init__(self, test_df, k=10):
        self.k = k
        self.popularity_map = (
            test_df.groupby('video_id')['popularity_score']
            .mean()
            .to_dict()
        )

    def predict(self, inputs, verbose=0):
        video_ids = inputs['video_id']
        video_scores = np.array([self.popularity_map.get(vid, 0.0) for vid in video_ids])
        return video_scores

In [None]:
from collections import Counter
from sklearn.metrics import ndcg_score
import numpy as np

def evaluate_model(model, df_test, df_train, user_feat, k=10):
    item_popularity = Counter(df_train['video_id'])
    total_items = len(item_popularity)

    all_user_ids = df_test['user_id'].unique()
    mae_list = []
    rmse_list = []
    ndcg_list = []
    novelty_list = []
    tag_diversity_list = []
    avg_popularity_list = []

    for user_id in all_user_ids:
        user_df = df_test[df_test['user_id'] == user_id]
        if user_df.empty:
            continue

        true_watch_ratios = {row['video_id']: row['watch_ratio'] for _, row in user_df.iterrows()}
        if all(v == 0 for v in true_watch_ratios.values()):
            continue

        user_inputs = {
            'user_id': np.full(len(user_df), user_id, dtype='int32'),
            'video_id': user_df['video_id'].values.astype('int32'),
            'tag_multi_hot': np.array(user_df['tag_multi_hot'].tolist(), dtype='float32'),
        }

        for col in user_feat.columns:
            if col != 'user_id':
                val = user_df[col].values.astype('int32')
                user_inputs[col] = val

        preds = np.expm1(model.predict(user_inputs, verbose=0)).flatten()

        ranked_indices = np.argsort(-preds)
        ranked_items = user_df['video_id'].values[ranked_indices]
        top_k_items = ranked_items[:k]
        top_k_preds = preds[ranked_indices][:k]

        # Get the true watch ratios for the top_k items
        top_k_true = np.array([true_watch_ratios.get(item, 0.0) for item in top_k_items])

        # MAE@k et RMSE@k
        abs_errors = np.abs(top_k_true - top_k_preds)
        sq_errors = (top_k_true - top_k_preds) ** 2

        mae = np.mean(abs_errors)
        rmse = np.sqrt(np.mean(sq_errors))

        mae_list.append(mae)
        rmse_list.append(rmse)

        # NDCG@k
        relevance = [true_watch_ratios.get(item, 0.0) for item in ranked_items[:k]]
        ndcg = ndcg_score([relevance], [top_k_preds])
        ndcg_list.append(ndcg)

        # Novelty@k
        novelty = -np.mean([np.log2(item_popularity[item] / total_items + 1e-10) for item in top_k_items])
        novelty_list.append(novelty)

        # Tag diversity@k
        tag_vectors = np.array([user_df[user_df['video_id'] == item]['tag_multi_hot'].values[0] for item in top_k_items])
        tag_union = np.sum(np.any(tag_vectors, axis=0))
        tag_diversity = tag_union / tag_vectors.shape[1]
        tag_diversity_list.append(tag_diversity)

        # Popularity@k
        top_k_popularity = user_df.iloc[ranked_indices[:k]]['popularity_score'].values
        avg_popularity = np.mean(top_k_popularity)
        avg_popularity_list.append(avg_popularity)

    print(f'MAE@{k}: {np.mean(mae_list):.4f}')
    print(f'RMSE@{k}: {np.mean(rmse_list):.4f}')
    print(f'NDCG@{k}: {np.mean(ndcg_list):.4f}')
    print(f'Novelty@{k}: {np.mean(novelty_list):.4f}')
    print(f'Tag Diversity@{k}: {np.mean(tag_diversity_list):.4f}')
    print(f'Average Popularity@{k}: {np.mean(avg_popularity):.4f}')



In [None]:
pop_model = PopularityBaselineModel(df_test)
evaluate_model(pop_model, df_test, df_train, user_feat, k=10)

MAE@10: 1.7405
RMSE@10: 1.8928
NDCG@10: 0.7711
Novelty@10: 2.5669
Tag Diversity@10: 0.2254
Average Popularity@10: 1.2692


In [18]:
baseline_model = GlobalMeanBaseline()
evaluate_model(baseline_model, df_test, df_train, user_feat, k=10)

MAE@10: 0.5112
RMSE@10: 0.7481
NDCG@10: 0.8014
Novelty@10: 1.9128
Tag Diversity@10: 0.2787
Average Popularity@10: 1.2185
