# Imports

In [1]:
SEED = 69
import numpy as np
import warnings
warnings.filterwarnings("ignore")
import os
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
os.environ['PYTHONHASHSEED'] = str(SEED)
os.environ['TF_DETERMINISTIC_OPS'] = '1'
os.environ['TF_CUDNN_DETERMINISTIC'] = '1'
import random
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from typing import Tuple, Dict
import plotly.express as px
import plotly.figure_factory as ff
from scipy import stats
import ast
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Input, Lambda
import tensorflow.keras.backend as K
#######################################
np.random.seed(SEED)
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

2025-05-13 17:48:18.060505: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747151298.070236   55690 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747151298.073165   55690 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1747151298.081912   55690 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1747151298.081920   55690 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1747151298.081921   55690 computation_placer.cc:177] computation placer alr

# Loading the data

In [2]:
root = "data_final_project/KuaiRec 2.0/data/"
train_data = pd.read_parquet("./exports/train_data.pq")
test_data = pd.read_parquet("./exports/test_data.pq")

In [3]:
train_data.columns

Index(['user_id', 'video_id', 'video_duration', 'timestamp', 'watch_ratio',
       'friend_list', 'friend_count', 'feat_0', 'feat_1', 'feat_2', 'feat_3',
       'feat_4', 'feat_5', 'feat_6', 'feat_7', 'feat_8', 'feat_9', 'feat_10',
       'feat_11', 'feat_12', 'feat_13', 'feat_14', 'feat_15', 'feat_16',
       'feat_17', 'feat_18', 'feat_19', 'feat_20', 'feat_21', 'feat_22',
       'feat_23', 'feat_24', 'feat_25', 'feat_26', 'feat_27', 'feat_28',
       'feat_29', 'feat_30', 'user_active_degree', 'is_lowactive_period',
       'is_live_streamer', 'is_video_author', 'follow_user_num',
       'fans_user_num', 'friend_user_num', 'register_days', 'onehot_feat0',
       'onehot_feat1', 'onehot_feat2', 'onehot_feat3', 'onehot_feat4',
       'onehot_feat5', 'onehot_feat6', 'onehot_feat7', 'onehot_feat8',
       'onehot_feat9', 'onehot_feat10', 'onehot_feat11', 'onehot_feat12',
       'onehot_feat13', 'onehot_feat14', 'onehot_feat15', 'onehot_feat16',
       'onehot_feat17', 'first_level_catego

In [4]:
train_data.dtypes

user_id                        int64
video_id                       int64
video_duration                 int64
timestamp             datetime64[ns]
watch_ratio                  float64
                           ...      
download_cnt                 float64
cancel_like_cnt              float64
delete_comment_cnt           float64
cancel_follow_cnt            float64
cancel_collect_cnt           float64
Length: 77, dtype: object

In [None]:
df = train_data.copy()
df['timestamp'] = pd.to_datetime(df['timestamp'])

def parse_friends(val):
        if isinstance(val, str):
            try:
                return list(map(int, ast.literal_eval(val)))
            except:
                return []
        if isinstance(val, (list, np.ndarray)):
            return list(map(int, val))
        return []
df["friend_list"] = df["friend_list"].apply(parse_friends)

def friends_watch_ratio(df):
    exploded = df[['user_id', 'video_id', 'timestamp', 'friend_list']].explode('friend_list').rename(columns={'friend_list': 'friend_id'})
    friend_watches = df[['user_id', 'video_id', 'watch_ratio', 'timestamp']].rename(columns={'user_id': 'friend_id', 'watch_ratio': 'friend_watch_ratio', 'timestamp': 'friend_timestamp'})
    merged = exploded.merge(friend_watches, on=['friend_id', 'video_id'], how='left')
    merged = merged[merged['friend_timestamp'] <= merged['timestamp']]
    mean_ratios = merged.groupby(['user_id', 'video_id'])['friend_watch_ratio'].mean().reset_index()
    df = df.merge(mean_ratios, on=['user_id', 'video_id'], how='left')
    df['friend_watch_ratio'] = df['friend_watch_ratio'].fillna(df['video_cum_avg_watch_ratio'])
    return df

def mean_by_first_level_cat(df):
    df = df.sort_values(['user_id', 'first_level_category_id', 'timestamp'])
    df['cumulative_avg_watch_ratio'] = (
        df.groupby(['user_id', 'first_level_category_id'])['watch_ratio']
        .expanding()
        .mean()
        .reset_index(level=[0,1], drop=True)
    )
    return df

def add_video_cumulative_avg_watch_ratio(df):
    df = df.sort_values(['video_id', 'timestamp'])
    df['video_cum_avg_watch_ratio'] = (
        df.groupby('video_id')['watch_ratio']
        .expanding()
        .mean()
        .reset_index(level=0, drop=True)
    )
    return df

def add_user_cumulative_avg_watch_ratio(df):
    df = df.sort_values(['user_id', 'timestamp'])
    df['user_cum_avg_watch_ratio'] = (
        df.groupby('user_id')['watch_ratio']
        .expanding()
        .mean()
        .reset_index(level=0, drop=True)
    )
    return df

df = add_user_cumulative_avg_watch_ratio(df)
df = add_video_cumulative_avg_watch_ratio(df)
df = mean_by_first_level_cat(df)
df = friends_watch_ratio(df)

df = df.drop(columns=["friend_list"])
df = df.drop(columns=["user_active_degree"])
df = df.drop(columns=["is_lowactive_period"])

to_remove = [col for col in df.columns if abs(df["watch_ratio"].corr(df[col])) < 0.1 and col != "user_id"] + ["feat_24"]
print(to_remove)
df = df.drop(columns=to_remove)
df.corr()

['video_id', 'timestamp', 'friend_count', 'feat_0', 'feat_1', 'feat_2', 'feat_3', 'feat_4', 'feat_5', 'feat_6', 'feat_7', 'feat_8', 'feat_9', 'feat_10', 'feat_11', 'feat_13', 'feat_14', 'feat_15', 'feat_16', 'feat_17', 'feat_18', 'feat_19', 'feat_20', 'feat_21', 'feat_22', 'feat_23', 'feat_25', 'feat_26', 'feat_27', 'feat_28', 'feat_29', 'feat_30', 'is_live_streamer', 'is_video_author', 'follow_user_num', 'fans_user_num', 'friend_user_num', 'register_days', 'onehot_feat0', 'onehot_feat1', 'onehot_feat2', 'onehot_feat3', 'onehot_feat4', 'onehot_feat5', 'onehot_feat6', 'onehot_feat7', 'onehot_feat8', 'onehot_feat9', 'onehot_feat10', 'onehot_feat11', 'onehot_feat12', 'onehot_feat13', 'onehot_feat14', 'onehot_feat15', 'onehot_feat16', 'onehot_feat17', 'first_level_category_id', 'comment_cnt', 'share_cnt', 'collect_cnt', 'download_cnt', 'cancel_like_cnt', 'delete_comment_cnt', 'cancel_collect_cnt', 'feat_24', 'feat_12']


Unnamed: 0,user_id,video_duration,watch_ratio,show_cnt,valid_play_cnt,like_cnt,follow_cnt,cancel_follow_cnt,user_cum_avg_watch_ratio,video_cum_avg_watch_ratio,cumulative_avg_watch_ratio,friend_watch_ratio
user_id,1.0,0.000316,-0.00519,-8e-05,-0.000151,0.000199,0.000244,7.8e-05,-0.011658,0.000304,-0.008172,0.001473
video_duration,0.000316,1.0,-0.302293,0.19757,0.178246,0.193067,0.284521,0.284614,-0.012601,-0.572807,-0.239647,-0.552886
watch_ratio,-0.00519,-0.302293,1.0,-0.12217,-0.114531,-0.101451,-0.121254,-0.112576,0.251973,0.52942,0.39912,0.514494
show_cnt,-8e-05,0.19757,-0.12217,1.0,0.986783,0.768243,0.617032,0.581491,-0.002585,-0.228653,-0.08493,-0.220124
valid_play_cnt,-0.000151,0.178246,-0.114531,0.986783,1.0,0.748879,0.569955,0.536802,-0.002387,-0.213833,-0.078255,-0.205824
like_cnt,0.000199,0.193067,-0.101451,0.768243,0.748879,1.0,0.756034,0.53164,-0.003046,-0.190448,-0.06578,-0.183541
follow_cnt,0.000244,0.284521,-0.121254,0.617032,0.569955,0.756034,1.0,0.61036,-0.001359,-0.229283,-0.105391,-0.221095
cancel_follow_cnt,7.8e-05,0.284614,-0.112576,0.581491,0.536802,0.53164,0.61036,1.0,-0.001298,-0.213229,-0.096515,-0.205623
user_cum_avg_watch_ratio,-0.011658,-0.012601,0.251973,-0.002585,-0.002387,-0.003046,-0.001359,-0.001298,1.0,0.017909,0.648614,0.026464
video_cum_avg_watch_ratio,0.000304,-0.572807,0.52942,-0.228653,-0.213833,-0.190448,-0.229283,-0.213229,0.017909,1.0,0.289912,0.965517


In [19]:
df.columns

Index(['user_id', 'video_duration', 'watch_ratio', 'show_cnt',
       'valid_play_cnt', 'like_cnt', 'follow_cnt', 'cancel_follow_cnt',
       'user_cum_avg_watch_ratio', 'video_cum_avg_watch_ratio',
       'cumulative_avg_watch_ratio', 'friend_watch_ratio'],
      dtype='object')

In [20]:
train = train_data.copy().dropna()
test = test_data.copy().dropna()

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

df = df.dropna()

target = 'watch_ratio'
exclude = ['user_id', target]
features = [col for col in df.columns if col not in exclude]

X = df[features].values
y = df[target].values

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

In [22]:
from tensorflow.keras import layers, models

model = models.Sequential([
    layers.Input(shape=(X_train.shape[1],)),

    layers.Dense(256, activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(0.2),

    layers.Dense(128, activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(0.2),

    layers.Dense(128, activation='relu'),
    layers.Dense(1)
])

In [23]:
from tensorflow import keras

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    loss='mean_squared_error',
    metrics=['mae']
)

history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=10,
    batch_size=1024,
    callbacks=[keras.callbacks.EarlyStopping(patience=8, restore_best_weights=True)]
)

Epoch 1/10


2025-05-13 17:51:52.415059: E tensorflow/core/framework/node_def_util.cc:680] NodeDef mentions attribute use_unbounded_threadpool which is not in the op definition: Op<name=MapDataset; signature=input_dataset:variant, other_arguments: -> handle:variant; attr=f:func; attr=Targuments:list(type),min=0; attr=output_types:list(type),min=1; attr=output_shapes:list(shape),min=1; attr=use_inter_op_parallelism:bool,default=true; attr=preserve_cardinality:bool,default=false; attr=force_synchronous:bool,default=false; attr=metadata:string,default=""> This may be expected if your graph generating binary is newer  than this binary. Unknown attributes will be ignored. NodeDef: {{node ParallelMapDatasetV2/_15}}


[1m2657/2668[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 2ms/step - loss: 0.3933 - mae: 0.4224

2025-05-13 17:51:58.274207: E tensorflow/core/framework/node_def_util.cc:680] NodeDef mentions attribute use_unbounded_threadpool which is not in the op definition: Op<name=MapDataset; signature=input_dataset:variant, other_arguments: -> handle:variant; attr=f:func; attr=Targuments:list(type),min=0; attr=output_types:list(type),min=1; attr=output_shapes:list(shape),min=1; attr=use_inter_op_parallelism:bool,default=true; attr=preserve_cardinality:bool,default=false; attr=force_synchronous:bool,default=false; attr=metadata:string,default=""> This may be expected if your graph generating binary is newer  than this binary. Unknown attributes will be ignored. NodeDef: {{node ParallelMapDatasetV2/_15}}


[1m2668/2668[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - loss: 0.3929 - mae: 0.4221 - val_loss: 0.2237 - val_mae: 0.3047
Epoch 2/10
[1m2668/2668[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - loss: 0.2354 - mae: 0.3214 - val_loss: 0.2240 - val_mae: 0.3029
Epoch 3/10
[1m2668/2668[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - loss: 0.2263 - mae: 0.3109 - val_loss: 0.2223 - val_mae: 0.3005
Epoch 4/10
[1m2668/2668[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - loss: 0.2228 - mae: 0.3066 - val_loss: 0.2220 - val_mae: 0.2993
Epoch 5/10
[1m2668/2668[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - loss: 0.2212 - mae: 0.3042 - val_loss: 0.2213 - val_mae: 0.2985
Epoch 6/10
[1m2668/2668[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - loss: 0.2203 - mae: 0.3029 - val_loss: 0.2210 - val_mae: 0.2983
Epoch 7/10
[1m2668/2668[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step

In [24]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

mae_df = test_data.copy()
user_avg = mae_df.groupby("user_id")["watch_ratio"].mean()
mae_df["baseline_pred"] = mae_df["user_id"].map(user_avg).fillna(mae_df["watch_ratio"].mean())

mae = mean_absolute_error(mae_df["watch_ratio"], mae_df["baseline_pred"])
print(f"Baseline MAE: {mae:.4f}")

Baseline MAE: 0.4159


In [25]:
print(len(test_data))
print(len(train_data))

455362
3414860


### EVALUATE THE MODEL

In [26]:
df = test_data.copy()
df['timestamp'] = pd.to_datetime(df['timestamp'])

def parse_friends(val):
        if isinstance(val, str):
            try:
                return list(map(int, ast.literal_eval(val)))
            except:
                return []
        if isinstance(val, (list, np.ndarray)):
            return list(map(int, val))
        return []
df["friend_list"] = df["friend_list"].apply(parse_friends)

def friends_watch_ratio(df):
    exploded = df[['user_id', 'video_id', 'timestamp', 'friend_list']].explode('friend_list')
    exploded = exploded.rename(columns={'friend_list': 'friend_id'})

    friend_watches = df[['user_id', 'video_id', 'watch_ratio', 'timestamp']].rename(columns={
        'user_id': 'friend_id',
        'watch_ratio': 'friend_watch_ratio',
        'timestamp': 'friend_timestamp'
    })
    merged = exploded.merge(friend_watches, on=['friend_id', 'video_id'], how='left')
    merged = merged[merged['friend_timestamp'] <= merged['timestamp']]
    mean_ratios = merged.groupby(['user_id', 'video_id'])['friend_watch_ratio'].mean().reset_index()
    df = df.merge(mean_ratios, on=['user_id', 'video_id'], how='left')
    df['friend_watch_ratio'] = df['friend_watch_ratio'].fillna(df['video_cum_avg_watch_ratio'])
    return df

def mean_by_first_level_cat(df):
    df = df.sort_values(['user_id', 'first_level_category_id', 'timestamp'])
    df['cumulative_avg_watch_ratio'] = (
        df.groupby(['user_id', 'first_level_category_id'])['watch_ratio']
        .expanding()
        .mean()
        .reset_index(level=[0,1], drop=True)
    )
    return df

def add_video_cumulative_avg_watch_ratio(df):
    df = df.sort_values(['video_id', 'timestamp'])
    df['video_cum_avg_watch_ratio'] = (
        df.groupby('video_id')['watch_ratio']
        .expanding()
        .mean()
        .reset_index(level=0, drop=True)
    )
    return df

def add_user_cumulative_avg_watch_ratio(df):
    df = df.sort_values(['user_id', 'timestamp'])
    df['user_cum_avg_watch_ratio'] = (
        df.groupby('user_id')['watch_ratio']
        .expanding()
        .mean()
        .reset_index(level=0, drop=True)
    )
    return df

df = add_user_cumulative_avg_watch_ratio(df)
df = add_video_cumulative_avg_watch_ratio(df)
df = mean_by_first_level_cat(df)
df = friends_watch_ratio(df)

df = df.drop(columns=["friend_list"])
df = df.drop(columns=["user_active_degree"])
df = df.drop(columns=["is_lowactive_period"])


print(to_remove)
df = df.drop(columns=to_remove)
df.corr()

['video_id', 'timestamp', 'friend_count', 'feat_0', 'feat_1', 'feat_2', 'feat_3', 'feat_4', 'feat_5', 'feat_6', 'feat_7', 'feat_8', 'feat_9', 'feat_10', 'feat_11', 'feat_13', 'feat_14', 'feat_15', 'feat_16', 'feat_17', 'feat_18', 'feat_19', 'feat_20', 'feat_21', 'feat_22', 'feat_23', 'feat_25', 'feat_26', 'feat_27', 'feat_28', 'feat_29', 'feat_30', 'is_live_streamer', 'is_video_author', 'follow_user_num', 'fans_user_num', 'friend_user_num', 'register_days', 'onehot_feat0', 'onehot_feat1', 'onehot_feat2', 'onehot_feat3', 'onehot_feat4', 'onehot_feat5', 'onehot_feat6', 'onehot_feat7', 'onehot_feat8', 'onehot_feat9', 'onehot_feat10', 'onehot_feat11', 'onehot_feat12', 'onehot_feat13', 'onehot_feat14', 'onehot_feat15', 'onehot_feat16', 'onehot_feat17', 'first_level_category_id', 'comment_cnt', 'share_cnt', 'collect_cnt', 'download_cnt', 'cancel_like_cnt', 'delete_comment_cnt', 'cancel_collect_cnt', 'feat_24', 'feat_12']


Unnamed: 0,user_id,video_duration,watch_ratio,show_cnt,valid_play_cnt,like_cnt,follow_cnt,cancel_follow_cnt,user_cum_avg_watch_ratio,video_cum_avg_watch_ratio,cumulative_avg_watch_ratio,friend_watch_ratio
user_id,1.0,-0.003025,9.3e-05,-0.005361,-0.005072,-0.003866,-0.004412,-0.003685,0.004736,0.001965,0.003461,0.004307
video_duration,-0.003025,1.0,-0.373594,0.301633,0.267983,0.332009,0.501292,0.536518,-0.02383,-0.671264,-0.336079,-0.653534
watch_ratio,9.3e-05,-0.373594,1.0,-0.19538,-0.179242,-0.161722,-0.208727,-0.195745,0.331801,0.558613,0.594167,0.54705
show_cnt,-0.005361,0.301633,-0.19538,1.0,0.990804,0.72735,0.577946,0.551735,-0.014445,-0.3459,-0.115497,-0.336907
valid_play_cnt,-0.005072,0.267983,-0.179242,0.990804,1.0,0.696954,0.51794,0.504624,-0.014281,-0.316701,-0.098374,-0.308448
like_cnt,-0.003866,0.332009,-0.161722,0.72735,0.696954,1.0,0.691173,0.612619,-0.011196,-0.287753,-0.104132,-0.28039
follow_cnt,-0.004412,0.501292,-0.208727,0.577946,0.51794,0.691173,1.0,0.72021,-0.006932,-0.375265,-0.185987,-0.365377
cancel_follow_cnt,-0.003685,0.536518,-0.195745,0.551735,0.504624,0.612619,0.72021,1.0,-0.008125,-0.351313,-0.170844,-0.341933
user_cum_avg_watch_ratio,0.004736,-0.02383,0.331801,-0.014445,-0.014281,-0.011196,-0.006932,-0.008125,1.0,0.056837,0.579359,0.060123
video_cum_avg_watch_ratio,0.001965,-0.671264,0.558613,-0.3459,-0.316701,-0.287753,-0.375265,-0.351313,0.056837,1.0,0.367508,0.974025


In [27]:
df = df.dropna()

target = 'watch_ratio'
exclude = ['user_id', target]
features = [col for col in df.columns if col not in exclude]

X_test = df[features].values
y_test = df[target].values

X_test = scaler.transform(X_test)

In [28]:
y_pred = model.predict(X_test).flatten()
mae = mean_absolute_error(y_test, y_pred)
print(f"Model MAE: {mae:.4f}")

sample_df = pd.DataFrame({
    "actual": y_test,
    "predicted": y_pred
}).sample(10, random_state=2)

print(sample_df)

[1m  372/14231[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m5s[0m 406us/step

2025-05-13 17:52:51.816771: E tensorflow/core/framework/node_def_util.cc:680] NodeDef mentions attribute use_unbounded_threadpool which is not in the op definition: Op<name=MapDataset; signature=input_dataset:variant, other_arguments: -> handle:variant; attr=f:func; attr=Targuments:list(type),min=0; attr=output_types:list(type),min=1; attr=output_shapes:list(shape),min=1; attr=use_inter_op_parallelism:bool,default=true; attr=preserve_cardinality:bool,default=false; attr=force_synchronous:bool,default=false; attr=metadata:string,default=""> This may be expected if your graph generating binary is newer  than this binary. Unknown attributes will be ignored. NodeDef: {{node ParallelMapDatasetV2/_14}}


[1m14231/14231[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 431us/step
Model MAE: 0.2705
          actual  predicted
339518  0.888182   1.171848
332515  3.099000   1.937821
455142  0.150519   0.224828
199649  0.687155   0.597039
333678  0.000000   0.695607
183871  1.478027   0.952228
268571  0.497572   0.600312
341839  0.940808   0.894541
216001  0.974329   1.420365
340194  0.253350   0.348805
