Notes for myself:
- Consider adding the `upload_dt` column when selecting columns from `item_daily_features`, so that a penalty can be applied to videos that were posted a long time ago.

# Imports

In [1]:
# Single seed value reused below for the Python, NumPy and TensorFlow RNGs.
SEED = 69
import numpy as np
import warnings
# Silence every Python warning for the rest of the notebook.
warnings.filterwarnings("ignore")
import os
# The TF_* variables are set here, before `import tensorflow` further down in
# this cell.
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
# NOTE(review): Python normally reads PYTHONHASHSEED at interpreter start-up, so
# setting it here presumably only affects child processes — confirm.
os.environ['PYTHONHASHSEED'] = str(SEED)
os.environ['TF_DETERMINISTIC_OPS'] = '1'
os.environ['TF_CUDNN_DETERMINISTIC'] = '1'
import random
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from typing import Tuple, Dict
import plotly.express as px
import plotly.figure_factory as ff
from scipy import stats
import ast
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Input, Lambda
import tensorflow.keras.backend as K
#######################################
# Seed each random number generator exactly once with the shared SEED:
# Python's `random`, NumPy's legacy global RNG, and TensorFlow's global RNG.
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

2025-05-13 01:37:15.119052: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747093035.128935  233227 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747093035.131903  233227 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1747093035.140627  233227 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1747093035.140634  233227 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1747093035.140635  233227 computation_placer.cc:177] computation placer alr

# Loading the data

In [2]:
# Location of the raw KuaiRec files (not read in this cell).
root = "data_final_project/KuaiRec 2.0/data/"

# Read the exported train and test frames, in that order.
train_data, test_data = (
    pd.read_parquet(path)
    for path in ("./exports/train_data.pq", "./exports/test_data.pq")
)

In [3]:
# Show the column names available in the training frame.
train_data.columns

Index(['user_id', 'video_id', 'timestamp', 'watch_ratio', 'friend_list',
       'friend_count', 'feat_0', 'feat_1', 'feat_2', 'feat_3', 'feat_4',
       'feat_5', 'feat_6', 'feat_7', 'feat_8', 'feat_9', 'feat_10', 'feat_11',
       'feat_12', 'feat_13', 'feat_14', 'feat_15', 'feat_16', 'feat_17',
       'feat_18', 'feat_19', 'feat_20', 'feat_21', 'feat_22', 'feat_23',
       'feat_24', 'feat_25', 'feat_26', 'feat_27', 'feat_28', 'feat_29',
       'feat_30', 'video_tag_id', 'video_duration', 'show_cnt',
       'valid_play_cnt', 'play_progress', 'like_cnt', 'comment_cnt',
       'share_cnt', 'follow_cnt', 'collect_cnt', 'download_cnt',
       'cancel_like_cnt', 'delete_comment_cnt', 'cancel_follow_cnt',
       'cancel_collect_cnt', 'date', 'user_active_degree',
       'is_lowactive_period', 'is_live_streamer', 'is_video_author',
       'follow_user_num', 'fans_user_num', 'friend_user_num', 'register_days',
       'onehot_feat0', 'onehot_feat1', 'onehot_feat2', 'onehot_feat3',
       'one

In [4]:
# Show the dtype of every training column.
train_data.dtypes

user_id                             int64
video_id                            int64
timestamp                  datetime64[ns]
watch_ratio                       float64
friend_list                        object
                                ...      
onehot_feat14                     float64
onehot_feat15                     float64
onehot_feat16                     float64
onehot_feat17                     float64
first_level_category_id             int64
Length: 80, dtype: object

In [None]:
# Work on a copy so the loaded `train_data` frame itself is not modified by the
# feature engineering below.
df = train_data.copy()

def mean_by_first_level_cat(df):
    """Attach each user's mean watch ratio per first-level category.

    Builds a user x category table of mean `watch_ratio` (missing
    user/category combinations become 0), names its columns
    `avg_watch_ratio_cat_<category id>`, and left-joins it onto `df` by
    `user_id`.

    Args:
        df: frame with `user_id`, `first_level_category_id` and
            `watch_ratio` columns.

    Returns:
        A new frame: `df` plus one `avg_watch_ratio_cat_*` column per category.
    """
    per_user_category = pd.pivot_table(
        df,
        values='watch_ratio',
        index='user_id',
        columns='first_level_category_id',
        aggfunc='mean',
    )
    # A user who never watched a category gets 0 rather than NaN.
    per_user_category = per_user_category.fillna(0)
    per_user_category.columns = [
        f'avg_watch_ratio_cat_{col}' for col in per_user_category.columns
    ]
    return pd.merge(df, per_user_category, how='left', on='user_id')

def age_of_video(df):
    """Add a `video_age` column derived from `timestamp` and `date`.

    Both columns are converted to datetimes in place on `df`; `video_age` is
    the whole-day difference `timestamp - date` multiplied by 10.

    Args:
        df: frame with `timestamp` and `date` columns (modified in place).

    Returns:
        The same `df` object, with the converted columns and `video_age`.
    """
    for column in ('timestamp', 'date'):
        df[column] = pd.to_datetime(df[column])
    elapsed = df['timestamp'] - df['date']
    df['video_age'] = elapsed.dt.days * 10
    return df


# `friends_watch_ratio` is called a few lines below, but elsewhere in this
# notebook it is only defined in the evaluation cell, which runs later. Without
# a definition here this cell fails with NameError on a fresh kernel
# (Restart & Run All), so the helper is defined before its first use.
def friends_watch_ratio(df):
    """Add `friend_watch_ratio`: mean watch ratio of a user's friends per video.

    Works on a copy of `df`. `friend_list` is parsed into a list of ints, then
    for every (user_id, video_id) row the `watch_ratio` values of rows in the
    same frame whose `user_id` is one of the friends and whose `video_id`
    matches are averaged. Rows with no such friend rows fall back to the mean
    `watch_ratio` of that video over the whole frame.

    NOTE(review): the fallback average is computed from `watch_ratio` of all
    rows, including the row being filled — confirm this is acceptable when
    `watch_ratio` is the prediction target.

    Args:
        df: frame with `user_id`, `video_id`, `friend_list`, `watch_ratio`.

    Returns:
        A new frame with parsed `friend_list` and a `friend_watch_ratio` column.
    """
    df = df.copy()

    def parse_friends(val):
        # Strings are expected to hold a Python-literal list of ids.
        if isinstance(val, str):
            try:
                return list(map(int, ast.literal_eval(val)))
            except (ValueError, SyntaxError, TypeError):
                # Malformed literal or non-integer entries: treat as no friends.
                return []
        if isinstance(val, (list, np.ndarray)):
            return list(map(int, val))
        return []

    df['friend_list'] = df['friend_list'].apply(parse_friends)

    # One row per (user, video, friend); empty friend lists yield a NaN friend.
    exploded = df[['user_id', 'video_id', 'friend_list']].explode('friend_list')
    exploded = exploded.rename(columns={'friend_list': 'friend_id'})

    friend_watches = df[['user_id', 'video_id', 'watch_ratio']]
    friend_watches = friend_watches.rename(
        columns={'user_id': 'friend_id', 'watch_ratio': 'friend_watch_ratio'}
    )

    merged = exploded.merge(friend_watches, on=['friend_id', 'video_id'], how='left')
    mean_ratios = (
        merged.groupby(['user_id', 'video_id'])['friend_watch_ratio'].mean().reset_index()
    )
    video_avg = df.groupby('video_id')['watch_ratio'].mean().rename('video_avg_ratio')

    df = df.merge(mean_ratios, on=['user_id', 'video_id'], how='left')
    df = df.merge(video_avg, on='video_id', how='left')
    df['friend_watch_ratio'] = df['friend_watch_ratio'].fillna(df['video_avg_ratio'])

    return df.drop(columns=['video_avg_ratio'])


df = mean_by_first_level_cat(df)
df = age_of_video(df)
df = friends_watch_ratio(df)

# Drop the columns that are not carried into the feature matrix.
df = df.drop(columns=["friend_list", "user_active_degree", "is_lowactive_period"])

feat_cols = [col for col in df.columns if col.startswith('feat_')]

# Long format: one row per (interaction, feat_* column) with its flag value.
df_long = df[['user_id', 'watch_ratio'] + feat_cols].melt(
    id_vars=['user_id', 'watch_ratio'],
    value_vars=feat_cols,
    var_name='feature',
    value_name='is_active'
)

# Keep only the rows where the feat_* flag equals 1.
df_active = df_long[df_long['is_active'] == 1]

# Mean watch ratio per user for each feat_* flag that is set.
user_feat_avg = (
    df_active.groupby(['user_id', 'feature'])['watch_ratio']
    .mean()
    .reset_index()
    .rename(columns={'watch_ratio': 'user_feat_avg_watch_ratio'})
)

# NOTE(review): (user, feature) pairs with no active rows stay NaN after this
# pivot (the category averages are zero-filled instead), so the dropna() in a
# later cell discards every row of such users — confirm this is intended.
user_feat_wide = user_feat_avg.pivot(index='user_id', columns='feature', values='user_feat_avg_watch_ratio')
user_feat_wide.columns = [f'user_feat_avg_watch_ratio_{col.split("_")[1]}' for col in user_feat_wide.columns]
user_feat_wide = user_feat_wide.reset_index()

df = df.merge(user_feat_wide, on='user_id', how='left')


# Feature selection: remove every column (except user_id) whose absolute
# correlation with watch_ratio is below 0.05, plus feat_24 unconditionally.
# `to_remove` is reused by the evaluation cell to drop the same columns.
to_remove = [col for col in df.columns if abs(df["watch_ratio"].corr(df[col])) < 0.05 and col != "user_id"] + ["feat_24"]
print(to_remove)
df = df.drop(columns=to_remove)
df.corr()

['video_id', 'timestamp', 'friend_count', 'feat_0', 'feat_1', 'feat_2', 'feat_3', 'feat_4', 'feat_5', 'feat_7', 'feat_9', 'feat_10', 'feat_11', 'feat_13', 'feat_14', 'feat_15', 'feat_17', 'feat_18', 'feat_19', 'feat_20', 'feat_21', 'feat_22', 'feat_23', 'feat_25', 'feat_26', 'feat_27', 'feat_29', 'feat_30', 'video_tag_id', 'delete_comment_cnt', 'date', 'is_live_streamer', 'is_video_author', 'follow_user_num', 'fans_user_num', 'friend_user_num', 'register_days', 'onehot_feat0', 'onehot_feat1', 'onehot_feat2', 'onehot_feat3', 'onehot_feat4', 'onehot_feat5', 'onehot_feat6', 'onehot_feat7', 'onehot_feat8', 'onehot_feat9', 'onehot_feat10', 'onehot_feat11', 'onehot_feat12', 'onehot_feat13', 'onehot_feat14', 'onehot_feat15', 'onehot_feat16', 'onehot_feat17', 'first_level_category_id', 'video_age', 'feat_24']


Unnamed: 0,user_id,watch_ratio,feat_6,feat_8,feat_12,feat_16,feat_28,video_duration,show_cnt,valid_play_cnt,...,user_feat_avg_watch_ratio_28,user_feat_avg_watch_ratio_29,user_feat_avg_watch_ratio_3,user_feat_avg_watch_ratio_30,user_feat_avg_watch_ratio_4,user_feat_avg_watch_ratio_5,user_feat_avg_watch_ratio_6,user_feat_avg_watch_ratio_7,user_feat_avg_watch_ratio_8,user_feat_avg_watch_ratio_9
user_id,1.000000,-0.004572,-0.000104,0.000773,0.000207,-0.000047,-0.000474,0.000264,0.000241,0.000239,...,-0.018568,-0.028293,-0.013153,-0.044300,-0.013975,-0.026897,-0.012968,-0.014888,-0.025624,-0.007151
watch_ratio,-0.004572,1.000000,-0.086545,0.073450,-0.139917,-0.084922,0.088004,-0.337257,-0.202697,-0.188254,...,0.253129,0.112138,0.119745,0.145648,0.222456,0.236942,0.250498,0.247427,0.250289,0.247640
feat_6,-0.000104,-0.086545,1.000000,-0.008824,-0.055507,-0.052097,-0.189988,0.122025,0.128402,0.115645,...,0.000370,0.000567,0.000724,0.000191,0.000345,0.000511,0.000713,0.000422,0.000542,0.000553
feat_8,0.000773,0.073450,-0.008824,1.000000,-0.086176,-0.054660,-0.237838,-0.098850,-0.113769,-0.128148,...,0.002763,0.001119,0.001865,0.000738,0.002874,0.003308,0.002277,0.002980,0.002776,0.002195
feat_12,0.000207,-0.139917,-0.055507,-0.086176,1.000000,-0.018505,-0.113404,0.250859,0.050939,0.054314,...,-0.000167,-0.000177,-0.000145,0.000861,-0.000850,-0.000827,0.000165,-0.000861,-0.000663,0.000484
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
user_feat_avg_watch_ratio_5,-0.026897,0.236942,0.000511,0.003308,-0.000827,-0.003105,0.000039,-0.006035,-0.004986,-0.004504,...,0.906665,0.399991,0.425839,0.500743,0.809996,1.000000,0.884788,0.907572,0.920893,0.860812
user_feat_avg_watch_ratio_6,-0.012968,0.250498,0.000713,0.002277,0.000165,-0.003554,-0.000284,-0.006238,-0.003845,-0.003445,...,0.962062,0.438680,0.451518,0.560347,0.833941,0.884788,1.000000,0.935413,0.954547,0.959023
user_feat_avg_watch_ratio_7,-0.014888,0.247427,0.000422,0.002980,-0.000861,-0.003524,0.000132,-0.006821,-0.004602,-0.004201,...,0.946419,0.435504,0.448950,0.547021,0.842734,0.907572,0.935413,1.000000,0.947780,0.916194
user_feat_avg_watch_ratio_8,-0.025624,0.250289,0.000542,0.002776,-0.000663,-0.003438,0.000149,-0.006024,-0.004256,-0.003835,...,0.950932,0.425105,0.454810,0.543515,0.847009,0.920893,0.954547,0.947780,1.000000,0.924089


In [6]:
# Show the columns of `df` at this point in the notebook.
df.columns

Index(['user_id', 'watch_ratio', 'feat_6', 'feat_8', 'feat_12', 'feat_16',
       'feat_28', 'video_duration', 'show_cnt', 'valid_play_cnt',
       'play_progress', 'like_cnt', 'comment_cnt', 'share_cnt', 'follow_cnt',
       'collect_cnt', 'download_cnt', 'cancel_like_cnt', 'cancel_follow_cnt',
       'cancel_collect_cnt', 'avg_watch_ratio_cat_1', 'avg_watch_ratio_cat_2',
       'avg_watch_ratio_cat_3', 'avg_watch_ratio_cat_4',
       'avg_watch_ratio_cat_5', 'avg_watch_ratio_cat_6',
       'avg_watch_ratio_cat_7', 'avg_watch_ratio_cat_8',
       'avg_watch_ratio_cat_9', 'avg_watch_ratio_cat_10',
       'avg_watch_ratio_cat_11', 'avg_watch_ratio_cat_12',
       'avg_watch_ratio_cat_13', 'avg_watch_ratio_cat_14',
       'avg_watch_ratio_cat_15', 'avg_watch_ratio_cat_16',
       'avg_watch_ratio_cat_17', 'avg_watch_ratio_cat_18',
       'avg_watch_ratio_cat_19', 'avg_watch_ratio_cat_20',
       'avg_watch_ratio_cat_21', 'avg_watch_ratio_cat_22',
       'avg_watch_ratio_cat_23', 'avg_wat

In [7]:
# `dropna()` already returns a new frame, so no explicit `.copy()` of the full
# dataset is needed first; `train_data` / `test_data` are left untouched.
train = train_data.dropna()
test = test_data.dropna()

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Keep only rows where every remaining column is present.
df = df.dropna()

# Everything except the user id and the target is a model input.
target = 'watch_ratio'
exclude = ['user_id', target]
features = df.columns[~df.columns.isin(exclude)].tolist()

X, y = df[features].to_numpy(), df[target].to_numpy()

# 80/20 random split of the interactions.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Fit the scaler on the training part only, then apply it to both parts.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)

In [9]:
from tensorflow.keras import layers, models

# Feed-forward regressor: three Dense -> BatchNorm -> Dropout(0.2) stages of
# width 512, 256 and 128, then a plain Dense(128) and a single linear output.
# Layers are created in the same order as they appear in the network.
model = models.Sequential([
    layers.Input(shape=(X_train.shape[1],)),
    *(
        layer
        for hidden_units in (512, 256, 128)
        for layer in (
            layers.Dense(hidden_units, activation='relu'),
            layers.BatchNormalization(),
            layers.Dropout(0.2),
        )
    ),
    layers.Dense(128, activation='relu'),
    layers.Dense(1),
])

I0000 00:00:1747093084.044819  233227 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13507 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 4080 SUPER, pci bus id: 0000:01:00.0, compute capability: 8.9


In [10]:
from tensorflow import keras

# Regression objective: minimise MSE with Adam, and report MAE alongside it.
model.compile(
    loss='mean_squared_error',
    metrics=['mae'],
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
)

# NOTE(review): patience (8) is larger than epochs (2), so the early-stopping
# condition presumably cannot trigger in this run — confirm the intended
# epoch budget.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=256,
    epochs=2,
    validation_data=(X_val, y_val),
    callbacks=[keras.callbacks.EarlyStopping(patience=8, restore_best_weights=True)],
)

2025-05-13 01:38:04.677135: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 792961848 exceeds 10% of free system memory.
2025-05-13 01:38:05.230790: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 792961848 exceeds 10% of free system memory.


Epoch 1/2


2025-05-13 01:38:05.515564: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 792961848 exceeds 10% of free system memory.
2025-05-13 01:38:05.815088: E tensorflow/core/framework/node_def_util.cc:680] NodeDef mentions attribute use_unbounded_threadpool which is not in the op definition: Op<name=MapDataset; signature=input_dataset:variant, other_arguments: -> handle:variant; attr=f:func; attr=Targuments:list(type),min=0; attr=output_types:list(type),min=1; attr=output_shapes:list(shape),min=1; attr=use_inter_op_parallelism:bool,default=true; attr=preserve_cardinality:bool,default=false; attr=force_synchronous:bool,default=false; attr=metadata:string,default=""> This may be expected if your graph generating binary is newer  than this binary. Unknown attributes will be ignored. NodeDef: {{node ParallelMapDatasetV2/_15}}


[1m8880/8901[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 2ms/step - loss: 0.2852 - mae: 0.3887

2025-05-13 01:38:28.654059: E tensorflow/core/framework/node_def_util.cc:680] NodeDef mentions attribute use_unbounded_threadpool which is not in the op definition: Op<name=MapDataset; signature=input_dataset:variant, other_arguments: -> handle:variant; attr=f:func; attr=Targuments:list(type),min=0; attr=output_types:list(type),min=1; attr=output_shapes:list(shape),min=1; attr=use_inter_op_parallelism:bool,default=true; attr=preserve_cardinality:bool,default=false; attr=force_synchronous:bool,default=false; attr=metadata:string,default=""> This may be expected if your graph generating binary is newer  than this binary. Unknown attributes will be ignored. NodeDef: {{node ParallelMapDatasetV2/_15}}


[1m8901/8901[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 3ms/step - loss: 0.2851 - mae: 0.3885 - val_loss: 0.1617 - val_mae: 0.2798
Epoch 2/2
[1m8901/8901[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 2ms/step - loss: 0.1647 - mae: 0.2863 - val_loss: 0.1603 - val_mae: 0.2779


In [11]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# Baseline: predict each user's mean watch ratio from the training part; users
# absent from the training part get the overall training mean.
# NOTE(review): this MAE is measured on a validation split of the training
# frame, whereas the model MAE reported later is measured on the test frame —
# the two numbers are not computed on the same rows.
df_train, df_val = train_test_split(df, test_size=0.2, random_state=42)
user_avg = df_train.groupby("user_id")["watch_ratio"].mean()

# `assign` builds a new frame instead of writing a column into the frame
# returned by the split, which avoids pandas' chained-assignment pitfall.
df_val = df_val.assign(
    baseline_pred=df_val["user_id"].map(user_avg).fillna(df_train["watch_ratio"].mean())
)

mae = mean_absolute_error(df_val["watch_ratio"], df_val["baseline_pred"])
print(f"Baseline MAE: {mae:.4f}")

Baseline MAE: 0.3738


### EVALUATE THE MODEL

In [12]:
# Start the evaluation features from a copy of the test frame; `df` is rebound
# here and no longer refers to the training features.
df = test_data.copy()

def friends_watch_ratio(df):
    """Add `friend_watch_ratio`: mean watch ratio of a user's friends per video.

    Works on a copy of `df`. `friend_list` is parsed into a list of ints, then
    for every (user_id, video_id) row the `watch_ratio` values of rows in the
    same frame whose `user_id` is one of the friends and whose `video_id`
    matches are averaged. Rows with no such friend rows fall back to the mean
    `watch_ratio` of that video over the whole frame.

    NOTE(review): the fallback average is computed from `watch_ratio` of all
    rows, including the row being filled — confirm this is acceptable when
    `watch_ratio` is the prediction target.

    Args:
        df: frame with `user_id`, `video_id`, `friend_list`, `watch_ratio`.

    Returns:
        A new frame with parsed `friend_list` and a `friend_watch_ratio` column.
    """
    df = df.copy()

    def parse_friends(val):
        """Return `val` as a list of ints; unparseable or missing values -> []."""
        if isinstance(val, str):
            try:
                return list(map(int, ast.literal_eval(val)))
            except (ValueError, SyntaxError, TypeError):
                # Only parse failures are treated as "no friends"; unrelated
                # errors (e.g. KeyboardInterrupt) are no longer swallowed.
                return []
        if isinstance(val, (list, np.ndarray)):
            return list(map(int, val))
        return []

    df['friend_list'] = df['friend_list'].apply(parse_friends)

    # One row per (user, video, friend); empty friend lists yield a NaN friend.
    exploded = df[['user_id', 'video_id', 'friend_list']].explode('friend_list')
    exploded = exploded.rename(columns={'friend_list': 'friend_id'})

    # The same interactions, re-keyed so they can be looked up by friend id.
    friend_watches = df[['user_id', 'video_id', 'watch_ratio']]
    friend_watches = friend_watches.rename(columns={'user_id': 'friend_id', 'watch_ratio': 'friend_watch_ratio'})

    merged = exploded.merge(friend_watches, on=['friend_id', 'video_id'], how='left')

    mean_ratios = merged.groupby(['user_id', 'video_id'])['friend_watch_ratio'].mean().reset_index()

    video_avg = df.groupby('video_id')['watch_ratio'].mean().rename('video_avg_ratio')

    df = df.merge(mean_ratios, on=['user_id', 'video_id'], how='left')
    df = df.merge(video_avg, on='video_id', how='left')

    # No friend watched this video: fall back to the video-level mean.
    df['friend_watch_ratio'] = df['friend_watch_ratio'].fillna(df['video_avg_ratio'])

    return df.drop(columns=['video_avg_ratio'])


def mean_by_first_level_cat(df):
    """Attach each user's mean watch ratio per first-level category.

    NOTE(review): a function with this name and the same logic is also defined
    in the training feature cell earlier in the notebook; this later
    definition shadows it.

    Args:
        df: frame with `user_id`, `first_level_category_id` and
            `watch_ratio` columns.

    Returns:
        A new frame: `df` left-joined by `user_id` with one
        `avg_watch_ratio_cat_<category id>` column per category, where
        unseen user/category combinations are 0.
    """
    category_means = pd.pivot_table(
        df,
        values='watch_ratio',
        index='user_id',
        columns='first_level_category_id',
        aggfunc='mean',
    )
    category_means = category_means.fillna(0)
    category_means.columns = [
        f'avg_watch_ratio_cat_{col}' for col in category_means.columns
    ]
    return pd.merge(df, category_means, how='left', on='user_id')

def age_of_video(df):
    """Add a `video_age` column derived from `timestamp` and `date`.

    NOTE(review): a function with this name and the same logic is also defined
    in the training feature cell earlier in the notebook; this later
    definition shadows it.

    Args:
        df: frame with `timestamp` and `date` columns (modified in place).

    Returns:
        The same `df` object; both columns are converted to datetimes and
        `video_age` holds the whole-day difference `timestamp - date` times 10.
    """
    for column in ('timestamp', 'date'):
        df[column] = pd.to_datetime(df[column])
    day_gap = (df['timestamp'] - df['date']).dt.days
    df['video_age'] = day_gap * 10
    return df


# Rebuild, on the test frame, the same engineered features as in the training
# cell.
# NOTE(review): every aggregate below (category means, friend ratios, per-flag
# means) is computed from the test frame's own `watch_ratio`, which is also the
# value being predicted — confirm this is intended for the evaluation.
df = friends_watch_ratio(age_of_video(mean_by_first_level_cat(df)))

for unused_column in ("friend_list", "user_active_degree", "is_lowactive_period"):
    df = df.drop(columns=[unused_column])

feat_cols = list(filter(lambda col: col.startswith('feat_'), df.columns))

# Long format: one row per (interaction, feat_* column) with its flag value.
df_long = pd.melt(
    df[['user_id', 'watch_ratio'] + feat_cols],
    id_vars=['user_id', 'watch_ratio'],
    value_vars=feat_cols,
    var_name='feature',
    value_name='is_active',
)

# Only rows where the flag equals 1 contribute to the per-user averages.
df_active = df_long.loc[df_long['is_active'].eq(1)]

user_feat_avg = (
    df_active
    .groupby(['user_id', 'feature'])['watch_ratio']
    .mean()
    .reset_index()
    .rename(columns={'watch_ratio': 'user_feat_avg_watch_ratio'})
)

# Wide format: one `user_feat_avg_watch_ratio_<n>` column per feat_<n> flag.
user_feat_wide = user_feat_avg.pivot(
    index='user_id',
    columns='feature',
    values='user_feat_avg_watch_ratio',
)
user_feat_wide.columns = [
    f'user_feat_avg_watch_ratio_{col.split("_")[1]}' for col in user_feat_wide.columns
]
user_feat_wide = user_feat_wide.reset_index()

df = pd.merge(df, user_feat_wide, how='left', on='user_id')

# Drop exactly the columns that were removed from the training features;
# `to_remove` comes from the training feature cell.
print(to_remove)
df = df.drop(columns=to_remove)
df.corr()

['video_id', 'timestamp', 'friend_count', 'feat_0', 'feat_1', 'feat_2', 'feat_3', 'feat_4', 'feat_5', 'feat_7', 'feat_9', 'feat_10', 'feat_11', 'feat_13', 'feat_14', 'feat_15', 'feat_17', 'feat_18', 'feat_19', 'feat_20', 'feat_21', 'feat_22', 'feat_23', 'feat_25', 'feat_26', 'feat_27', 'feat_29', 'feat_30', 'video_tag_id', 'delete_comment_cnt', 'date', 'is_live_streamer', 'is_video_author', 'follow_user_num', 'fans_user_num', 'friend_user_num', 'register_days', 'onehot_feat0', 'onehot_feat1', 'onehot_feat2', 'onehot_feat3', 'onehot_feat4', 'onehot_feat5', 'onehot_feat6', 'onehot_feat7', 'onehot_feat8', 'onehot_feat9', 'onehot_feat10', 'onehot_feat11', 'onehot_feat12', 'onehot_feat13', 'onehot_feat14', 'onehot_feat15', 'onehot_feat16', 'onehot_feat17', 'first_level_category_id', 'video_age', 'feat_24']


Unnamed: 0,user_id,watch_ratio,feat_6,feat_8,feat_12,feat_16,feat_28,video_duration,show_cnt,valid_play_cnt,...,user_feat_avg_watch_ratio_28,user_feat_avg_watch_ratio_29,user_feat_avg_watch_ratio_3,user_feat_avg_watch_ratio_30,user_feat_avg_watch_ratio_4,user_feat_avg_watch_ratio_5,user_feat_avg_watch_ratio_6,user_feat_avg_watch_ratio_7,user_feat_avg_watch_ratio_8,user_feat_avg_watch_ratio_9
user_id,1.000000,-0.003428,-0.001109,-0.002670,-0.000341,0.000335,0.002713,-0.002775,0.001064,0.001026,...,-0.008315,0.010823,0.039229,0.019781,-0.033004,-0.016664,-0.041260,-0.026670,-0.025025,-0.007553
watch_ratio,-0.003428,1.000000,-0.084784,0.040405,-0.101701,-0.185673,0.159513,-0.414713,-0.190443,-0.167341,...,0.312509,0.091443,0.113643,0.088287,0.183688,0.193169,0.259215,0.267077,0.293697,0.295289
feat_6,-0.001109,-0.084784,1.000000,0.087395,-0.065517,-0.066794,-0.159124,0.210955,0.147120,0.142420,...,-0.008391,-0.001852,-0.002647,-0.002554,-0.006636,-0.005406,-0.003600,-0.007519,-0.009262,-0.008153
feat_8,-0.002670,0.040405,0.087395,1.000000,-0.080044,-0.078560,-0.284386,-0.080062,-0.102780,-0.102163,...,-0.010623,-0.006376,-0.007351,-0.002461,-0.008602,-0.005297,-0.000344,-0.008454,-0.011508,-0.008322
feat_12,-0.000341,-0.101701,-0.065517,-0.080044,1.000000,0.007238,-0.148618,0.168358,-0.031538,-0.042713,...,0.002138,-0.003610,-0.001539,0.000468,0.002239,0.002494,-0.000901,0.001281,0.002325,-0.000249
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
user_feat_avg_watch_ratio_5,-0.016664,0.193169,-0.005406,-0.005297,0.002494,0.000979,0.000786,-0.000903,-0.000765,-0.000917,...,0.574362,0.114261,0.296143,0.155516,0.365400,1.000000,0.503508,0.481330,0.577496,0.551753
user_feat_avg_watch_ratio_6,-0.041260,0.259215,-0.003600,-0.000344,-0.000901,-0.000159,-0.000341,-0.009271,-0.003519,-0.003645,...,0.783275,0.227734,0.283648,0.238470,0.455811,0.503508,1.000000,0.665663,0.749374,0.765757
user_feat_avg_watch_ratio_7,-0.026670,0.267077,-0.007519,-0.008454,0.001281,0.003631,0.007802,-0.001951,0.002678,0.002475,...,0.805332,0.267517,0.247327,0.246538,0.455694,0.481330,0.665663,1.000000,0.782293,0.761863
user_feat_avg_watch_ratio_8,-0.025025,0.293697,-0.009262,-0.011508,0.002325,0.004744,0.008190,0.000916,0.004962,0.004537,...,0.890994,0.237635,0.340361,0.251788,0.536290,0.577496,0.749374,0.782293,1.000000,0.851230


In [13]:
# Keep only test rows where every remaining column is present.
df = df.dropna()

# Same input/target convention as for training: everything except the user id
# and the target is a model input.
target = 'watch_ratio'
exclude = ['user_id', target]
features = df.columns[~df.columns.isin(exclude)].tolist()

y_test = df[target].to_numpy()
# Reuse the scaler fitted on the training split; it is not refitted here.
X_test = scaler.transform(df[features].to_numpy())

In [14]:
# Predict on the scaled test matrix and collapse the output to one dimension.
y_pred = model.predict(X_test).reshape(-1)

mae = mean_absolute_error(y_test, y_pred)
print(f"Model MAE: {mae:.4f}")

# Ten randomly chosen (actual, predicted) pairs as a quick sanity check.
sample_df = pd.DataFrame(
    {"actual": y_test, "predicted": y_pred}
).sample(n=10, random_state=42)

print(sample_df)

[1m224/313[0m [32m━━━━━━━━━━━━━━[0m[37m━━━━━━[0m [1m0s[0m 450us/step

2025-05-13 01:38:59.312905: E tensorflow/core/framework/node_def_util.cc:680] NodeDef mentions attribute use_unbounded_threadpool which is not in the op definition: Op<name=MapDataset; signature=input_dataset:variant, other_arguments: -> handle:variant; attr=f:func; attr=Targuments:list(type),min=0; attr=output_types:list(type),min=1; attr=output_shapes:list(shape),min=1; attr=use_inter_op_parallelism:bool,default=true; attr=preserve_cardinality:bool,default=false; attr=force_synchronous:bool,default=false; attr=metadata:string,default=""> This may be expected if your graph generating binary is newer  than this binary. Unknown attributes will be ignored. NodeDef: {{node ParallelMapDatasetV2/_14}}


[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 565us/step
Model MAE: 0.2475
        actual  predicted
1617  0.620438   0.785538
8128  0.581945   0.819163
2168  0.806260   0.875116
1090  1.196972   0.980962
7754  0.601124   0.798312
8071  0.481056   0.756981
7423  0.915581   1.292495
8984  0.224603   0.300938
2310  1.949014   1.074757
7256  0.167678   0.201855
