# Import necessary libraries

In [1]:
# Data imports
import pandas as pd
import numpy as np

# AI imports
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Input, Lambda
import tensorflow.keras.backend as K

SEED = 42

tf.random.set_seed(SEED)

2025-05-13 01:15:12.743396: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Prepare Data

## Load Data

In [2]:
data = pd.read_parquet("data/data.parquet")
data

Unnamed: 0,watch_ratio,video_length,time,watch_ratio_mean,video_length_mean,hour,user_active_degree,is_lowactive_period,is_live_streamer,is_video_author,...,onehot_feat8,onehot_feat9,onehot_feat10,onehot_feat11,onehot_feat12,onehot_feat13,onehot_feat14,onehot_feat15,onehot_feat16,onehot_feat17
0,0.722103,-1,2020-07-05 05:27:48.378,0.939212,-0.056158,9.575698,0,0,0,1,...,297,4,2,0,1.0,1.0,1.0,0.0,0.0,0.0
1,1.907377,-1,2020-07-05 05:28:00.057,0.939212,-0.056158,9.575698,0,0,0,1,...,297,4,2,0,1.0,1.0,1.0,0.0,0.0,0.0
2,2.063311,0,2020-07-05 05:29:09.479,0.939212,-0.056158,9.575698,0,0,0,1,...,297,4,2,0,1.0,1.0,1.0,0.0,0.0,0.0
3,0.566388,0,2020-07-05 05:30:43.285,0.939212,-0.056158,9.575698,0,0,0,1,...,297,4,2,0,1.0,1.0,1.0,0.0,0.0,0.0
4,0.418364,0,2020-07-05 05:35:43.459,0.939212,-0.056158,9.575698,0,0,0,1,...,297,4,2,0,1.0,1.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4083324,0.142857,1,2020-09-01 20:06:35.984,1.087398,-0.067249,7.822461,0,0,0,0,...,272,4,2,0,0.0,1.0,0.0,1.0,0.0,1.0
4083325,1.234848,0,2020-09-02 14:44:51.342,1.087398,-0.067249,7.822461,0,0,0,0,...,272,4,2,0,0.0,1.0,0.0,1.0,0.0,1.0
4083326,1.024412,1,2020-09-03 08:45:01.474,1.087398,-0.067249,7.822461,0,0,0,0,...,272,4,2,0,0.0,1.0,0.0,1.0,0.0,1.0
4083327,0.273750,0,2020-09-04 22:56:32.021,1.087398,-0.067249,7.822461,0,0,0,0,...,272,4,2,0,0.0,1.0,0.0,1.0,0.0,1.0


## Split data into X_train / y_train / X_test / y_test

In [3]:
def split_X_y(df: pd.DataFrame):
    y = df["watch_ratio"]
    tmp = df.drop(columns=["watch_ratio"])
    X = tmp[[c for c in tmp.columns]]
    return X, y

def split_on_time(df: pd.DataFrame, split_ratio: int):
    split_value = (int)(df.shape[0] * split_ratio)
    sorted = df.sort_values("time")
    train = sorted.head(split_value)
    test = sorted.tail(df.shape[0] - split_value)
    return train, test

def split_all(df: pd.DataFrame):
    train, test = split_on_time(df, 0.8)
    train = train.drop(columns=["time"])
    test = test.drop(columns=["time"])
    X_train, y_train = split_X_y(train)
    X_test, y_test = split_X_y(test)
    test_watch_ratio_mean = test["watch_ratio_mean"]
    return np.array(X_train), np.array(y_train), np.array(X_test), np.array(y_test), test_watch_ratio_mean

In [4]:
X_train, y_train, X_test, y_test, test_watch_ratio_mean = split_all(data)

# AI

## Create the model

In [None]:
model = Sequential([
    Input(shape=(34,)),

    # Block 1
    #Dense(128, activation='relu'),
    #BatchNormalization(),
    #Dropout(0.3),

    # Block 2
    Dense(64, activation='relu'),
    BatchNormalization(),
    Dropout(0.2),

    # Output layer: sigmoid in [0,1], scaled to [0,5]
    Dense(1, activation='sigmoid'),
    Lambda(lambda x: x * 5.0)
])

model.compile(
    optimizer='adam',
    loss='mean_squared_error',
    metrics=['mae']
)

## Train the model

In [6]:
model.fit(
    X_train,
    y_train,
    validation_split=0.1,
    epochs=15,
    batch_size=512,
    #class_weight=class_weights_dict,
    callbacks=[tf.keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True)],
    verbose=1
)

Epoch 1/15
[1m5743/5743[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 1ms/step - loss: 0.5480 - mae: 0.4975 - val_loss: 0.2561 - val_mae: 0.3651
Epoch 2/15
[1m5743/5743[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 1ms/step - loss: 0.2669 - mae: 0.3550 - val_loss: 0.2330 - val_mae: 0.3264
Epoch 3/15
[1m5743/5743[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 1ms/step - loss: 0.2567 - mae: 0.3423 - val_loss: 0.2309 - val_mae: 0.3219
Epoch 4/15
[1m5743/5743[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 1ms/step - loss: 0.2539 - mae: 0.3401 - val_loss: 0.2301 - val_mae: 0.3193
Epoch 5/15
[1m5743/5743[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 1ms/step - loss: 0.2530 - mae: 0.3386 - val_loss: 0.2364 - val_mae: 0.3341
Epoch 6/15
[1m5743/5743[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 1ms/step - loss: 0.2527 - mae: 0.3380 - val_loss: 0.2345 - val_mae: 0.3127
Epoch 7/15
[1m5743/5743[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[

<keras.src.callbacks.history.History at 0x7b5e4072b050>

## Predict video watch_ratio and compare it to the baseline

In [7]:
baseline = np.full_like(y_test, np.mean(y_test), dtype=np.float32)
preds = model.predict(X_test, batch_size=2048, verbose=None)
mae_baseline = np.mean(np.abs(baseline - y_test))
mae_model = np.mean(np.abs(preds.flatten() - y_test))
print(mae_baseline)
print(mae_model)

0.44011308387270526
0.33044866277149026


In [5]:
np.mean(np.abs(y_test.flatten() - test_watch_ratio_mean))

np.float64(0.4275432114042026)