# Import necessary libraries

In [1]:
# Data imports
import pandas as pd
import numpy as np

# AI imports
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Input, Lambda
import tensorflow.keras.backend as K

SEED = 42
TRAIN_TEST_SPLIT = 0.8

tf.random.set_seed(SEED)

2025-05-13 15:50:26.510414: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Prepare Data

## Load Data

In [2]:
data = pd.read_parquet("data/data.parquet")
data

Unnamed: 0,time,watch_ratio,watch_ratio_mean,watch_ratio_prior_mean
0,2020-07-04 02:23:26.060,1.981442,1.037910,0.000000
1,2020-07-04 04:03:18.888,0.964642,1.037910,0.000000
2,2020-07-04 04:03:53.725,0.112044,1.037910,0.000000
3,2020-07-04 04:35:24.528,0.700000,1.037910,0.000000
4,2020-07-04 06:32:23.949,0.906852,1.037910,0.000000
...,...,...,...,...
3857008,2020-09-05 23:52:40.419,0.171522,0.937582,0.273859
3857009,2020-09-05 23:52:56.230,0.612817,0.759180,0.892070
3857010,2020-09-05 23:53:50.831,0.451241,0.724104,0.826597
3857011,2020-09-05 23:57:15.282,0.871151,0.711956,0.989261


## Split data into X_train / y_train / X_test / y_test

In [3]:
def split_X_y(df: pd.DataFrame):
    y = df["watch_ratio"]
    tmp = df.drop(columns=["watch_ratio"])
    X = tmp[[c for c in tmp.columns]]
    return X, y

def split_on_time(df: pd.DataFrame, split_ratio: int):
    split_value = (int)(df.shape[0] * split_ratio)
    sorted = df.sort_values("time")
    train = sorted.head(split_value)
    test = sorted.tail(df.shape[0] - split_value)
    return train, test

def split_all(df: pd.DataFrame):
    train, test = split_on_time(df, TRAIN_TEST_SPLIT)
    train = train.drop(columns=["time"])
    test = test.drop(columns=["time"])
    X_train, y_train = split_X_y(train)
    X_test, y_test = split_X_y(test)
    test_watch_ratio_mean = test["watch_ratio_mean"]
    return np.array(X_train), np.array(y_train), np.array(X_test), np.array(y_test), test_watch_ratio_mean

In [4]:
X_train, y_train, X_test, y_test, test_watch_ratio_mean = split_all(data)

# AI

## Create the model

In [5]:
model = Sequential([
    Input(shape=(2,)),

    # Block 1
    #Dense(128, activation='relu'),
    #BatchNormalization(),
    #Dropout(0.3),

    # Block 2
    Dense(64, activation='relu'),
    BatchNormalization(),
    Dropout(0.2),

    # Output layer: sigmoid in [0,1], scaled to [0,5]
    Dense(1, activation='sigmoid'),
    Lambda(lambda x: x * 5.0)
])

model.compile(
    optimizer='adam',
    loss='mean_squared_error',
    metrics=['mae']
)

## Train the model

In [13]:
model.fit(
    X_train,
    y_train,
    validation_split=0.1,
    epochs=10,
    batch_size=4096,
    #class_weight=class_weights_dict,
    callbacks=[tf.keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True)],
    verbose=1
)

Epoch 1/10
[1m678/678[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 0.2343 - mae: 0.3123 - val_loss: 0.2153 - val_mae: 0.2915
Epoch 2/10
[1m678/678[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 0.2342 - mae: 0.3121 - val_loss: 0.2153 - val_mae: 0.2910
Epoch 3/10
[1m678/678[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 0.2342 - mae: 0.3122 - val_loss: 0.2153 - val_mae: 0.2913
Epoch 4/10
[1m678/678[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 0.2343 - mae: 0.3122 - val_loss: 0.2153 - val_mae: 0.2910
Epoch 5/10
[1m678/678[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 0.2342 - mae: 0.3122 - val_loss: 0.2153 - val_mae: 0.2912
Epoch 6/10
[1m206/678[0m [32m━━━━━━[0m[37m━━━━━━━━━━━━━━[0m [1m1s[0m 3ms/step - loss: 0.2339 - mae: 0.3123

KeyboardInterrupt: 

## Predict video watch_ratio and compare it to the baseline

In [14]:
baseline = np.full_like(y_test, np.mean(y_test), dtype=np.float32)
preds = model.predict(X_test, batch_size=2048, verbose=None)
mae_baseline = np.mean(np.abs(baseline - y_test))
mae_model = np.mean(np.abs(preds.flatten() - y_test))
print(f"Baseline MAE: {mae_baseline}")
print(f"Actual MAE: {mae_model}")

Baseline MAE: 0.43516791225066387
Actual MAE: 0.3081290816521422


In [8]:
np.mean(np.abs(y_test.flatten() - test_watch_ratio_mean))

np.float64(0.4275432114042026)