# Import necessary libraries

In [7]:
# Data imports
import pandas as pd
import numpy as np

# AI imports
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Input, Lambda
import tensorflow.keras.backend as K

# Prepare environment

In [8]:
SEED = 42
# Seed Python's `random`, NumPy and TensorFlow in a single call.
# `tf.random.set_seed` on its own only covers TensorFlow's generators, which
# leaves the other sources of randomness unseeded between runs.
tf.keras.utils.set_random_seed(SEED)

# NOTE(review): TRAIN_TEST_SPLIT is not referenced by any cell visible here --
# confirm whether it is still needed.
TRAIN_TEST_SPLIT = 0.8
# Directory prefix (trailing slash included) that is string-concatenated with
# file names when loading the feature tables and saving the model.
FEATURES_PATH = "data/features/"

# Prepare Data

## Load Data

In [9]:
# Training table: read the parquet file and drop the user_id / video_id columns.
big_matrix_final = (
    pd.read_parquet(FEATURES_PATH + "big_matrix_final.parquet")
    .drop(columns=["user_id", "video_id"])
)

# Test table: same treatment as the training table.
small_matrix_final = (
    pd.read_parquet(FEATURES_PATH + "small_matrix_final.parquet")
    .drop(columns=["user_id", "video_id"])
)

# Display the training table as the cell output.
big_matrix_final

Unnamed: 0,watch_ratio,like,video_length,user_watch_ratio_mean,video_watch_ratio_mean,user_feat_watch_ratio_mean,user_category_watch_ratio_mean
0,1.273397,1,0,0.981752,0.784641,0.954082,1.029649
1,0.107613,-1,0,0.981752,0.992569,0.975717,1.022317
2,0.089885,-1,0,0.981752,0.977060,0.998407,1.029649
3,0.078000,-1,0,0.981752,0.781283,0.770139,0.804945
4,1.572295,1,0,0.981752,0.971532,0.987143,0.937071
...,...,...,...,...,...,...,...
9727433,1.004462,0,0,0.790256,0.840896,0.792753,0.821723
9727434,0.313389,-1,0,0.790256,0.706640,0.889533,0.865700
9727435,0.340597,-1,0,0.790256,1.074987,0.792753,0.821723
9727436,0.913400,0,0,0.790256,1.074987,0.792753,0.821723


## Split data into X_train / y_train / X_test / y_test

In [10]:
def split_X_y(df: pd.DataFrame):
    """Separate the regression target from the feature columns.

    Args:
        df: Frame containing at least the ``watch_ratio`` and ``like`` columns.

    Returns:
        A ``(X, y)`` tuple where ``y`` is the ``watch_ratio`` column and ``X``
        is a new frame holding every other column except ``like``, in their
        original order. ``df`` itself is left untouched.

    Raises:
        KeyError: If ``watch_ratio`` or ``like`` is missing from ``df``.
    """
    target = df["watch_ratio"]
    remaining = df.drop(columns=["watch_ratio", "like"])
    # Re-select every remaining column by name, keeping the existing order.
    features = remaining[remaining.columns.tolist()]
    return features, target

def split_all(df: pd.DataFrame, test_df=None):
    """Build the NumPy train/test arrays used by the model.

    Args:
        df: Training frame; it is split into features and target with
            ``split_X_y``. (Previously this argument was ignored and the
            global ``big_matrix_final`` was always used.)
        test_df: Optional test frame. When omitted, the module-level
            ``small_matrix_final`` is used, so existing single-argument
            calls keep working.

    Returns:
        A 5-tuple ``(X_train, y_train, X_test, y_test, test_watch_ratio_mean)``.
        The first four items are NumPy arrays; the last is the
        ``video_watch_ratio_mean`` column of the test frame, returned as-is.
    """
    train = df
    test = small_matrix_final if test_df is None else test_df
    X_train, y_train = split_X_y(train)
    X_test, y_test = split_X_y(test)
    # Kept separately so it can be compared against the test targets later.
    test_watch_ratio_mean = test["video_watch_ratio_mean"]
    return (
        np.array(X_train),
        np.array(y_train),
        np.array(X_test),
        np.array(y_test),
        test_watch_ratio_mean,
    )

In [11]:
# Build the NumPy train/test arrays, plus the test set's video_watch_ratio_mean
# column, which is compared against y_test further down.
X_train, y_train, X_test, y_test, test_watch_ratio_mean = split_all(big_matrix_final)

# AI

## Create the model

In [12]:
model = Sequential([
    # One input per feature column, so the network follows the feature matrix
    # instead of a hard-coded width.
    Input(shape=(X_train.shape[1],)),

    # Block 1
    Dense(64, activation='relu'),
    BatchNormalization(),
    Dropout(0.2),

    # Output layer: sigmoid in [0,1], scaled to [0,5]
    Dense(1, activation='sigmoid'),
    # Built-in, serializable multiplication by 5. A Lambda layer wrapping a
    # Python lambda is saved as bytecode, which Keras refuses to deserialize
    # in its default safe mode when the saved .keras file is loaded.
    tf.keras.layers.Rescaling(scale=5.0),
])

# Regression set-up: optimise mean squared error, report mean absolute error.
model.compile(
    optimizer='adam',
    loss='mean_squared_error',
    metrics=['mae']
)

## Train the model

In [13]:
# Early stopping that rolls the model back to its best weights.
# NOTE(review): patience (5) is larger than the number of epochs (2) used
# below, so the patience can never run out in this configuration.
early_stopping = tf.keras.callbacks.EarlyStopping(
    patience=5,
    restore_best_weights=True,
)

model.fit(
    x=X_train,
    y=y_train,
    batch_size=1024,
    epochs=2,
    validation_split=0.1,  # fraction of the training arrays held out for validation
    callbacks=[early_stopping],
    verbose=1,
)

Epoch 1/2
[1m8550/8550[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 2ms/step - loss: 0.5228 - mae: 0.4873 - val_loss: 0.3854 - val_mae: 0.4148
Epoch 2/2
[1m8550/8550[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 2ms/step - loss: 0.3840 - mae: 0.4160 - val_loss: 0.3848 - val_mae: 0.4171


<keras.src.callbacks.history.History at 0x7ce627ae6050>

## Predict video watch_ratio and compare it to the baseline

The baseline is the average watch_ratio of all videos.\
We can see that the MAE of my model is around 14% better than the baseline (about 0.361 vs. 0.423 in the run shown below).

In [14]:
# Constant baseline: predict the same mean watch_ratio for every test row.
# NOTE(review): the mean is computed from y_test itself, i.e. from the labels
# being evaluated -- confirm this is intended rather than the training mean.
baseline = np.full_like(y_test, np.mean(y_test), dtype=np.float32)
# verbose=0 is the documented way to silence the prediction progress bar.
preds = model.predict(X_test, batch_size=2048, verbose=0)
mae_baseline = np.mean(np.abs(baseline - y_test))
# The model returns one column per row; flatten it to line up with y_test.
mae_model = np.mean(np.abs(preds.flatten() - y_test))
print(f"Baseline MAE: {mae_baseline}")
print(f"Actual MAE: {mae_model}")

Baseline MAE: 0.4225401158417161
Actual MAE: 0.3613954467451218


## Compute another, better baseline

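Here is a better baseline: for each test row, it predicts the average watch ratio of that video as known at the time of the prediction (the `video_watch_ratio_mean` feature).\
In the run shown below, its MAE (about 0.325) is lower than both the previous baseline (about 0.423) and my model (about 0.361), so the model does not beat this baseline yet.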

In [15]:
# Absolute error per test row when the prediction is simply that row's
# video_watch_ratio_mean value; the mean of these errors is the cell output.
video_mean_abs_errors = np.abs(y_test.ravel() - test_watch_ratio_mean)
np.mean(video_mean_abs_errors)

np.float64(0.32525378616261336)

# Save the model

In [16]:
# Write the trained model to "ai_model.keras" under FEATURES_PATH.
model.save(FEATURES_PATH + "ai_model.keras")