# Import necessary libraries

In [1]:
# Data imports
import pandas as pd
import numpy as np

# AI imports
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Input, Lambda
import tensorflow.keras.backend as K

2025-05-14 15:55:09.837282: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Prepare environment

In [2]:
# Single seed shared by every random number generator used in this notebook.
SEED = 42
# `set_random_seed` seeds Python's `random`, NumPy and TensorFlow in one call.
# `tf.random.set_seed` alone leaves the Python and NumPy generators unseeded,
# so runs would not be repeatable.
tf.keras.utils.set_random_seed(SEED)

# NOTE(review): not referenced by any of the cells visible here — confirm it is still needed.
TRAIN_TEST_SPLIT = 0.8
# Directory prefix (trailing slash included) that file names are appended to.
FEATURES_PATH = "data/features/"

# Prepare Data

## Load Data

In [3]:
# Load both feature matrices and immediately discard the timestamp and
# identifier columns, leaving the `watch_ratio` target and its features.
big_matrix_final = (
    pd.read_parquet(FEATURES_PATH + "big_matrix_final.parquet")
    .drop(columns=["time", "user_id", "video_id"])
)

small_matrix_final = (
    pd.read_parquet(FEATURES_PATH + "small_matrix_final.parquet")
    .drop(columns=["time", "user_id", "video_id"])
)

# Last expression of the cell: renders the training frame.
big_matrix_final

Unnamed: 0,watch_ratio,watch_ratio_mean,watch_ratio_prior_mean
0,2.555187,1.109535,0.000000
1,0.595189,0.938616,0.000000
2,1.937264,0.938616,0.000000
3,0.606315,0.938616,0.000000
4,0.796302,0.938616,0.000000
...,...,...,...
9729356,1.183844,0.800965,0.986430
9729357,0.197555,0.800965,0.789548
9729358,1.313025,0.800965,1.144480
9729359,0.276921,0.800965,0.908206


## Split data into X_train / y_train / X_test / y_test

In [4]:
def split_X_y(df: pd.DataFrame):
    y = df["watch_ratio"]
    tmp = df.drop(columns=["watch_ratio"])
    X = tmp[[c for c in tmp.columns]]
    return X, y

def split_all(df: pd.DataFrame, test_df: "pd.DataFrame | None" = None):
    """Build the train/test arrays used by the model.

    Args:
        df: Training frame; must contain `watch_ratio` (the target) plus the
            feature columns.
        test_df: Test frame with the same columns. When omitted, the
            notebook-level `small_matrix_final` frame is used, which keeps the
            existing `split_all(big_matrix_final)` call working unchanged.

    Returns:
        `(X_train, y_train, X_test, y_test, test_watch_ratio_mean)`. The first
        four are NumPy arrays; the last is the `watch_ratio_mean` column of the
        test frame, returned as a pandas Series.

    Raises:
        KeyError: if a frame lacks `watch_ratio`, or the test frame lacks
            `watch_ratio_mean`.
    """
    # The training frame now comes from the argument instead of being read
    # from a global, so the function honours what the caller passes in.
    if test_df is None:
        test_df = small_matrix_final
    X_train, y_train = split_X_y(df)
    X_test, y_test = split_X_y(test_df)
    test_watch_ratio_mean = test_df["watch_ratio_mean"]
    return np.array(X_train), np.array(y_train), np.array(X_test), np.array(y_test), test_watch_ratio_mean

In [5]:
# Build the train/test arrays, plus the test rows' `watch_ratio_mean` column,
# in a single call.
(
    X_train,
    y_train,
    X_test,
    y_test,
    test_watch_ratio_mean,
) = split_all(big_matrix_final)

# AI

## Create the model

In [6]:
model = Sequential([
    # Input width follows the training matrix instead of a hard-coded 2, so
    # adding or removing a feature column does not require editing the model.
    Input(shape=(X_train.shape[1],)),

    # Block 1
    Dense(64, activation='relu'),
    BatchNormalization(),
    Dropout(0.2),

    # Output layer: sigmoid in [0,1], scaled to [0,5]
    Dense(1, activation='sigmoid'),
    # Built-in rescaling layer (output = input * 5.0). Unlike a `Lambda`
    # wrapping a Python lambda, it is saved as plain config, so the saved
    # model can be reloaded without enabling unsafe deserialization.
    tf.keras.layers.Rescaling(5.0)
])

# Regression setup: optimise mean squared error, report mean absolute error.
model.compile(
    optimizer='adam',
    loss='mean_squared_error',
    metrics=['mae']
)

## Train the model

In [7]:
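# Disabled experiment: give samples with watch_ratio > 3 a weight of 11 instead of 1.
# To try it, uncomment the line below and the `sample_weight=` argument of `model.fit`.
#sample_weight = 1 + (y_train > 3) * 10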

In [8]:
# Keep the returned History object so the per-epoch metrics stay available
# after training, instead of leaving only its repr as the cell output.
history = model.fit(
    X_train,
    y_train,
    validation_split=0.1,
    epochs=10,
    batch_size=1024,
    # Disabled weighting experiments, kept for reference:
    #sample_weight=sample_weight,
    #class_weight=class_weights_dict,
    # Stop after 5 epochs without improvement and roll back to the best weights.
    callbacks=[tf.keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True)],
    verbose=1
)

Epoch 1/10
[1m8552/8552[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 2ms/step - loss: 0.5020 - mae: 0.4822 - val_loss: 0.3929 - val_mae: 0.4175
Epoch 2/10
[1m8552/8552[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 2ms/step - loss: 0.3992 - mae: 0.4250 - val_loss: 0.3925 - val_mae: 0.4177
Epoch 3/10
[1m8552/8552[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 2ms/step - loss: 0.3982 - mae: 0.4240 - val_loss: 0.3928 - val_mae: 0.4159
Epoch 4/10
[1m8552/8552[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 2ms/step - loss: 0.3979 - mae: 0.4238 - val_loss: 0.3923 - val_mae: 0.4167
Epoch 5/10
[1m8552/8552[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 2ms/step - loss: 0.3979 - mae: 0.4236 - val_loss: 0.3926 - val_mae: 0.4172
Epoch 6/10
[1m8552/8552[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 2ms/step - loss: 0.3979 - mae: 0.4237 - val_loss: 0.3927 - val_mae: 0.4168
Epoch 7/10
[1m8552/8552[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [

<keras.src.callbacks.history.History at 0x7de4e56ba790>

## Predict video watch_ratio and compare it to the baseline

In [9]:
# Constant baseline: always predict the mean `watch_ratio` of the test labels.
baseline = np.full_like(y_test, np.mean(y_test), dtype=np.float32)
# `verbose=0` is the documented way to silence the progress bar; `None` is not
# one of the documented values for this argument.
preds = model.predict(X_test, batch_size=2048, verbose=0)
mae_baseline = np.mean(np.abs(baseline - y_test))
# Flatten the predictions so they line up element-wise with `y_test`
# (assumes `y_test` is 1-D, as produced from a single column).
mae_model = np.mean(np.abs(preds.flatten() - y_test))
print(f"Baseline MAE: {mae_baseline}")
print(f"Actual MAE: {mae_model}")

Baseline MAE: 0.42254011584171625
Actual MAE: 0.3124082473289498


## Compute another, better baseline

In [10]:
# Stronger baseline: use each test row's `watch_ratio_mean` value as its
# prediction and report the resulting mean absolute error.
mean_feature_abs_errors = np.abs(y_test.flatten() - test_watch_ratio_mean)
np.mean(mean_feature_abs_errors)

0.4085236172713595

# Save the model

In [11]:
# Persist the trained model as a `.keras` file inside the features directory.
model.save(f"{FEATURES_PATH}ai_model.keras")