# Import necessary libraries

In [1]:
# Data imports
import pandas as pd
import numpy as np

# AI imports
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Input, Lambda

from sklearn.preprocessing import RobustScaler

2025-05-17 22:08:38.102445: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Prepare environment

In [2]:
# Single seed shared by every random number generator used in this notebook.
SEED = 42
# Seeds Python's `random`, NumPy and TensorFlow in one call; `tf.random.set_seed`
# alone leaves the Python and NumPy generators unseeded.
tf.keras.utils.set_random_seed(SEED)

# NOTE(review): not referenced by any cell visible here — confirm it is still needed.
TRAIN_TEST_SPLIT = 0.8
# Directory prefix for the feature files; file names are appended with `+`,
# so the trailing slash is required.
FEATURES_PATH = "data/features/"

# Prepare Data

## Load Data

In [3]:
# The scaler is fitted on the big (training) matrix only and then re-applied
# unchanged to the small (test) matrix, so both share the same scale.
scaler = RobustScaler()

# Identifier columns and the prediction target are never scaled.
black_list = ["user_id", "video_id", "watch_ratio"]

big_matrix_final = pd.read_parquet(FEATURES_PATH + "big_matrix_final.parquet")
to_scale = big_matrix_final.columns[~big_matrix_final.columns.isin(black_list)].tolist()
big_matrix_final[to_scale] = scaler.fit_transform(big_matrix_final[to_scale])
big_matrix_final = big_matrix_final.drop(["user_id", "video_id"], axis="columns")

small_matrix_final = pd.read_parquet(FEATURES_PATH + "small_matrix_final.parquet")
small_matrix_final[to_scale] = scaler.transform(small_matrix_final[to_scale])
# Written to disk while the identifier columns are still present.
small_matrix_final.to_parquet(FEATURES_PATH + "small_matrix_scaled.parquet")
small_matrix_final = small_matrix_final.drop(["user_id", "video_id"], axis="columns")

# Last expression of the cell: rich display of the scaled training matrix.
big_matrix_final

Unnamed: 0,watch_ratio,video_duration,user_watch_ratio_mean,video_watch_ratio_mean,user_feat_watch_ratio_mean,user_category_watch_ratio_mean
0,1.273397,0.336768,0.508628,-0.196588,0.311543,0.432870
1,0.107613,-0.356206,0.508628,0.452844,0.390483,0.117374
2,1.434307,0.037705,0.508628,0.213114,0.438810,0.432870
3,1.296455,0.367916,0.508628,-0.221446,-0.291453,-0.128065
4,3.113806,-0.199766,0.508628,0.536528,0.407708,0.272438
...,...,...,...,...,...,...
8591372,0.174968,1.962529,-0.153850,-0.518401,0.103067,0.374840
8591373,1.004462,-0.161124,-0.153850,-0.243543,-0.158212,-0.055479
8591374,0.313389,1.036534,-0.153850,-0.576021,0.161666,0.007535
8591375,1.253997,-0.553162,-0.153850,0.413159,-0.158212,-0.055479


## Split data into X_train / y_train / X_test / y_test

In [4]:
def split_X_y(df: pd.DataFrame):
    y = df["watch_ratio"]
    tmp = df.drop(columns=["watch_ratio"])
    X = tmp[[c for c in tmp.columns]]
    return X, y

def split_all(df: pd.DataFrame, test_df: "pd.DataFrame | None" = None):
    """Build the NumPy train/test arrays used by the model.

    Args:
        df: Training frame; must contain a "watch_ratio" column.
        test_df: Test frame; must contain "watch_ratio" and
            "video_watch_ratio_mean" columns. When omitted, the notebook-level
            ``small_matrix_final`` frame is used, which keeps existing
            single-argument calls working.

    Returns:
        ``(X_train, y_train, X_test, y_test, test_watch_ratio_mean)``: the
        first four are NumPy arrays, the last one is the
        "video_watch_ratio_mean" column of the test frame as a Series.
    """
    # Use the frame that was actually passed in rather than silently reading
    # the `big_matrix_final` global.
    train = df
    test = small_matrix_final if test_df is None else test_df

    X_train, y_train = split_X_y(train)
    X_test, y_test = split_X_y(test)
    test_watch_ratio_mean = test["video_watch_ratio_mean"]
    return np.array(X_train), np.array(y_train), np.array(X_test), np.array(y_test), test_watch_ratio_mean

In [5]:
# Unpack the train/test arrays plus the test frame's "video_watch_ratio_mean" column.
(
    X_train,
    y_train,
    X_test,
    y_test,
    test_watch_ratio_mean,
) = split_all(big_matrix_final)

# AI

## Create the model

In [6]:
# The sigmoid output lies in (0, 1) and is multiplied by this factor, so the
# model can only predict values in (0, MAX_WATCH_RATIO).
MAX_WATCH_RATIO = 5.0

model = Sequential([
    # One input per feature column, taken from the data instead of hard-coded.
    Input(shape=(X_train.shape[1],)),

    # Block 1
    Dense(64, activation='relu'),
    BatchNormalization(),
    Dropout(0.2),

    Dense(1, activation='sigmoid'),
    # Scale the output to (0, MAX_WATCH_RATIO) — presumably to match the
    # expected watch_ratio range; TODO confirm against the target distribution.
    # The built-in Rescaling layer replaces a Lambda wrapping a Python lambda:
    # Lambda layers are serialized as Python bytecode, which is not portable
    # and hinders reloading the saved `.keras` file.
    tf.keras.layers.Rescaling(MAX_WATCH_RATIO),
])

# Mean squared error is optimized; mean absolute error is tracked because it is
# the metric compared against the baseline later in the notebook.
model.compile(
    optimizer='adam',
    loss='mse',
    metrics=['mae']
)

## Train the model

In [7]:
EPOCHS = 5
# Must stay below EPOCHS: with patience equal to the number of epochs the
# callback could never stop training early.
EARLY_STOPPING_PATIENCE = 2

# NOTE(review): Keras takes the validation split from the LAST 10% of the rows,
# before shuffling — confirm that the row order of the training data makes
# this a representative validation set.
# The History object is kept so the training curves can be inspected later
# (and the cell no longer echoes its repr).
history = model.fit(
    X_train,
    y_train,
    validation_split=0.1,
    epochs=EPOCHS,
    batch_size=1024,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(
            monitor="val_loss",
            patience=EARLY_STOPPING_PATIENCE,
            restore_best_weights=True,
        )
    ],
    verbose=1
)

Epoch 1/5
[1m7552/7552[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 2ms/step - loss: 0.6121 - mae: 0.5339 - val_loss: 0.4568 - val_mae: 0.4588
Epoch 2/5
[1m7552/7552[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 2ms/step - loss: 0.4551 - mae: 0.4609 - val_loss: 0.4563 - val_mae: 0.4586
Epoch 3/5
[1m7552/7552[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 2ms/step - loss: 0.4538 - mae: 0.4597 - val_loss: 0.4561 - val_mae: 0.4586
Epoch 4/5
[1m7552/7552[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 2ms/step - loss: 0.4535 - mae: 0.4594 - val_loss: 0.4561 - val_mae: 0.4590
Epoch 5/5
[1m7552/7552[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 2ms/step - loss: 0.4534 - mae: 0.4594 - val_loss: 0.4560 - val_mae: 0.4590


<keras.src.callbacks.history.History at 0x74b04458b490>

## Predict video watch_ratio and compare it to the baseline

The baseline predicts the mean watch_ratio of the test set for every sample.\
We can see that the MAE of my model is around 11.5% lower than the baseline (0.372 vs. 0.420).

In [8]:
# Constant baseline: predict the mean watch_ratio of the test set for every sample.
baseline = np.full_like(y_test, np.mean(y_test), dtype=np.float32)
# verbose=0 is the documented value for silencing the progress bar.
preds = model.predict(X_test, batch_size=2048, verbose=0)
mae_baseline = np.mean(np.abs(baseline - y_test))
# The final Dense(1) layer yields one column per sample; flatten it so the
# subtraction is element-wise instead of broadcasting to an (n, n) matrix.
mae_model = np.mean(np.abs(preds.flatten() - y_test))
print(f"Baseline MAE: {mae_baseline}")
print(f"Actual MAE: {mae_model}")
# Printed so the improvement quoted in the prose can be checked against the run.
print(f"Relative improvement over baseline: {1 - mae_model / mae_baseline:.1%}")

Baseline MAE: 0.4202214045614679
Actual MAE: 0.37182253271619287


# Save the model

In [9]:
# Store the trained model next to the feature files, in the native Keras format.
model_path = FEATURES_PATH + "ai_model.keras"
model.save(model_path)