## Imports

In [1]:
import pandas as pd
import os
from sklearn.preprocessing import RobustScaler
import joblib
import numpy as np

## Loading the data

In [2]:
export_dir = "./exports/feature_engineered_data/"
big_matrix_cleaned = pd.read_parquet(export_dir + "../cleaned_data/big_matrix_cleaned.pq")
user_df = pd.read_parquet(export_dir + "/user_df.pq")
video_df = pd.read_parquet(export_dir + "/video_df.pq")

## Introduction

We will now proceed to create our training dataset. For that, we will load our cleaned interaction matrix (big_matrix) and, for each interaction, we will take the corresponding previously engineered user and video vectors. We will also save the "watch_ratio" of the interaction which will be our "y" value to predict.

## Train dataset creation

### Step 1: Merging everything on our interaction matrix (big matrix)

In [3]:
big_matrix_cleaned = big_matrix_cleaned.drop(columns=["video_duration"])
big_matrix_cleaned['date'] = pd.to_datetime(big_matrix_cleaned['time']).dt.date
big_matrix_cleaned = big_matrix_cleaned.drop(columns=["time"])
big_matrix_cleaned = big_matrix_cleaned.merge(user_df, on='user_id', how='left')
video_df['date'] = pd.to_datetime(video_df['date']).dt.date
big_matrix_cleaned = big_matrix_cleaned.merge(video_df, on=['video_id', 'date'], how='left')
big_matrix_cleaned

Unnamed: 0,user_id,video_id,watch_ratio,date,avg_feat_0,avg_feat_1,avg_feat_2,avg_feat_3,avg_feat_4,avg_feat_5,...,category_30,category_31,category_32,category_33,category_34,category_35,category_36,category_37,category_38,category_39
0,0,3649,1.273397,2020-07-05,1.302471,1.088432,0.914949,0.902536,0.768437,1.087083,...,0,0,0,0,0,0,0,0,0,0
1,0,5262,0.107613,2020-07-05,1.302471,1.088432,0.914949,0.902536,0.768437,1.087083,...,0,0,0,0,0,0,0,0,0,0
2,0,1963,1.434307,2020-07-05,1.302471,1.088432,0.914949,0.902536,0.768437,1.087083,...,0,0,0,0,0,0,0,0,0,0
3,0,8234,1.296455,2020-07-05,1.302471,1.088432,0.914949,0.902536,0.768437,1.087083,...,0,0,0,0,0,0,0,0,0,0
4,0,8228,3.113806,2020-07-05,1.302471,1.088432,0.914949,0.902536,0.768437,1.087083,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9486371,7175,6597,1.004462,2020-09-05,0.734236,0.744769,1.170560,0.000000,0.753070,0.953051,...,0,0,0,0,0,0,0,0,0,0
9486372,7175,6630,0.313389,2020-09-05,0.734236,0.744769,1.170560,0.000000,0.753070,0.953051,...,0,0,0,0,0,0,0,0,0,0
9486373,7175,10360,1.253997,2020-09-06,0.734236,0.744769,1.170560,0.000000,0.753070,0.953051,...,0,0,0,0,0,0,0,0,0,0
9486374,7175,10360,1.253997,2020-09-06,0.734236,0.744769,1.170560,0.000000,0.753070,0.953051,...,0,0,0,0,0,0,0,0,0,0


### Step 2: Extracting and scaling our features

For each row (interaction), we will be taking out:
- A vector containing our engineered user features.
- A vector containing our engineered video features.
- The watch ratio value

The goal being the creation of a dataset to train a neural network, we have to scale our features. For that, we will be using scikit-learn's RobustScaler. It is a well-known scaler which is robust to outliers.

Also, we will scale independently our user and video features, so we create 2 different scalers for that.

In [4]:
user_feature_cols = [col for col in user_df.columns if col not in ('user_id')]
video_feature_cols = [col for col in video_df.columns if col not in ('video_id', 'date')]

def extract_the_training_features() -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    X_user_train = big_matrix_cleaned[user_feature_cols].copy()
    X_video_train = big_matrix_cleaned[video_feature_cols].copy()
    y_train = pd.DataFrame(big_matrix_cleaned['watch_ratio'])

    return X_user_train, X_video_train, y_train

X_user_train, X_video_train, y_train = extract_the_training_features()
user_scaler, video_scaler = RobustScaler(), RobustScaler()

X_user_train[user_feature_cols] = user_scaler.fit_transform(X_user_train[user_feature_cols])
X_video_train[["video_duration", "trend_score"]] = video_scaler.fit_transform(X_video_train[["video_duration", "trend_score"]])

## Saving the data

For memory concerns, we save the data as numpy arrays.

We also save our two scalers alongside the data in order to use them to scale the test data. For consistency, it is better if we use the same pre-trained scalers.

In [6]:
export_dir = "./exports/train_data/"
if not os.path.exists(export_dir):
    os.makedirs(export_dir)

np.save(export_dir + "X_user_train.npy", X_user_train.to_numpy())
np.save(export_dir + "X_video_train.npy", X_video_train.to_numpy())
np.save(export_dir + "y_train.npy", y_train.to_numpy())

export_dir = "./exports/scalers/"
if not os.path.exists(export_dir):
    os.makedirs(export_dir)

joblib.dump(user_scaler, export_dir + "user_scaler.pkl")
joblib.dump(video_scaler, export_dir + "video_scaler.pkl")

['./exports/scalers/video_scaler.pkl']