## Introduction

Since we will implement content-based filtering using a two-tower neural network to build our recommender system, we must create user and video vectors for each interaction in our dataset.

We will only use the interactions from big_matrix to do so, because these represent our train data.

## Imports

In [1]:
import pandas as pd
import os
import numpy as np
import warnings
warnings.filterwarnings('ignore')

## Loading the data

In [2]:
# Directory holding the parquet files exported by the cleaning step.
export_dir = "./exports/cleaned_data/"

# Interactions (training data), per-video feat lists, per-video daily
# statistics and per-video caption categories.
big_matrix_cleaned = pd.read_parquet(f"{export_dir}big_matrix_cleaned.pq")
item_categories_cleaned = pd.read_parquet(f"{export_dir}item_categories_cleaned.pq")
item_daily_features_cleaned = pd.read_parquet(f"{export_dir}item_daily_features_cleaned.pq")
caption_category_cleaned = pd.read_parquet(f"{export_dir}caption_category_cleaned.pq")

## Engineering the user vectors

Here is what we want to have in our user vectors to represent each of our users:
- For each video feat, the user's average watch ratio over the videos that carry this feat.
- Likewise for each first-level caption category: the user's average watch ratio over the videos that belong to this category.

This will enable us to create a vector representation of our users.

### Step 1: Average watch ratio per video feat

We will explode our multi-labelled `feat` column (originally a list of feat ids for each video) and compute, for each of our users, the average watch ratio for each of these video feats.

In [3]:
def get_user_avg_feat_df(df: pd.DataFrame) -> pd.DataFrame:
    """Return one row per user with the mean watch ratio for every video feat.

    Reads the notebook-level ``item_categories_cleaned`` frame to map each
    ``video_id`` to its list of feat ids.

    Args:
        df: Interactions; the columns ``user_id``, ``video_id`` and
            ``watch_ratio`` are read.

    Returns:
        A frame sorted by ``user_id`` with one ``avg_feat_<id>`` column per
        feat id. A user with no interaction on a given feat gets 0 for it.
    """
    interactions = df[["user_id", "video_id", "watch_ratio"]].merge(
        item_categories_cleaned, on="video_id", how="left"
    )

    # One row per (interaction, feat). A video without any feat comes out of
    # the left merge / explode as NaN; drop those rows, otherwise the NaN
    # ends up as a column label and the int() cast below raises.
    per_feat = interactions.explode(column="feat").dropna(subset=["feat"])

    # Mean watch ratio per (user, feat), pivoted to one column per feat.
    avg = per_feat.groupby(["user_id", "feat"])["watch_ratio"].mean().unstack("feat")
    avg = avg.sort_index(axis=1)

    # Keep every user of the input, even one whose videos all lack a feat:
    # such a user gets an all-zero vector instead of silently disappearing.
    all_users = pd.Index(df["user_id"].unique(), name="user_id").sort_values()
    avg = avg.reindex(all_users).fillna(0)

    avg.columns = [f'avg_feat_{int(col)}' for col in avg.columns]
    return avg.reset_index()

# Build the per-user feat averages from the big_matrix interactions; the bare
# name on the last line displays the resulting frame in the notebook.
user_avg_feat_df = get_user_avg_feat_df(big_matrix_cleaned)
user_avg_feat_df

Unnamed: 0,user_id,avg_feat_0,avg_feat_1,avg_feat_2,avg_feat_3,avg_feat_4,avg_feat_5,avg_feat_6,avg_feat_7,avg_feat_8,...,avg_feat_21,avg_feat_22,avg_feat_23,avg_feat_24,avg_feat_25,avg_feat_26,avg_feat_27,avg_feat_28,avg_feat_29,avg_feat_30
0,0,1.302471,1.088432,0.914949,0.902536,0.768437,1.087083,0.953513,1.222933,1.256774,...,1.092595,0.229430,1.172116,2.434744,1.211370,1.286180,1.231113,1.201737,1.857724,0.485446
1,1,0.365590,0.938559,0.321120,0.183951,1.644048,1.002701,0.970365,1.167777,1.055518,...,1.210433,0.000000,0.102452,0.000000,0.854879,1.271705,2.134150,1.177848,1.434024,1.488989
2,2,0.376160,0.756911,0.481671,0.558891,0.910193,0.727867,0.547837,0.721542,0.704722,...,1.013190,0.000000,0.771147,0.000000,0.633928,0.737108,0.210522,0.674988,0.176768,0.079384
3,3,1.693144,1.157402,0.953552,0.796545,1.253424,1.325727,0.972882,1.212054,1.254596,...,0.897935,0.166655,0.529580,0.000000,1.088320,0.985220,0.623670,1.164498,1.561361,0.919145
4,4,0.000000,0.476605,1.010854,0.000000,0.455023,0.378779,0.492796,0.924956,0.746931,...,0.800958,0.000000,0.000000,0.000000,1.096496,0.988842,0.000000,1.038249,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6894,7171,0.603449,0.938208,0.788680,0.526626,0.769178,1.139644,0.737299,0.968608,0.787917,...,0.279137,0.019471,0.416539,0.000000,0.953211,0.872436,0.846472,0.985410,0.181914,0.681021
6895,7172,1.166396,1.027942,0.966537,0.548050,0.748462,1.159014,1.037559,1.162223,1.401072,...,0.989900,0.058764,1.305065,0.000000,1.053001,1.237537,2.076483,1.282114,0.000000,0.522903
6896,7173,0.293560,0.304803,1.743508,0.000000,0.389291,0.484452,0.669303,0.792887,0.609157,...,0.000000,0.000000,0.441495,0.000000,0.885664,0.771188,0.000000,0.889938,0.000000,0.585139
6897,7174,1.071403,0.710077,0.555549,1.249871,0.932450,0.869710,0.618042,0.873835,0.893528,...,0.934342,0.083650,3.462550,0.000000,0.878827,0.826436,0.993900,0.976525,0.831416,0.772563


### Step 2: Average watch ratio per video caption category

We will follow the same process as in Step 1 to compute the user's average watch ratio for each first-level caption category a video might have.

In [4]:
def get_user_avg_category_df(df: pd.DataFrame) -> pd.DataFrame:
    """Return one row per user with the mean watch ratio per caption category.

    Reads the notebook-level ``caption_category_cleaned`` frame to map each
    ``video_id`` to its ``first_level_category_id``.

    Args:
        df: Interactions; the columns ``user_id``, ``video_id`` and
            ``watch_ratio`` are read.

    Returns:
        A frame sorted by ``user_id`` with one ``avg_category_<id>`` column
        per first-level category id. A user with no interaction on a given
        category gets 0 for it.
    """
    # Only the three columns needed are merged (same as the feat variant),
    # which avoids copying the whole interaction matrix.
    interactions = df[["user_id", "video_id", "watch_ratio"]].merge(
        caption_category_cleaned, on="video_id", how="left"
    )

    # explode() is kept so list-valued category columns are still supported;
    # it leaves scalar values untouched. Videos without a category are NaN
    # after the left merge; drop them, otherwise the NaN becomes a column
    # label and the int() cast below raises.
    per_category = (
        interactions
        .explode(column="first_level_category_id")
        .dropna(subset=["first_level_category_id"])
    )

    # Mean watch ratio per (user, category), pivoted to one column per category.
    avg = (
        per_category
        .groupby(["user_id", "first_level_category_id"])["watch_ratio"]
        .mean()
        .unstack("first_level_category_id")
    )
    avg = avg.sort_index(axis=1)

    # Keep every user of the input, even one whose videos all lack a category.
    all_users = pd.Index(df["user_id"].unique(), name="user_id").sort_values()
    avg = avg.reindex(all_users).fillna(0)

    avg.columns = [f'avg_category_{int(col)}' for col in avg.columns]
    return avg.reset_index()

# Build the per-user caption-category averages from the big_matrix
# interactions; the bare name on the last line displays the frame.
user_avg_category_df = get_user_avg_category_df(big_matrix_cleaned)
user_avg_category_df

Unnamed: 0,user_id,avg_category_1,avg_category_2,avg_category_3,avg_category_4,avg_category_5,avg_category_6,avg_category_7,avg_category_8,avg_category_9,...,avg_category_30,avg_category_31,avg_category_32,avg_category_33,avg_category_34,avg_category_35,avg_category_36,avg_category_37,avg_category_38,avg_category_39
0,0,1.192098,1.027106,0.949826,0.928717,1.036108,1.023411,1.259680,1.307876,1.115586,...,1.410021,1.202423,0.889483,1.284525,1.180908,1.023252,0.876753,0.836855,0.586343,1.098977
1,1,1.022935,0.374669,0.053281,0.993688,1.210519,1.105509,1.205432,1.084341,0.953741,...,1.234668,1.168085,0.633147,0.636925,0.778177,1.285486,1.058875,1.489968,3.278239,0.265493
2,2,0.775209,0.345140,0.558891,0.909273,0.735444,0.739852,0.774006,0.809566,0.637553,...,1.428702,0.539263,0.483235,0.678805,0.624489,0.581113,0.464605,0.446583,0.000000,0.000000
3,3,1.138911,0.985986,0.788544,1.396359,1.204804,0.992594,1.121616,1.322698,1.189260,...,1.032950,1.396003,0.471420,1.055098,1.246648,1.175796,0.868595,0.550652,1.224354,1.503752
4,4,0.454793,1.426055,0.000000,0.455023,0.511590,0.405792,1.072124,0.749654,1.301180,...,0.474285,0.354229,0.347899,0.864947,0.748424,0.411726,0.000000,0.290513,0.000000,0.060355
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6894,7171,0.959791,0.212285,0.334050,0.821813,1.177728,0.858873,1.012336,0.765917,0.846581,...,0.480705,0.676469,0.521729,1.210908,0.762554,1.052949,0.168634,0.207517,0.000000,0.038046
6895,7172,1.094386,0.537169,0.582340,0.858115,1.313540,1.169087,1.150609,1.434341,0.885217,...,0.515929,1.037156,0.926841,1.039832,1.318835,1.672876,1.032393,1.251651,0.813821,0.206309
6896,7173,0.326512,0.523244,0.000000,0.500975,0.448853,0.855716,1.310657,0.252676,0.578412,...,0.000000,0.482575,0.327401,2.141854,1.057960,0.540004,0.559333,0.196024,0.309758,0.019076
6897,7174,0.725369,0.523844,1.340178,1.048326,0.895532,0.750023,0.881982,0.969229,0.785818,...,0.976515,0.883538,0.317697,0.958845,0.859775,1.007326,0.247095,0.609666,2.318234,0.438914


### Step 3: Merge everything to create our user vectors

In [5]:
# Join the two per-user tables on user_id; the feat table is the left side,
# so its users define the rows of the final user vectors.
user_df = pd.merge(user_avg_feat_df, user_avg_category_df, how="left", on="user_id")
user_df

Unnamed: 0,user_id,avg_feat_0,avg_feat_1,avg_feat_2,avg_feat_3,avg_feat_4,avg_feat_5,avg_feat_6,avg_feat_7,avg_feat_8,...,avg_category_30,avg_category_31,avg_category_32,avg_category_33,avg_category_34,avg_category_35,avg_category_36,avg_category_37,avg_category_38,avg_category_39
0,0,1.302471,1.088432,0.914949,0.902536,0.768437,1.087083,0.953513,1.222933,1.256774,...,1.410021,1.202423,0.889483,1.284525,1.180908,1.023252,0.876753,0.836855,0.586343,1.098977
1,1,0.365590,0.938559,0.321120,0.183951,1.644048,1.002701,0.970365,1.167777,1.055518,...,1.234668,1.168085,0.633147,0.636925,0.778177,1.285486,1.058875,1.489968,3.278239,0.265493
2,2,0.376160,0.756911,0.481671,0.558891,0.910193,0.727867,0.547837,0.721542,0.704722,...,1.428702,0.539263,0.483235,0.678805,0.624489,0.581113,0.464605,0.446583,0.000000,0.000000
3,3,1.693144,1.157402,0.953552,0.796545,1.253424,1.325727,0.972882,1.212054,1.254596,...,1.032950,1.396003,0.471420,1.055098,1.246648,1.175796,0.868595,0.550652,1.224354,1.503752
4,4,0.000000,0.476605,1.010854,0.000000,0.455023,0.378779,0.492796,0.924956,0.746931,...,0.474285,0.354229,0.347899,0.864947,0.748424,0.411726,0.000000,0.290513,0.000000,0.060355
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6894,7171,0.603449,0.938208,0.788680,0.526626,0.769178,1.139644,0.737299,0.968608,0.787917,...,0.480705,0.676469,0.521729,1.210908,0.762554,1.052949,0.168634,0.207517,0.000000,0.038046
6895,7172,1.166396,1.027942,0.966537,0.548050,0.748462,1.159014,1.037559,1.162223,1.401072,...,0.515929,1.037156,0.926841,1.039832,1.318835,1.672876,1.032393,1.251651,0.813821,0.206309
6896,7173,0.293560,0.304803,1.743508,0.000000,0.389291,0.484452,0.669303,0.792887,0.609157,...,0.000000,0.482575,0.327401,2.141854,1.057960,0.540004,0.559333,0.196024,0.309758,0.019076
6897,7174,1.071403,0.710077,0.555549,1.249871,0.932450,0.869710,0.618042,0.873835,0.893528,...,0.976515,0.883538,0.317697,0.958845,0.859775,1.007326,0.247095,0.609666,2.318234,0.438914


We have now successfully created a vector representation for each of our 6899 unique users, each containing 70 engineered features.

## Engineering the video vectors

The video vectors are a bit more complicated to engineer because we want:
- A vector encoding for each video feat (1 if the video has it, 0 otherwise)
- A vector encoding for each first level caption category id (1 if the video's caption has this id, 0 otherwise)
- The video duration
- A trending score

The trending score is the hard part, because we will compute it over a rolling window of the 7 days prior to the interaction time. This means each video will have a different vector representation depending on the time of the interaction.

### Step 1: Video feat vector encoding

In [6]:
def get_video_feat_df(df: pd.DataFrame) -> pd.DataFrame:
    """Count, for every video, how often each feat id appears in its list.

    Args:
        df: Frame with a ``video_id`` column and a list-valued ``feat`` column.

    Returns:
        A frame indexed by ``video_id`` with one ``feat_<id>`` count column
        per feat id found in ``df``.
    """
    # explode() returns a new frame, so the caller's data is left untouched.
    one_feat_per_row = df.explode(column="feat")
    counts = pd.crosstab(one_feat_per_row["video_id"], one_feat_per_row["feat"])

    renamed = []
    for feat_id in counts.columns:
        renamed.append(f'feat_{int(feat_id)}')
    counts.columns = renamed
    return counts

# Encode the feat lists of every video in item_categories_cleaned; the bare
# name on the last line displays the resulting frame in the notebook.
video_feat_df = get_video_feat_df(item_categories_cleaned)
video_feat_df

Unnamed: 0_level_0,feat_0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,...,feat_21,feat_22,feat_23,feat_24,feat_25,feat_26,feat_27,feat_28,feat_29,feat_30
video_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10722,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10723,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10724,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10726,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Step 2: Video caption first level category vector encoding 

In [7]:
def get_video_category_df(df: pd.DataFrame) -> pd.DataFrame:
    """Count, for every video, the occurrences of each first-level category.

    Args:
        df: Frame with ``video_id`` and ``first_level_category_id`` columns.

    Returns:
        A frame indexed by ``video_id`` with one ``category_<id>`` count
        column per category id found in ``df``.
    """
    # crosstab only reads its inputs, so no defensive copy is required.
    counts = pd.crosstab(index=df["video_id"], columns=df["first_level_category_id"])

    renamed = []
    for category_id in counts.columns:
        renamed.append(f'category_{int(category_id)}')
    counts.columns = renamed
    return counts

# Encode the first-level caption category of every video in
# caption_category_cleaned; the bare name displays the resulting frame.
video_category_df = get_video_category_df(caption_category_cleaned)
video_category_df

Unnamed: 0_level_0,category_1,category_2,category_3,category_4,category_5,category_6,category_7,category_8,category_9,category_10,...,category_30,category_31,category_32,category_33,category_34,category_35,category_36,category_37,category_38,category_39
video_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10722,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10723,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
10724,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10726,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


### Step 3: Video duration and 7 days rolling trend score

In order to compute our 7 days rolling trend score, we will:
- Compute a weighted engagement score for each video based on interactions nature and count (for example, we weighted the score in order to give more value to a follower count increase than to a like).
- Apply a log10 to reduce skew.
- Fill in missing dates so we have data for each day for each video. Originally, we do not have an entry every day for each video, so we add some while assuming the video had no interactions for the missing days.
- Mask the scores before the upload date, so we do not dilute the engagement score just after a video is uploaded.
- Compute the 7-day rolling trend score as the mean of the scores over the previous 7 calendar days (not including the current day).

For the video duration feature, we just have to fill the value in the days we added.

In [8]:
def compute_engagement_score(df: pd.DataFrame) -> pd.DataFrame:
    """Collapse the daily interaction counters into one log-scaled score.

    The score is the weighted sum of the counters listed in ``weights``,
    passed through ``log10``. A weighted sum of 0 (``log10`` gives ``-inf``)
    is reported as NaN.

    Args:
        df: Daily video statistics containing every counter column named in
            ``weights``. It is not modified.

    Returns:
        A new frame without the counter columns and with a trailing
        ``score`` column.
    """
    weights = {"valid_play_cnt": 0.1, "like_cnt": 0.2, "comment_cnt": 0.3, "share_cnt": 0.5, "follow_cnt": 1.0, "collect_cnt": 0.5, "download_cnt": 0.5}
    weighted_sum = sum(df[col] * weight for col, weight in weights.items())

    # log10(0) is -inf by design here; silence the numpy warning locally
    # instead of relying on a global warnings filter.
    with np.errstate(divide="ignore", invalid="ignore"):
        log_score = np.log10(weighted_sum)

    # Build a new frame rather than editing the caller's one in place, so the
    # input keeps its counter columns and the function can be called again.
    # The replacement is assigned back explicitly: an in-place replace on
    # df["score"] acts on a temporary under pandas Copy-on-Write.
    result = df.drop(columns=list(weights))
    result["score"] = log_score.replace(-np.inf, np.nan)
    return result

def fill_missing_dates(df: pd.DataFrame, start="2020-06-23", end="2020-09-10") -> pd.DataFrame:
    """Expand ``df`` so every video has exactly one row per calendar day.

    Args:
        df: Frame with ``video_id`` and ``date`` columns.
        start: First day of the range (inclusive).
        end: Last day of the range (inclusive).

    Returns:
        A frame with one row for every (video, day) pair; rows that did not
        exist in ``df`` carry NaN in all other columns.
    """
    keys = ["video_id", "date"]
    every_day = pd.date_range(start, end)
    every_video = df["video_id"].unique()

    # Cartesian product of videos and days is the target index.
    grid = pd.MultiIndex.from_product([every_video, every_day], names=keys)

    indexed = df.set_index(keys)
    expanded = indexed.reindex(grid)
    return expanded.reset_index()

def mask_score_before_upload(df: pd.DataFrame) -> pd.DataFrame:
    """Add ``score_masked``: the score with all pre-upload days blanked out.

    Args:
        df: Frame with ``video_id``, ``date``, ``upload_dt`` and ``score``
            columns. It is modified in place.

    Returns:
        The same frame, with ``upload_dt`` propagated to every row of each
        video, a new ``score_masked`` column (NaN where ``date`` is before
        ``upload_dt``), and NaN in ``score`` replaced by 0.
    """
    # Propagate the upload date of each video to all of its rows.
    df["upload_dt"] = df.groupby("video_id")["upload_dt"].transform(lambda x: x.ffill().bfill())

    # Vectorised mask instead of a Python-level row loop (df.apply(axis=1)).
    # Rows whose upload date is still missing compare as False and become NaN.
    # NOTE(review): score_masked is taken before the fillna(0) below, so rows
    # with a NaN score stay NaN here and are skipped by a later mean rather
    # than counted as 0 - confirm this is the intended behaviour.
    df["score_masked"] = df["score"].where(df["date"] >= df["upload_dt"])

    df["score"] = df["score"].fillna(0)
    return df

def compute_rolling_trend(group):
    """Compute the 7-day trailing trend score for the rows of one video.

    For every day strictly after the upload date, the trend is the mean of
    ``score_masked`` over the 7 previous calendar days, the current day
    excluded. NaN scores are skipped by the mean. Days up to and including
    the upload date, and days with no usable score in the window, get NaN.

    Args:
        group: Rows of a single video with ``video_id``, ``date``,
            ``upload_dt`` and ``score_masked`` columns.

    Returns:
        A frame with ``video_id``, ``date`` and ``trend_score`` columns,
        sorted by ``date``.
    """
    group = group.sort_values("date").set_index("date")
    upload_dt = group["upload_dt"].iloc[0]

    # closed="left" makes the window [t - 7 days, t): the 7 previous calendar
    # days without the current one. Rolling over the full series (and only
    # masking afterwards) keeps the upload-day score available to the first
    # windows; filtering first and then shifting by one row dropped it.
    prior_mean = group["score_masked"].rolling("7D", min_periods=1, closed="left").mean()

    # The trend is only defined once at least one full day has passed since upload.
    group["trend_score"] = prior_mean.where(group.index > upload_dt)

    return group.reset_index()[["video_id", "date", "trend_score"]]

def compute_trend(df: pd.DataFrame) -> pd.DataFrame:
    """Compute the rolling trend score of every video.

    Args:
        df: Frame with ``video_id``, ``date``, ``upload_dt`` and
            ``score_masked`` columns.

    Returns:
        The per-video results of ``compute_rolling_trend`` stacked into one
        frame with a fresh integer index; remaining NaN values are set to 0.
    """
    # Select the columns explicitly, grouping key included: relying on
    # groupby.apply to hand the grouping column to the function is deprecated
    # in recent pandas, and compute_rolling_trend needs video_id in its output.
    needed = ["video_id", "date", "upload_dt", "score_masked"]
    per_video = df.groupby("video_id", group_keys=False)[needed].apply(compute_rolling_trend)

    trend_scores = per_video.reset_index(drop=True).fillna(0)
    return trend_scores

def finalize_df(df, trend_scores):
    """Attach the trend scores and tidy up the daily video frame.

    Args:
        df: Daily video frame with ``video_id``, ``date``,
            ``video_duration``, ``upload_dt``, ``score`` and
            ``score_masked`` columns.
        trend_scores: Frame with ``video_id``, ``date`` and ``trend_score``.

    Returns:
        ``df`` left-joined with ``trend_scores``, with ``video_duration``
        filled within each video, the intermediate columns removed and the
        rows sorted by ``video_id`` then ``date``.
    """
    keys = ["video_id", "date"]
    merged = df.merge(trend_scores, how="left", on=keys)

    # Fill gaps in the duration from the neighbouring rows of the same video.
    durations = merged.groupby("video_id")["video_duration"]
    merged["video_duration"] = durations.transform(lambda s: s.ffill().bfill())

    merged = merged.drop(columns=["upload_dt", "score", "score_masked"])
    return merged.sort_values(keys)

def process_video_dataframe() -> pd.DataFrame:
    """Run the full daily video pipeline on ``item_daily_features_cleaned``.

    Steps: engagement score, one row per (video, day), masking of the
    pre-upload scores, rolling trend score, final clean-up.

    Returns:
        The frame produced by ``finalize_df``.
    """
    # Work on a private copy and pass THAT copy on: handing the notebook-level
    # frame to compute_engagement_score let it be edited in place, which made
    # this cell fail when run a second time.
    df = item_daily_features_cleaned.copy()
    df = compute_engagement_score(df)
    df = fill_missing_dates(df)
    df = mask_score_before_upload(df)
    trend_scores = compute_trend(df)
    df = finalize_df(df, trend_scores)
    return df

# Run the daily video pipeline; the bare name on the last line displays the
# resulting frame in the notebook.
video_df = process_video_dataframe()
video_df

Unnamed: 0,video_id,date,video_duration,trend_score
0,0,2020-06-23,5966.0,0.0
1,0,2020-06-24,5966.0,0.0
2,0,2020-06-25,5966.0,0.0
3,0,2020-06-26,5966.0,0.0
4,0,2020-06-27,5966.0,0.0
...,...,...,...,...
700315,10727,2020-09-06,5666.0,0.0
700316,10727,2020-09-07,5666.0,0.0
700317,10727,2020-09-08,5666.0,0.0
700318,10727,2020-09-09,5666.0,0.0


### Step 4: Merge everything to create our video vectors

In [9]:
# Attach the feat and caption-category encodings to every (video, day) row.
video_df = video_df.merge(video_feat_df, on="video_id", how="left")
video_df = video_df.merge(video_category_df, on="video_id", how="left")

# A left merge leaves NaN for videos that are absent from an encoding table.
# The encoding is meant to be 0 when a video lacks a feat / category, so fill
# those gaps and restore the integer dtype the NaN values would have widened.
encoded_cols = list(video_feat_df.columns) + list(video_category_df.columns)
video_df[encoded_cols] = video_df[encoded_cols].fillna(0).astype("int64")
video_df

Unnamed: 0,video_id,date,video_duration,trend_score,feat_0,feat_1,feat_2,feat_3,feat_4,feat_5,...,category_30,category_31,category_32,category_33,category_34,category_35,category_36,category_37,category_38,category_39
0,0,2020-06-23,5966.0,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,2020-06-24,5966.0,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,2020-06-25,5966.0,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,2020-06-26,5966.0,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,2020-06-27,5966.0,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
700315,10727,2020-09-06,5666.0,0.0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
700316,10727,2020-09-07,5666.0,0.0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
700317,10727,2020-09-08,5666.0,0.0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
700318,10727,2020-09-09,5666.0,0.0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


We have now successfully created a vector representation for each of our videos, for each possible day in our dataset.

## Saving the dataframes

In [10]:
# Directory receiving the engineered user and video vectors.
export_dir = "./exports/feature_engineered_data/"

# exist_ok=True creates the directory when missing and does nothing when it
# already exists, without a separate (race-prone) existence check.
os.makedirs(export_dir, exist_ok=True)

user_df.to_parquet(export_dir + "user_df.pq")
video_df.to_parquet(export_dir + "video_df.pq")