Notes for myself:
- Maybe add the `upload_dt` column when selecting columns from `item_daily_features`, to be able to distinguish videos that were posted a while ago from recent ones.

In [1]:
# Dataset download (disabled): the %%bash cell below is wrapped in a string literal, so
# running this cell downloads nothing and only echoes the string back as the cell output.
# To fetch the data, remove the surrounding triple quotes so the cell runs as a bash cell.
# NOTE(review): the single-quoted URL contains a line break before its closing quote —
# confirm wget handles it (or join the two lines) before re-enabling the cell.
"""%%bash
wget --no-check-certificate 'https://drive.usercontent.google.com/download?id=1qe5hOSBxzIuxBb1G_Ih5X-O65QElollE&export=download&confirm=t&uuid=b2002093-cc6e-4bd5-be47-9603f0b33470
' -O KuaiRec.zip
unzip KuaiRec.zip -d data_final_project"""

"%%bash\nwget --no-check-certificate 'https://drive.usercontent.google.com/download?id=1qe5hOSBxzIuxBb1G_Ih5X-O65QElollE&export=download&confirm=t&uuid=b2002093-cc6e-4bd5-be47-9603f0b33470\n' -O KuaiRec.zip\nunzip KuaiRec.zip -d data_final_project"

# Imports

In [2]:
# Single seed shared by every RNG seeded at the end of this cell.
SEED = 69
import numpy as np
import warnings
# Silence every warning for the rest of the notebook.
warnings.filterwarnings("ignore")
import os
# These environment variables are deliberately set BEFORE `import tensorflow` further down
# in this cell. They are presumably meant to make TensorFlow runs reproducible
# (oneDNN optimisations off, deterministic ops/cuDNN on) — confirm against the TF version in use.
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
os.environ['PYTHONHASHSEED'] = str(SEED)
os.environ['TF_DETERMINISTIC_OPS'] = '1'
os.environ['TF_CUDNN_DETERMINISTIC'] = '1'
import random
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from typing import Tuple, Dict
import plotly.express as px
import plotly.figure_factory as ff
from scipy import stats
import ast
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Input, Lambda
import tensorflow.keras.backend as K
#######################################
# Seed each RNG used by the notebook exactly once: NumPy, Python's `random`, TensorFlow.
# (The original seeded NumPy twice; the second call was redundant.)
np.random.seed(SEED)
random.seed(SEED)
tf.random.set_seed(SEED)

2025-05-13 01:36:40.208010: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747093000.218278  233213 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747093000.221533  233213 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1747093000.230204  233213 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1747093000.230211  233213 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1747093000.230212  233213 computation_placer.cc:177] computation placer alr

# Loading the data

In [3]:
root = "data_final_project/KuaiRec 2.0/data/"


def _read_clean_csv(filename, **read_csv_kwargs):
    """Read ``root + filename``, then drop duplicated rows and rows with any missing value."""
    return pd.read_csv(root + filename, **read_csv_kwargs).drop_duplicates().dropna()


small_matrix = _read_clean_csv("small_matrix.csv")
social_network = _read_clean_csv("social_network.csv")
user_features = _read_clean_csv("user_features.csv")
item_daily_features = _read_clean_csv("item_daily_features.csv")
item_categories = _read_clean_csv("item_categories.csv")
caption_category = _read_clean_csv("kuairec_caption_category.csv", lineterminator='\n')
# The data contains a negative category id (-124) and never uses id 30,
# so negative ids are remapped to 30 for practical reasons.
first_level_ids = caption_category['first_level_category_id']
caption_category['first_level_category_id'] = first_level_ids.where(first_level_ids >= 0, 30)

# Data Cleaning

##### Cleaning steps:
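- small/big_matrix: watch_ratio (we remove unrealistic values, i.e. ratios of 5 or more); we also drop interactions involving unknown users or unknown videos
- social_network: convert the friend list from a string to an actual list, and add a column with the friend count
- item_categories: we explode the feats and encode them as one binary column per feat
- item_daily_features: we keep only NORMAL videos because we do not want to recommend ads, and only videos with a public visible status because we cannot recommend a private video
- user_features: good as-is (we only select the columns we need)
- caption_category: good as-is (we only keep the video id and its first-level category)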

In [4]:
# A user is "known" when it has a row in user_features.
known_user_ids = set(user_features["user_id"].unique())
# A video is "known" only when all three item tables contain it.
video_tables = (caption_category, item_categories, item_daily_features)
known_video_ids = set.intersection(*(set(table["video_id"].unique()) for table in video_tables))
print(f"We have data for {len(known_user_ids)} users and {len(known_video_ids)} videos")

We have data for 6899 users and 8846 videos


#### small_matrix:

In [None]:
# Interaction columns that are not needed downstream.
unused_interaction_cols = ["play_duration", "video_duration", "date", "time"]
small_matrix_cleaned = (
    small_matrix
    .drop(columns=unused_interaction_cols)
    # Remove null values and duplicates.
    .dropna()
    .drop_duplicates()
    # Filter out unrealistic watch_ratio values (keep strictly below 5).
    .loc[lambda frame: frame["watch_ratio"] < 5]
    # Remove interactions with unknown users or unknown videos.
    .loc[lambda frame: frame["user_id"].isin(known_user_ids)]
    .loc[lambda frame: frame["video_id"].isin(known_video_ids)]
    # Convert the epoch-second timestamps to datetimes; unparsable values become NaT.
    .assign(timestamp=lambda frame: pd.to_datetime(frame["timestamp"], unit="s", errors="coerce"))
)
# No null may remain, including any NaT produced by the coercion above.
assert not small_matrix_cleaned.isnull().to_numpy().any()
small_matrix_cleaned

Unnamed: 0,user_id,video_id,timestamp,watch_ratio
0,14,148,2020-07-04 21:27:48.378000021,0.722103
1,14,183,2020-07-04 21:28:00.056999922,1.907377
2,14,3649,2020-07-04 21:29:09.479000092,2.063311
3,14,5262,2020-07-04 21:30:43.285000086,0.566388
4,14,8234,2020-07-04 21:35:43.459000111,0.418364
...,...,...,...,...
4676370,7162,9177,2020-09-01 12:06:35.983999968,0.142857
4676371,7162,4987,2020-09-02 06:44:51.342000008,1.234848
4676372,7162,7988,2020-09-03 00:45:01.473999977,1.024412
4676373,7162,6533,2020-09-04 14:56:32.020999908,0.273750


In [6]:
interaction_times = small_matrix_cleaned["timestamp"]
earliest_date, latest_date = interaction_times.min(), interaction_times.max()
print(f"Our date ranges from {earliest_date} to {latest_date}")
train_test_dates = ["2020-07-03", "2020-08-20"]
print(f"We will split our training and testing sets on the {train_test_dates[1]} to keep data temporality")
# Number of interactions before the split date (train), then on/after it (test).
print(int((interaction_times < train_test_dates[1]).sum()))
print(int((interaction_times >= train_test_dates[1]).sum()))

Our date ranges from 2020-07-03 18:23:26.059999943 to 2020-09-05 15:57:23.683000088
We will split our training and testing sets on the 2020-08-20 to keep data temporality
3378814
449464


#### social_network:

In [7]:
# Keep only known users. `.copy()` gives an independent frame, so the column
# assignments below do not write into a slice of `social_network`.
social_network_cleaned = social_network[social_network["user_id"].isin(known_user_ids)].copy()
# friend_list is stored as the string representation of a list; parse it into a real list.
social_network_cleaned["friend_list"] = social_network_cleaned["friend_list"].apply(ast.literal_eval)
# Add the friend_count column.
social_network_cleaned["friend_count"] = social_network_cleaned["friend_list"].apply(len)
# Add users with no friends for consistency.
# A set difference replaces the original per-user `.unique()` scan (quadratic) and no longer
# shadows the builtin `id`; `sorted` keeps the order of the appended rows deterministic.
users_with_friends = set(social_network_cleaned["user_id"].unique())
missing_ids = sorted(known_user_ids - users_with_friends)
missing_users_social = pd.DataFrame({
    "user_id": missing_ids,
    "friend_list": [[] for _ in missing_ids],
    "friend_count": [0] * len(missing_ids),
})
social_network_cleaned = pd.concat([social_network_cleaned, missing_users_social], ignore_index=True)
# Every known user must appear exactly once.
assert len(social_network_cleaned) == len(known_user_ids)
social_network_cleaned

Unnamed: 0,user_id,friend_list,friend_count
0,3371,[2975],1
1,24,[2665],1
2,4402,[38],1
3,4295,[4694],1
4,7087,[7117],1
...,...,...,...
6894,7170,[],0
6895,7171,[],0
6896,7172,[],0
6897,7173,[],0


#### item_categories:

In [8]:
# Restrict to known videos and parse the stringified feat lists into real lists.
known_item_categories = item_categories[item_categories["video_id"].isin(known_video_ids)]
known_item_categories = known_item_categories.assign(
    feat=known_item_categories["feat"].apply(ast.literal_eval)
)
# One row per (video_id, feat) pair.
video_feat_pairs = known_item_categories.explode("feat")
sorted_feats = sorted(video_feat_pairs["feat"].unique())
# Create the vector encoding: one column per feat, 1 when the video has it and 0 otherwise.
feat_matrix = video_feat_pairs.assign(present=1).pivot(index="video_id", columns="feat", values="present")
feat_matrix = feat_matrix.fillna(0).reindex(columns=sorted_feats).astype(int)
feat_matrix.columns = [f"feat_{feat}" for feat in feat_matrix.columns]
item_categories_cleaned = feat_matrix.reset_index()
# Every known video must end up with a row.
assert len(item_categories_cleaned["video_id"]) == len(known_video_ids)
item_categories_cleaned

Unnamed: 0,video_id,feat_0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,...,feat_21,feat_22,feat_23,feat_24,feat_25,feat_26,feat_27,feat_28,feat_29,feat_30
0,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,4,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,5,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,6,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8841,10722,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8842,10723,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8843,10724,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8844,10726,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### item_daily_features:

In [9]:
# Keep only rows that are both publicly visible and of type NORMAL.
is_public = item_daily_features["visible_status"] == "public"
is_normal_video = item_daily_features["video_type"] == "NORMAL"
item_daily_features_cleaned = item_daily_features[is_public & is_normal_video].assign(
    # Dates are parsed from their YYYYMMDD form; unparsable values become NaT.
    date=lambda frame: pd.to_datetime(frame["date"].astype(str), format="%Y%m%d", errors="coerce")
)

def golden_item_daily_features(df: pd.DataFrame) -> pd.DataFrame:
    """Collapse daily per-video statistics into a single row per ``video_id``.

    Parameters
    ----------
    df : pd.DataFrame
        Daily rows holding ``video_id`` plus every column listed in the
        aggregation plan below. Any other column is ignored.

    Returns
    -------
    pd.DataFrame
        One row per ``video_id`` (as a regular column), with:
        the first row's ``video_tag_id`` and ``video_duration`` (cast to int),
        the mean ``play_progress``, the first ``date``, and the sum of every
        counter column (``collect_cnt`` is additionally cast to int).
    """
    def first_row_value(values):
        return values.iloc[0]

    def first_row_value_as_int(values):
        return int(values.iloc[0])

    def total_as_int(values):
        return int(values.sum())

    # (column, aggregator) pairs; their order is also the output column order.
    aggregation_plan = [
        ("video_tag_id", first_row_value),
        ("video_duration", first_row_value_as_int),
        ("show_cnt", "sum"),
        ("valid_play_cnt", "sum"),
        ("play_progress", "mean"),
        ("like_cnt", "sum"),
        ("comment_cnt", "sum"),
        ("share_cnt", "sum"),
        ("follow_cnt", "sum"),
        ("collect_cnt", total_as_int),
        ("download_cnt", "sum"),
        ("cancel_like_cnt", "sum"),
        ("delete_comment_cnt", "sum"),
        ("cancel_follow_cnt", "sum"),
        ("cancel_collect_cnt", "sum"),
        ("date", "first"),
    ]
    aggregations = dict(aggregation_plan)
    relevant = df[["video_id", *aggregations]]
    return relevant.groupby("video_id").agg(aggregations).reset_index()

# Aggregate the daily features separately on each side of the train/test split date.
daily_feature_dates = item_daily_features_cleaned["date"]
daily_split_date = train_test_dates[1]
train_item_daily_features = golden_item_daily_features(
    item_daily_features_cleaned[daily_feature_dates < daily_split_date]
)
test_item_daily_features = golden_item_daily_features(
    item_daily_features_cleaned[daily_feature_dates >= daily_split_date]
)
train_item_daily_features

Unnamed: 0,video_id,video_tag_id,video_duration,show_cnt,valid_play_cnt,play_progress,like_cnt,comment_cnt,share_cnt,follow_cnt,collect_cnt,download_cnt,cancel_like_cnt,delete_comment_cnt,cancel_follow_cnt,cancel_collect_cnt,date
0,0,841,5966,456852,109880,0.343875,11611,253,55,5315,101,100,2017,8,0,58.0,2020-07-28
1,2,2566,8000,2116,225,0.130692,4,0,0,1,0,0,74,0,0,0.0,2020-07-28
2,4,2413,18000,51,4,0.121556,0,0,0,0,0,0,3,0,0,0.0,2020-07-28
3,5,121,8000,14199,4063,0.335539,772,35,25,19,81,0,295,0,0,33.0,2020-07-28
4,6,381,6000,2181,89,0.087325,12,1,0,0,0,1,16,0,0,1.0,2020-07-28
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6804,10402,1332,20016,39745,25341,0.585756,361,16,0,35,0,13,17,0,0,0.0,2020-08-19
6805,10403,2376,49015,930458,682239,0.555736,11178,98,91,1175,11,142,555,4,0,0.0,2020-08-19
6806,10404,2771,10434,147583,69839,0.567969,2890,51,62,267,1,32,61,0,0,0.0,2020-08-19
6807,10405,2553,14133,1745023,1149654,0.650378,91312,4525,999,984,311,549,3910,334,0,3.0,2020-08-19


#### user_features:

In [10]:
# User columns kept downstream: the profile columns, then onehot_feat0 ... onehot_feat17.
useful_cols = [
    "user_id",
    "user_active_degree",
    "is_lowactive_period",
    "is_live_streamer",
    "is_video_author",
    "follow_user_num",
    "fans_user_num",
    "friend_user_num",
    "register_days",
    *(f"onehot_feat{i}" for i in range(18)),
]
user_features_cleaned = user_features[useful_cols].copy()

#### caption_category

In [11]:
# Only the video id and its first-level category are kept.
useful_cols = ["video_id", "first_level_category_id"]
caption_category_cleaned = caption_category.loc[:, useful_cols].copy()
caption_category_cleaned

Unnamed: 0,video_id,first_level_category_id
0,0,8
2,2,9
3,3,26
4,4,5
5,5,6
...,...,...
10722,10722,5
10723,10723,33
10724,10724,6
10726,10726,38


# Aggregating Train and Test Data

In [12]:
def _attach_features(interactions, item_daily_features_split):
    """Left-join every user and video feature table onto ``interactions``.

    Parameters
    ----------
    interactions : pd.DataFrame
        Rows of ``small_matrix_cleaned`` for one side of the train/test split.
    item_daily_features_split : pd.DataFrame
        Aggregated daily item features for the same side of the split.

    Returns
    -------
    pd.DataFrame
        ``interactions`` with the feature columns appended. Rows without a match
        in a feature table keep NaN in that table's columns (left join).

    Raises
    ------
    pandas.errors.MergeError
        If a feature table contains the same join key more than once.
    """
    # Same tables, keys and order as the original hand-written merge chain.
    feature_tables = [
        (social_network_cleaned, "user_id"),
        (item_categories_cleaned, "video_id"),
        (item_daily_features_split, "video_id"),
        (user_features_cleaned, "user_id"),
        (caption_category_cleaned, "video_id"),
    ]
    merged = interactions
    for table, key in feature_tables:
        # many_to_one: fail loudly instead of silently duplicating interactions
        # if a feature table ever holds a key twice.
        merged = merged.merge(table, on=key, how="left", validate="many_to_one")
    return merged


# Temporal split: interactions before the split date train, the rest test.
interaction_split_date = train_test_dates[1]
is_train_interaction = small_matrix_cleaned["timestamp"] < interaction_split_date
is_test_interaction = small_matrix_cleaned["timestamp"] >= interaction_split_date

train_data = _attach_features(small_matrix_cleaned[is_train_interaction], train_item_daily_features)
test_data = _attach_features(small_matrix_cleaned[is_test_interaction], test_item_daily_features)

In [13]:
export_dir = "./exports"
# exist_ok=True makes the directory creation idempotent and removes the
# check-then-create race of the previous `os.path.exists` guard.
os.makedirs(export_dir, exist_ok=True)
# Persist both splits as parquet files.
train_data.to_parquet(export_dir + "/train_data.pq")
test_data.to_parquet(export_dir + "/test_data.pq")

In [14]:
# Display the column names of the merged training frame.
train_data.columns

Index(['user_id', 'video_id', 'timestamp', 'watch_ratio', 'friend_list',
       'friend_count', 'feat_0', 'feat_1', 'feat_2', 'feat_3', 'feat_4',
       'feat_5', 'feat_6', 'feat_7', 'feat_8', 'feat_9', 'feat_10', 'feat_11',
       'feat_12', 'feat_13', 'feat_14', 'feat_15', 'feat_16', 'feat_17',
       'feat_18', 'feat_19', 'feat_20', 'feat_21', 'feat_22', 'feat_23',
       'feat_24', 'feat_25', 'feat_26', 'feat_27', 'feat_28', 'feat_29',
       'feat_30', 'video_tag_id', 'video_duration', 'show_cnt',
       'valid_play_cnt', 'play_progress', 'like_cnt', 'comment_cnt',
       'share_cnt', 'follow_cnt', 'collect_cnt', 'download_cnt',
       'cancel_like_cnt', 'delete_comment_cnt', 'cancel_follow_cnt',
       'cancel_collect_cnt', 'date', 'user_active_degree',
       'is_lowactive_period', 'is_live_streamer', 'is_video_author',
       'follow_user_num', 'fans_user_num', 'friend_user_num', 'register_days',
       'onehot_feat0', 'onehot_feat1', 'onehot_feat2', 'onehot_feat3',
       'one