In [1]:
"""%%bash
wget --no-check-certificate 'https://drive.usercontent.google.com/download?id=1qe5hOSBxzIuxBb1G_Ih5X-O65QElollE&export=download&confirm=t&uuid=b2002093-cc6e-4bd5-be47-9603f0b33470
' -O KuaiRec.zip
unzip KuaiRec.zip -d data_final_project"""

"%%bash\nwget --no-check-certificate 'https://drive.usercontent.google.com/download?id=1qe5hOSBxzIuxBb1G_Ih5X-O65QElollE&export=download&confirm=t&uuid=b2002093-cc6e-4bd5-be47-9603f0b33470\n' -O KuaiRec.zip\nunzip KuaiRec.zip -d data_final_project"

# Imports

In [2]:
import os
import pandas as pd
import ast

# Loading the data

In [3]:
root = "data_final_project/KuaiRec 2.0/data/"


def _read_clean_csv(filename, **read_kwargs):
    """Read one CSV from `root`, then drop duplicate rows and rows containing any missing value."""
    return pd.read_csv(root + filename, **read_kwargs).drop_duplicates().dropna()


big_matrix = _read_clean_csv("big_matrix.csv")
small_matrix = _read_clean_csv("small_matrix.csv")
social_network = _read_clean_csv("social_network.csv")
user_features = _read_clean_csv("user_features.csv")
item_daily_features = _read_clean_csv("item_daily_features.csv")
item_categories = _read_clean_csv("item_categories.csv")
caption_category = _read_clean_csv("kuairec_caption_category.csv", lineterminator='\n')
# One category id is negative (-124) and no category uses id 30, so the
# negative id is remapped to 30 for practical reasons.
_category_ids = caption_category['first_level_category_id']
caption_category['first_level_category_id'] = _category_ids.where(_category_ids >= 0, 30)

# Data Cleaning

##### Cleaning steps:
- small/big_matrix: drop the columns we do not use, remove interactions with an unrealistic watch_ratio (5 or more) or with unknown users/videos, and convert timestamps to datetimes
- social_network: convert the friend list from a string to an actual list, and add a column for the friend count
- item_categories: explode the feats and one-hot encode them
- item_daily_features: keep only NORMAL videos (we do not want to recommend ads) that are in public visible status (we cannot recommend a private video), then apply cumulative sums so that each row holds the stats up to that day
- user_features: good as-is
- caption_category: good as-is

In [4]:
# Users for which we have a feature row.
known_user_ids = set(user_features["user_id"].unique())
# A video counts as known only when it appears in all three item tables.
known_video_ids = set.intersection(
    *(
        set(item_table["video_id"].unique())
        for item_table in (caption_category, item_categories, item_daily_features)
    )
)
print(f"We have data for {len(known_user_ids)} users and {len(known_video_ids)} videos")

We have data for 6899 users and 8846 videos


#### big & small_matrix:

In [5]:
def clean_big_small_matrix(
    df: pd.DataFrame,
    user_ids=None,
    video_ids=None,
    max_watch_ratio: float = 5,
) -> pd.DataFrame:
    """Clean an interaction matrix (big_matrix or small_matrix).

    Steps, in order:
      1. drop the "play_duration", "date" and "time" columns;
      2. drop rows with missing values, then duplicate rows;
      3. keep only rows whose watch_ratio is strictly below `max_watch_ratio`
         and whose user_id / video_id are in the allowed id collections;
      4. convert "timestamp" from seconds since the epoch to datetimes.

    Args:
        df: Raw interactions. Must contain the columns named above plus
            "user_id", "video_id", "watch_ratio" and "timestamp".
        user_ids: Collection of allowed user ids. When None, the notebook-level
            `known_user_ids` is used (the original behaviour).
        video_ids: Collection of allowed video ids. When None, the
            notebook-level `known_video_ids` is used (the original behaviour).
        max_watch_ratio: Exclusive upper bound on watch_ratio; rows at or above
            it are treated as unrealistic and removed.

    Returns:
        A new DataFrame; `df` itself is not modified.

    Raises:
        AssertionError: if any null value remains after cleaning (for example
            a timestamp that could not be converted).
    """
    if user_ids is None:
        user_ids = known_user_ids
    if video_ids is None:
        video_ids = known_video_ids

    # drop() already returns a new frame, so no explicit copy of df is needed.
    cleaned = df.drop(columns=["play_duration", "date", "time"]).dropna().drop_duplicates()

    # One combined mask instead of three successive filtered copies.
    keep = (
        (cleaned["watch_ratio"] < max_watch_ratio)
        & cleaned["user_id"].isin(user_ids)
        & cleaned["video_id"].isin(video_ids)
    )
    # .copy() so the column assignment below targets an independent frame.
    cleaned = cleaned.loc[keep].copy()

    # errors="coerce" turns unparsable timestamps into NaT; the assert below
    # then makes sure none slipped through.
    cleaned["timestamp"] = pd.to_datetime(cleaned["timestamp"], unit="s", errors="coerce")
    null_counts = cleaned.isnull().sum()
    assert not null_counts.any(), f"null values left after cleaning: {null_counts[null_counts > 0].to_dict()}"
    return cleaned

# Run the same cleaning on both interaction matrices (small first, then big).
small_matrix_cleaned, big_matrix_cleaned = (
    clean_big_small_matrix(raw_matrix) for raw_matrix in (small_matrix, big_matrix)
)

#### social_network:

In [6]:
# Keep only users we have features for. The explicit .copy() makes the column
# assignments below act on an independent frame rather than on a filtered slice.
social_network_cleaned = social_network[social_network["user_id"].isin(known_user_ids)].copy()
# friend_list is read from the CSV as a string; parse it into an actual list
social_network_cleaned["friend_list"] = social_network_cleaned["friend_list"].apply(ast.literal_eval)
# Add friend_count column
social_network_cleaned["friend_count"] = social_network_cleaned["friend_list"].apply(len)
# Add users with no friends for consistency.
# The ids already present are collected once into a set, so each membership
# test is O(1) instead of recomputing .unique() and scanning it for every user.
users_with_friends = set(social_network_cleaned["user_id"].unique())
missing_ids = [user_id for user_id in known_user_ids if user_id not in users_with_friends]
missing_users_social = pd.DataFrame({
    "user_id": missing_ids,
    "friend_list": [[] for _ in missing_ids],  # a distinct empty list per user
    "friend_count": [0] * len(missing_ids),
})
social_network_cleaned = pd.concat([social_network_cleaned, missing_users_social], ignore_index=True)
# Every known user must now have exactly one row.
assert len(social_network_cleaned) == len(known_user_ids)
social_network_cleaned

Unnamed: 0,user_id,friend_list,friend_count
0,3371,[2975],1
1,24,[2665],1
2,4402,[38],1
3,4295,[4694],1
4,7087,[7117],1
...,...,...,...
6894,7170,[],0
6895,7171,[],0
6896,7172,[],0
6897,7173,[],0


#### item_categories:

In [7]:
# Restrict to known videos and parse the stringified feat lists.
_known_rows = item_categories["video_id"].isin(known_video_ids)
_exploded_feats = item_categories.loc[_known_rows].copy()
_exploded_feats["feat"] = _exploded_feats["feat"].map(ast.literal_eval)
# One row per (video_id, feat) pair.
_exploded_feats = _exploded_feats.explode("feat")
_feat_ids = sorted(_exploded_feats["feat"].unique())
# Create the vector encoding: one 0/1 column per feat id, one row per video.
_one_hot = _exploded_feats.assign(present=1).pivot(index="video_id", columns="feat", values="present")
_one_hot = _one_hot.fillna(0).reindex(columns=_feat_ids).astype(int)
_one_hot.columns = [f"feat_{feat_id}" for feat_id in _one_hot.columns]
item_categories_cleaned = _one_hot.reset_index()
# Every known video must have an encoded row.
assert len(item_categories_cleaned["video_id"]) == len(known_video_ids)
item_categories_cleaned

Unnamed: 0,video_id,feat_0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,...,feat_21,feat_22,feat_23,feat_24,feat_25,feat_26,feat_27,feat_28,feat_29,feat_30
0,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,4,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,5,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,6,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8841,10722,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8842,10723,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8843,10724,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8844,10726,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### item_daily_features:

In [8]:
stats_cols = [
    "show_cnt", "valid_play_cnt", "like_cnt", "comment_cnt", "share_cnt", "follow_cnt",
    "collect_cnt", "download_cnt", "cancel_like_cnt", "delete_comment_cnt",
    "cancel_follow_cnt", "cancel_collect_cnt",
]
useful_cols = ["video_id", "date", *stats_cols]

# Keep only daily rows of publicly visible, NORMAL-type videos.
_is_public = item_daily_features["visible_status"] == "public"
_is_normal = item_daily_features["video_type"] == "NORMAL"
item_daily_features_cleaned = item_daily_features.loc[_is_public & _is_normal].copy()
# Parse the date column into datetimes (unparsable values become NaT).
item_daily_features_cleaned["date"] = pd.to_datetime(
    item_daily_features_cleaned["date"].astype(str), errors="coerce"
)
item_daily_features_cleaned = item_daily_features_cleaned[useful_cols].sort_values(["date", "video_id"])
# Rows are in date order, so the per-video cumulative sum turns each daily
# count into the running total up to and including that day.
item_daily_features_cleaned[stats_cols] = item_daily_features_cleaned.groupby("video_id")[stats_cols].cumsum()
item_daily_features_cleaned

Unnamed: 0,video_id,date,show_cnt,valid_play_cnt,like_cnt,comment_cnt,share_cnt,follow_cnt,collect_cnt,download_cnt,cancel_like_cnt,delete_comment_cnt,cancel_follow_cnt,cancel_collect_cnt
23,0,2020-07-28,13957,4280,462,10,0,247,2.0,3,85,0,0,0.0
149,2,2020-07-28,95,7,1,0,0,0,0.0,0,4,0,0,0.0
274,4,2020-07-28,9,0,0,0,0,0,0.0,0,0,0,0,0.0
327,5,2020-07-28,1016,190,37,2,2,4,5.0,0,12,0,0,2.0
390,6,2020-07-28,281,6,1,0,0,0,0.0,0,0,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
343336,10723,2020-09-05,277,114,24,0,0,0,0.0,0,0,0,0,0.0
343337,10724,2020-09-05,1100,754,264,29,1,8,0.0,1,14,0,0,0.0
343338,10725,2020-09-05,16996,9317,851,36,3,12,0.0,5,19,0,0,0.0
343339,10726,2020-09-05,7644,5382,44,0,1,2,0.0,2,1,0,0,0.0


#### user_features:

In [9]:
# Columns kept from user_features: the id, activity/profile fields and the 18 onehot_feat columns.
useful_cols = [
    "user_id",
    "user_active_degree",
    "is_lowactive_period",
    "is_live_streamer",
    "is_video_author",
    "follow_user_num",
    "fans_user_num",
    "friend_user_num",
    "register_days",
]
useful_cols.extend(f"onehot_feat{i}" for i in range(18))
user_features_cleaned = user_features.loc[:, useful_cols].copy()

#### caption_category

In [10]:
# Only the video id and its first-level category are kept.
useful_cols = ["video_id", "first_level_category_id"]
caption_category_cleaned = caption_category.loc[:, useful_cols].copy()
caption_category_cleaned

Unnamed: 0,video_id,first_level_category_id
0,0,8
2,2,9
3,3,26
4,4,5
5,5,6
...,...,...
10722,10722,5
10723,10723,33
10724,10724,6
10726,10726,38


# Aggregating Train and Test Data

In [11]:
def _attach_features(interactions: pd.DataFrame) -> pd.DataFrame:
    """Join every cleaned feature table onto a cleaned interaction matrix.

    The interactions are sorted by timestamp (then video_id), left-joined with
    the user-level and video-level tables, and finally as-of joined with the
    cumulative daily item stats: each interaction gets the latest stats row of
    its video whose date is at or before the interaction timestamp. Remaining
    missing values are filled with 0.
    """
    enriched = interactions.sort_values(["timestamp", "video_id"])
    for feature_table, join_key in (
        (social_network_cleaned, "user_id"),
        (item_categories_cleaned, "video_id"),
        (user_features_cleaned, "user_id"),
        (caption_category_cleaned, "video_id"),
    ):
        enriched = enriched.merge(feature_table, on=join_key, how="left")
    # NOTE(review): a stats row dated day D appears to hold cumulative counts
    # including day D itself, so an interaction during day D may see counts
    # from later that same day - confirm this is acceptable.
    enriched = pd.merge_asof(
        enriched,
        item_daily_features_cleaned,
        by="video_id",
        left_on="timestamp",
        right_on="date",
        direction="backward",
    )
    return enriched.fillna(0)


train_data = _attach_features(big_matrix_cleaned)
test_data = _attach_features(small_matrix_cleaned)

In [12]:
# "date" was only needed as the as-of join key. errors="ignore" keeps this cell
# re-runnable: on a second run the column is already gone and a plain drop
# would raise KeyError.
train_data = train_data.drop(columns=["date"], errors="ignore")
test_data = test_data.drop(columns=["date"], errors="ignore")
export_dir = "./exports"
# exist_ok=True replaces the separate exists() check.
os.makedirs(export_dir, exist_ok=True)
train_data.to_parquet(os.path.join(export_dir, "train_data.pq"))
test_data.to_parquet(os.path.join(export_dir, "test_data.pq"))

In [13]:
# Show the column index of train_data to check which features ended up in the merged frame.
train_data.columns

Index(['user_id', 'video_id', 'video_duration', 'timestamp', 'watch_ratio',
       'friend_list', 'friend_count', 'feat_0', 'feat_1', 'feat_2', 'feat_3',
       'feat_4', 'feat_5', 'feat_6', 'feat_7', 'feat_8', 'feat_9', 'feat_10',
       'feat_11', 'feat_12', 'feat_13', 'feat_14', 'feat_15', 'feat_16',
       'feat_17', 'feat_18', 'feat_19', 'feat_20', 'feat_21', 'feat_22',
       'feat_23', 'feat_24', 'feat_25', 'feat_26', 'feat_27', 'feat_28',
       'feat_29', 'feat_30', 'user_active_degree', 'is_lowactive_period',
       'is_live_streamer', 'is_video_author', 'follow_user_num',
       'fans_user_num', 'friend_user_num', 'register_days', 'onehot_feat0',
       'onehot_feat1', 'onehot_feat2', 'onehot_feat3', 'onehot_feat4',
       'onehot_feat5', 'onehot_feat6', 'onehot_feat7', 'onehot_feat8',
       'onehot_feat9', 'onehot_feat10', 'onehot_feat11', 'onehot_feat12',
       'onehot_feat13', 'onehot_feat14', 'onehot_feat15', 'onehot_feat16',
       'onehot_feat17', 'first_level_catego