# DOWNLOADING THE DATASET

In [1]:
"""%%bash
wget --no-check-certificate 'https://drive.usercontent.google.com/download?id=1qe5hOSBxzIuxBb1G_Ih5X-O65QElollE&export=download&confirm=t&uuid=b2002093-cc6e-4bd5-be47-9603f0b33470
' -O KuaiRec.zip
unzip KuaiRec.zip -d data_final_project"""

"%%bash\nwget --no-check-certificate 'https://drive.usercontent.google.com/download?id=1qe5hOSBxzIuxBb1G_Ih5X-O65QElollE&export=download&confirm=t&uuid=b2002093-cc6e-4bd5-be47-9603f0b33470\n' -O KuaiRec.zip\nunzip KuaiRec.zip -d data_final_project"

# IMPORTS FOR THE NOTEBOOK

In [2]:
import os
import pandas as pd
import ast
from tqdm import tqdm

# LOADING THE DATA

In [3]:
root = "data_final_project/KuaiRec 2.0/data/"


def _read_clean_csv(filename, **read_kwargs):
    """Read one CSV located under `root`, then drop duplicate rows and rows with missing values."""
    return pd.read_csv(root + filename, **read_kwargs).drop_duplicates().dropna()


big_matrix = _read_clean_csv("big_matrix.csv")
small_matrix = _read_clean_csv("small_matrix.csv")
social_network = _read_clean_csv("social_network.csv")
user_features = _read_clean_csv("user_features.csv")
item_daily_features = _read_clean_csv("item_daily_features.csv")
item_categories = _read_clean_csv("item_categories.csv")
caption_category = _read_clean_csv("kuairec_caption_category.csv", lineterminator='\n')
# One category id is negative (-124) and id 30 is unused, so every negative id is remapped to 30
# to keep the ids non-negative.
first_level_ids = caption_category['first_level_category_id']
caption_category['first_level_category_id'] = first_level_ids.where(first_level_ids >= 0, 30)

# Data Cleaning

##### Cleaning steps:
- small/big_matrix: watch_ratio (we remove unrealistic ones)
- social_network: change friend list to actual list type (not str), add a column for friends count
- item_categories: we parse `feat` from a string into an actual list (the explode + one-hot encoding step is currently commented out)
- item_daily_features: we keep only NORMAL videos (we do not want to recommend ads) that are in public visible status (we cannot recommend a private video); we also apply cumulative sums to the daily counters to get each video's stats up to a given day
- user_features: good as-is
- caption_category: good as-is

In [4]:
# Keep only rows for publicly visible videos of type NORMAL (both conditions in one mask).
item_daily_features_cleaned = item_daily_features.loc[
    (item_daily_features["visible_status"] == "public")
    & (item_daily_features["video_type"] == "NORMAL")
].copy()

In [5]:
# Users we have features for.
known_user_ids = set(user_features["user_id"].unique())
# A video is "known" only if it appears in every video-level table.
video_tables = (caption_category, item_categories, item_daily_features_cleaned)
known_video_ids = set.intersection(*(set(table["video_id"].unique()) for table in video_tables))
print(f"We have data for {len(known_user_ids)} users and {len(known_video_ids)} videos")

We have data for 6899 users and 8754 videos


In [6]:
# Show the earliest and latest interaction time of each matrix (big first, then small).
for interactions in (big_matrix, small_matrix):
    times = interactions["time"]
    print(times.min(), times.max())

2020-06-23 08:34:11.373 2020-09-10 07:32:12.427
2020-07-04 02:23:26.06 2020-09-05 23:57:23.683


#### big & small_matrix:

In [None]:
def clean_big_small_matrix(
    df: pd.DataFrame,
    ratio_agg: str = "sum",
    user_ids=None,
    video_ids=None,
) -> pd.DataFrame:
    """Clean an interaction matrix (``big_matrix`` / ``small_matrix``).

    Steps, in order:
      1. drop the ``play_duration``, ``date`` and ``timestamp`` columns;
      2. drop rows containing nulls, then exact duplicate rows;
      3. keep rows whose ``watch_ratio`` is non-zero and below 5;
      4. keep rows whose ``user_id`` / ``video_id`` are known;
      5. parse ``time`` into datetimes (with or without fractional seconds);
      6. replace ``watch_ratio`` with its per-(user, video) aggregate, repeated on
         every row of the pair, then drop rows whose aggregate is 5 or more.

    Args:
        df: Raw interactions. Must contain ``user_id``, ``video_id``, ``time``,
            ``watch_ratio`` and the three dropped columns. It is not modified.
        ratio_agg: Aggregation name passed to ``groupby(...).transform``. The default
            ``"sum"`` is the behaviour currently in use; the original notebook notes
            that ``"max"`` (best watch ratio) is meant to be restored later.
        user_ids: Collection of accepted user ids. Defaults to the notebook-level
            ``known_user_ids``.
        video_ids: Collection of accepted video ids. Defaults to the notebook-level
            ``known_video_ids``.

    Returns:
        A new DataFrame with the cleaned interactions, original row order preserved.

    Raises:
        AssertionError: If any null value (including an unparseable ``time``) remains.
    """
    # Fall back to the notebook-level id sets when the caller does not supply any.
    if user_ids is None:
        user_ids = known_user_ids
    if video_ids is None:
        video_ids = known_video_ids

    max_watch_ratio = 5  # ratios at or above this are considered unrealistic

    # `drop` already returns a new frame, so the input is left untouched without a full copy.
    cleaned = df.drop(columns=["play_duration", "date", "timestamp"])
    # We remove null values, duplicates and unrealistic watch_ratio values
    cleaned = cleaned.dropna().drop_duplicates()
    cleaned = cleaned[(cleaned["watch_ratio"] < max_watch_ratio) & (cleaned["watch_ratio"] != 0)]
    # We remove interactions with unknown users or unknown videos
    cleaned = cleaned[cleaned["user_id"].isin(user_ids) & cleaned["video_id"].isin(video_ids)]

    # We convert times to actual datetimes. A strict "%f" format rejects values that have
    # no fractional seconds, which would become NaT and trip the final assertion, so those
    # rows get a second parsing attempt with a seconds-only format.
    raw_time = cleaned["time"]
    parsed_time = pd.to_datetime(raw_time, format="%Y-%m-%d %H:%M:%S.%f", errors="coerce")
    unparsed = parsed_time.isna()
    if unparsed.any():
        parsed_time.loc[unparsed] = pd.to_datetime(
            raw_time[unparsed], format="%Y-%m-%d %H:%M:%S", errors="coerce"
        )
    cleaned = cleaned.assign(time=parsed_time)

    # If a user interacted several times with the same video, every row of that pair gets
    # the aggregated watch ratio; the rows themselves (and their times) are kept.
    pair_ratio = cleaned.groupby(["user_id", "video_id"])["watch_ratio"].transform(ratio_agg)
    cleaned = cleaned.assign(watch_ratio=pair_ratio)
    # The aggregate can exceed the threshold again, so the filter is re-applied.
    cleaned = cleaned[cleaned["watch_ratio"] < max_watch_ratio]

    assert not cleaned.isnull().any().any(), "cleaned interactions still contain null values"
    return cleaned

# Run the same cleaning pipeline on both interaction matrices (small first, then big).
small_matrix_cleaned, big_matrix_cleaned = map(clean_big_small_matrix, (small_matrix, big_matrix))

#### social_network:

In [8]:
# We remove unknown users (explicit copy so the column assignments below act on our own frame)
social_network_cleaned = social_network.loc[social_network["user_id"].isin(known_user_ids)].copy()
# Convert string to actual list of ints
social_network_cleaned["friend_list"] = social_network_cleaned["friend_list"].apply(ast.literal_eval)
# Add friend_count column
social_network_cleaned["friend_count"] = social_network_cleaned["friend_list"].apply(len)
# Add users with no friends for consistency.
# The ids already present are collected once into a set, so each membership test is O(1)
# instead of recomputing `.unique()` and scanning the array for every known user.
users_with_friends = set(social_network_cleaned["user_id"])
missing_ids = [user_id for user_id in known_user_ids if user_id not in users_with_friends]
missing_users_social = pd.DataFrame({
    "user_id": missing_ids,
    "friend_list": [[] for _ in missing_ids],  # a distinct empty list per user
    "friend_count": [0] * len(missing_ids),
})
social_network_cleaned = pd.concat([social_network_cleaned, missing_users_social], ignore_index=True)
assert len(social_network_cleaned) == len(known_user_ids), "expected exactly one social row per known user"
# 93.5% of users dont have friends so we won't use it
print(len(social_network_cleaned[social_network_cleaned["friend_count"] == 0]) / len(social_network_cleaned["friend_count"]) * 100)

93.50630526163212


#### item_categories:

In [9]:
# Restrict the category table to the known videos.
item_categories_cleaned = item_categories.loc[item_categories["video_id"].isin(known_video_ids)].copy()
# `feat` is read as a string; parse it into an actual Python list.
item_categories_cleaned["feat"] = item_categories_cleaned["feat"].map(ast.literal_eval)
# Explode + vector encoding of `feat` (currently disabled):
#item_categories_cleaned = item_categories_cleaned.explode("feat")
# Create the vector encoding
#item_categories_cleaned = item_categories_cleaned.assign(present=1).pivot(index="video_id", columns="feat", values="present").fillna(0).reindex(columns=sorted(item_categories_cleaned["feat"].unique())).astype(int)
#item_categories_cleaned.columns = [f"feat_{feat}" for feat in item_categories_cleaned.columns]
#item_categories_cleaned = item_categories_cleaned.reset_index()
#assert(len(item_categories_cleaned["video_id"]) == len(known_video_ids))
item_categories_cleaned

Unnamed: 0,video_id,feat
0,0,[8]
2,2,[9]
4,4,[5]
5,5,[6]
6,6,[19]
...,...,...
10722,10722,[5]
10723,10723,[11]
10724,10724,[2]
10726,10726,[19]


#### item_daily_features:

In [10]:
stats_cols = ["valid_play_cnt", "like_cnt", "comment_cnt", "share_cnt", "follow_cnt", "collect_cnt", "download_cnt"]
useful_cols = ["video_id", "date", "video_duration", "upload_dt"] + stats_cols

# Rebuild from the raw frame rather than from the previous value of
# `item_daily_features_cleaned`: the cumulative sum below would otherwise be applied
# on top of itself every time this cell is re-run.
keep_rows = (
    item_daily_features["video_id"].isin(known_video_ids)
    & (item_daily_features["visible_status"] == "public")
    & (item_daily_features["video_type"] == "NORMAL")
)
item_daily_features_cleaned = item_daily_features.loc[keep_rows, useful_cols].copy()

# Both date columns go through `str` before parsing; values that cannot be parsed become NaT.
item_daily_features_cleaned["date"] = pd.to_datetime(item_daily_features_cleaned["date"].astype(str), errors="coerce")
item_daily_features_cleaned["upload_dt"] = pd.to_datetime(item_daily_features_cleaned["upload_dt"].astype(str), errors="coerce")

# Sort chronologically so the per-video cumulative sum gives the totals up to each day.
item_daily_features_cleaned = item_daily_features_cleaned.sort_values(["date", "video_id"])
item_daily_features_cleaned[stats_cols] = item_daily_features_cleaned.groupby("video_id")[stats_cols].cumsum()
item_daily_features_cleaned

Unnamed: 0,video_id,date,video_duration,upload_dt,valid_play_cnt,like_cnt,comment_cnt,share_cnt,follow_cnt,collect_cnt,download_cnt
23,0,2020-07-28,5966.0,2020-03-30,4280,462,10,0,247,2.0,3
149,2,2020-07-28,8000.0,2020-04-11,7,1,0,0,0,0.0,0
274,4,2020-07-28,18000.0,2020-04-12,0,0,0,0,0,0.0,0
327,5,2020-07-28,8000.0,2020-04-15,190,37,2,2,4,5.0,0
390,6,2020-07-28,6000.0,2020-04-15,6,1,0,0,0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...
343335,10722,2020-09-05,9700.0,2020-09-05,143,4,1,0,1,0.0,0
343336,10723,2020-09-05,4833.0,2020-09-05,114,24,0,0,0,0.0,0
343337,10724,2020-09-05,54720.0,2020-09-05,754,264,29,1,8,0.0,1
343339,10726,2020-09-05,5132.0,2020-09-05,5382,44,0,1,2,0.0,2


#### user_features:

In [11]:
# Columns kept for modelling: the descriptive user columns plus the 18 one-hot features.
useful_cols = ["user_id", "user_active_degree", "is_lowactive_period", "is_live_streamer", "is_video_author", "follow_user_num", "fans_user_num", "friend_user_num", "register_days"]
useful_cols.extend(f"onehot_feat{i}" for i in range(18))
# Keep known users only and project onto the selected columns in one step.
user_features_cleaned = user_features.loc[user_features["user_id"].isin(known_user_ids), useful_cols]
#user_features_cleaned

#### caption_category

In [12]:
# Only the video id and its first-level category are kept.
useful_cols = ["video_id", "first_level_category_id"]
caption_category_cleaned = caption_category.loc[:, useful_cols].copy()
# Vector encoding of the category (currently disabled):
#caption_category_cleaned = caption_category_cleaned.assign(present=1).pivot(index="video_id", columns="first_level_category_id", values="present").fillna(0).reindex(columns=sorted(caption_category_cleaned["first_level_category_id"].unique())).astype(int)
#caption_category_cleaned.columns = [f"category_{category}" for category in caption_category_cleaned.columns]
#caption_category_cleaned = caption_category_cleaned.reset_index()
#assert(len(caption_category_cleaned["video_id"]) == len(known_video_ids))
caption_category_cleaned

Unnamed: 0,video_id,first_level_category_id
0,0,8
2,2,9
3,3,26
4,4,5
5,5,6
...,...,...
10722,10722,5
10723,10723,33
10724,10724,6
10726,10726,38


# Aggregating Train and Test Data

In [13]:
"""train_data = big_matrix_cleaned.copy().sort_values(["time", "video_id"])
test_data = small_matrix_cleaned.copy().sort_values(["time", "video_id"])

train_data = train_data.merge(social_network_cleaned, on="user_id", how="left")
train_data = train_data.merge(item_categories_cleaned, on="video_id", how="left")
train_data = train_data.merge(user_features_cleaned, on="user_id", how="left")
train_data = train_data.merge(caption_category_cleaned, on="video_id", how="left")
train_data = pd.merge_asof(train_data, item_daily_features_cleaned, by="video_id", left_on="time", right_on="date", direction="backward")
train_data = train_data.fillna(0)

test_data = test_data.merge(social_network_cleaned, on="user_id", how="left")
test_data = test_data.merge(item_categories_cleaned, on="video_id", how="left")
test_data = test_data.merge(user_features_cleaned, on="user_id", how="left")
test_data = test_data.merge(caption_category_cleaned, on="video_id", how="left")
test_data = pd.merge_asof(test_data, item_daily_features_cleaned, by="video_id", left_on="time", right_on="date", direction="backward")
test_data = test_data.fillna(0)"""

'train_data = big_matrix_cleaned.copy().sort_values(["time", "video_id"])\ntest_data = small_matrix_cleaned.copy().sort_values(["time", "video_id"])\n\ntrain_data = train_data.merge(social_network_cleaned, on="user_id", how="left")\ntrain_data = train_data.merge(item_categories_cleaned, on="video_id", how="left")\ntrain_data = train_data.merge(user_features_cleaned, on="user_id", how="left")\ntrain_data = train_data.merge(caption_category_cleaned, on="video_id", how="left")\ntrain_data = pd.merge_asof(train_data, item_daily_features_cleaned, by="video_id", left_on="time", right_on="date", direction="backward")\ntrain_data = train_data.fillna(0)\n\ntest_data = test_data.merge(social_network_cleaned, on="user_id", how="left")\ntest_data = test_data.merge(item_categories_cleaned, on="video_id", how="left")\ntest_data = test_data.merge(user_features_cleaned, on="user_id", how="left")\ntest_data = test_data.merge(caption_category_cleaned, on="video_id", how="left")\ntest_data = pd.merge_aso

# SAVING CLEANED DATA

In [14]:
export_dir = "./exports"
# `exist_ok=True` creates the directory if needed without a separate existence check.
os.makedirs(export_dir, exist_ok=True)

# Each cleaned frame is written to "<export_dir>/<name>.pq".
cleaned_frames = {
    "big_matrix_cleaned": big_matrix_cleaned,
    "small_matrix_cleaned": small_matrix_cleaned,
    "social_network_cleaned": social_network_cleaned,
    "user_features_cleaned": user_features_cleaned,
    "item_categories_cleaned": item_categories_cleaned,
    "item_daily_features_cleaned": item_daily_features_cleaned,
    "caption_category_cleaned": caption_category_cleaned,
}
for frame_name, frame in cleaned_frames.items():
    frame.to_parquet(f"{export_dir}/{frame_name}.pq")