## Import necessary libraries

In [1]:
# Data imports
import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import seaborn as sns
import ast
import re

SEED = 42
warnings.filterwarnings("ignore")
np.random.seed(SEED)

## Load data

In [2]:
data_path = "data_final_project/KuaiRec 2.0/data/"

# load small matrix because big matrix breaks the ram
small_matrix = pd.read_csv(data_path + "small_matrix.csv").drop_duplicates().dropna()
user_features = pd.read_csv(data_path + "user_features.csv").drop_duplicates().dropna()
item_categories = pd.read_csv(data_path + "item_categories.csv").drop_duplicates().dropna()
item_daily_features = pd.read_csv(data_path + "item_daily_features.csv").drop_duplicates().dropna()
social_network = pd.read_csv(data_path + "social_network.csv").drop_duplicates().dropna()
caption_category = pd.read_csv(data_path + "kuairec_caption_category.csv", lineterminator='\n').drop_duplicates().dropna()

In [3]:
small_matrix.head()

Unnamed: 0,user_id,video_id,play_duration,video_duration,time,date,timestamp,watch_ratio
0,14,148,4381,6067,2020-07-05 05:27:48.378,20200705.0,1593898000.0,0.722103
1,14,183,11635,6100,2020-07-05 05:28:00.057,20200705.0,1593898000.0,1.907377
2,14,3649,22422,10867,2020-07-05 05:29:09.479,20200705.0,1593898000.0,2.063311
3,14,5262,4479,7908,2020-07-05 05:30:43.285,20200705.0,1593898000.0,0.566388
4,14,8234,4602,11000,2020-07-05 05:35:43.459,20200705.0,1593899000.0,0.418364


In [4]:
user_features.head()

Unnamed: 0,user_id,user_active_degree,is_lowactive_period,is_live_streamer,is_video_author,follow_user_num,follow_user_num_range,fans_user_num,fans_user_num_range,friend_user_num,...,onehot_feat8,onehot_feat9,onehot_feat10,onehot_feat11,onehot_feat12,onehot_feat13,onehot_feat14,onehot_feat15,onehot_feat16,onehot_feat17
0,0,high_active,0,0,0,5,"(0,10]",0,0,0,...,184,6,3,0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,full_active,0,0,0,386,"(250,500]",4,"[1,10)",2,...,186,6,2,0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,full_active,0,0,0,27,"(10,50]",0,0,0,...,51,2,3,0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,full_active,0,0,0,16,"(10,50]",0,0,0,...,251,3,2,0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,full_active,0,0,0,122,"(100,150]",4,"[1,10)",0,...,99,4,2,0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
item_categories.head()

Unnamed: 0,video_id,feat
0,0,[8]
1,1,"[27, 9]"
2,2,[9]
3,3,[26]
4,4,[5]


In [6]:
item_daily_features.head()

Unnamed: 0,video_id,date,author_id,video_type,upload_dt,upload_type,visible_status,video_duration,video_width,video_height,...,download_cnt,download_user_num,report_cnt,report_user_num,reduce_similar_cnt,reduce_similar_user_num,collect_cnt,collect_user_num,cancel_collect_cnt,cancel_collect_user_num
23,0,20200728,3309,NORMAL,2020-03-30,ShortImport,public,5966.0,720,1280,...,3,3,0,0,3,3,2.0,2.0,0.0,0.0
24,0,20200729,3309,NORMAL,2020-03-30,ShortImport,public,5966.0,720,1280,...,5,5,0,0,5,5,7.0,6.0,4.0,4.0
25,0,20200730,3309,NORMAL,2020-03-30,ShortImport,public,5966.0,720,1280,...,3,3,0,0,1,1,6.0,6.0,3.0,3.0
26,0,20200731,3309,NORMAL,2020-03-30,ShortImport,public,5966.0,720,1280,...,9,9,0,0,5,5,3.0,3.0,2.0,2.0
27,0,20200801,3309,NORMAL,2020-03-30,ShortImport,public,5966.0,720,1280,...,9,9,0,0,3,3,7.0,7.0,3.0,3.0


In [7]:
social_network.head()

Unnamed: 0,user_id,friend_list
0,3371,[2975]
1,24,[2665]
2,4402,[38]
3,4295,[4694]
4,7087,[7117]


In [8]:
caption_category.head()

Unnamed: 0,video_id,manual_cover_text,caption,topic_tag,first_level_category_id,first_level_category_name,second_level_category_id,second_level_category_name,third_level_category_id,third_level_category_name
0,0,UNKNOWN,精神小伙路难走 程哥你狗粮慢点撒,[],8,颜值,673,颜值随拍,-124,UNKNOWN
2,2,UNKNOWN,晚饭后，运动一下！,[],9,喜剧,727,搞笑互动,-124,UNKNOWN
3,3,UNKNOWN,我平淡无奇，惊艳不了时光，温柔不了岁月，我只想漫无目的的走走，努力发笔小财，给自己买花 自己长大.,[],26,摄影,686,主题摄影,2434,景物摄影
4,4,五爱街最美美女 一天1q,#搞笑 #感谢快手我要上热门 #五爱市场 这真是完美搭配啊！,"[五爱市场,感谢快手我要上热门,搞笑]",5,时尚,737,营销售卖,2596,女装
5,5,UNKNOWN,“你们吵的越狠 他们的手就握的越紧” #文轩 #刘耀文 #宋亚轩 #顾子璇...,"[刘耀文,宋亚轩,文轩,顾子璇是樱桃吖,顾子璇超级喜欢文轩]",6,明星娱乐,667,娱乐八卦,2375,饭制


## Fix negative category id to be the 30th one

In [9]:
categories = caption_category["first_level_category_id"].unique()
categories.sort()
categories

array([-124,    1,    2,    3,    4,    5,    6,    7,    8,    9,   10,
         11,   12,   13,   14,   15,   16,   17,   18,   19,   20,   21,
         22,   23,   24,   25,   26,   27,   28,   29,   31,   32,   33,
         34,   35,   36,   37,   38,   39])

In [10]:
caption_category["first_level_category_id"] = [x if x >= 0 else 30 for x in caption_category["first_level_category_id"]]

## Data cleaning

### Remove ads and private videos

In [11]:
item_daily_features_cleaned = item_daily_features[item_daily_features["video_type"] == "NORMAL"]
item_daily_features_cleaned = item_daily_features_cleaned[item_daily_features_cleaned["visible_status"] == "public"]
item_daily_features_cleaned

Unnamed: 0,video_id,date,author_id,video_type,upload_dt,upload_type,visible_status,video_duration,video_width,video_height,...,download_cnt,download_user_num,report_cnt,report_user_num,reduce_similar_cnt,reduce_similar_user_num,collect_cnt,collect_user_num,cancel_collect_cnt,cancel_collect_user_num
23,0,20200728,3309,NORMAL,2020-03-30,ShortImport,public,5966.0,720,1280,...,3,3,0,0,3,3,2.0,2.0,0.0,0.0
24,0,20200729,3309,NORMAL,2020-03-30,ShortImport,public,5966.0,720,1280,...,5,5,0,0,5,5,7.0,6.0,4.0,4.0
25,0,20200730,3309,NORMAL,2020-03-30,ShortImport,public,5966.0,720,1280,...,3,3,0,0,1,1,6.0,6.0,3.0,3.0
26,0,20200731,3309,NORMAL,2020-03-30,ShortImport,public,5966.0,720,1280,...,9,9,0,0,5,5,3.0,3.0,2.0,2.0
27,0,20200801,3309,NORMAL,2020-03-30,ShortImport,public,5966.0,720,1280,...,9,9,0,0,3,3,7.0,7.0,3.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
343336,10723,20200905,236,NORMAL,2020-09-05,ShortImport,public,4833.0,720,1280,...,0,0,0,0,0,0,0.0,0.0,0.0,0.0
343337,10724,20200905,5271,NORMAL,2020-09-05,LongImport,public,54720.0,720,1280,...,1,1,0,0,0,0,0.0,0.0,0.0,0.0
343338,10725,20200905,1924,NORMAL,2020-09-05,ShortImport,public,15800.0,576,1024,...,5,5,0,0,4,4,0.0,0.0,0.0,0.0
343339,10726,20200905,7604,NORMAL,2020-09-05,ShortImport,public,5132.0,528,960,...,2,2,0,0,1,1,0.0,0.0,0.0,0.0


### Set known user and video ids which are known throughout all databases to avoid NaN

In [12]:
known_user_ids = set(user_features["user_id"].unique()) & set(small_matrix["user_id"].unique())
known_video_ids = set(caption_category["video_id"].unique()) & set(small_matrix["video_id"].unique()) & set(item_daily_features_cleaned["video_id"].unique())

### small_matrix / big_matrix
- time is kept for train / validation / test splitting
- watch_ratio 
    - clean unrealistic data (more than 5 rewatch) 
    - split the interation in 3 categories:
        - negative (less than 0.65)
        - neutral (between 0.65 and 1.2)
        - like (more than 1.2)
- video duration
    - remove irrelevant data (video lasts for more than 2.6 minutes)
    - split the interaction in 3 categories:
        - short video (less than 7 seconds)
        - medium video (between 7 and 16 seconds)
        - long video (longer than 16 seconds)

All user_id / video_id which are not present in the "user_features" / "caption_category" dataframe are removed since they lack information.

In [13]:
# Copy the dataframe while only keeping the relevant columns
small_matrix_cleaned = small_matrix.copy()
small_matrix_cleaned = small_matrix_cleaned[["user_id", "video_id", "watch_ratio", "time", "video_duration"]]

# Remove irrelevant / unrealistic data
small_matrix_cleaned = small_matrix_cleaned[small_matrix_cleaned["video_duration"] <= 160000]
small_matrix_cleaned = small_matrix_cleaned[small_matrix_cleaned["watch_ratio"] <= 5]

# Categorize watch_ratio
small_matrix_cleaned["like"] = np.where(small_matrix_cleaned["watch_ratio"] <= 0.65, -1,
                                np.where(small_matrix_cleaned["watch_ratio"] <= 1.2, 0, 1))

# Categorize video_length
small_matrix_cleaned["video_length"] = np.where(small_matrix_cleaned["video_duration"] <= 7000, -1,
                                np.where(small_matrix_cleaned["video_duration"] <= 16000, 0, 1))

# Drop useless columns
#small_matrix_cleaned = small_matrix_cleaned.drop(columns=["watch_ratio", "video_duration"])
small_matrix_cleaned = small_matrix_cleaned.drop(columns=["video_duration"])

# Remove users which lack information
small_matrix_cleaned = small_matrix_cleaned[small_matrix_cleaned["user_id"].isin(known_user_ids)]

# Remove videos which lack information
small_matrix_cleaned = small_matrix_cleaned[small_matrix_cleaned["video_id"].isin(known_video_ids)]

# Check dataframe
small_matrix_cleaned.head()

Unnamed: 0,user_id,video_id,watch_ratio,time,like,video_length
0,14,148,0.722103,2020-07-05 05:27:48.378,0,-1
1,14,183,1.907377,2020-07-05 05:28:00.057,1,-1
2,14,3649,2.063311,2020-07-05 05:29:09.479,1,0
3,14,5262,0.566388,2020-07-05 05:30:43.285,-1,0
4,14,8234,0.418364,2020-07-05 05:35:43.459,-1,0


### Item_categories
- Explode video tags and create a one hot encoded vector containing the tags of each video

All video_id which are not present in the "caption_category" dataframe are removed since they lack information.

In [14]:
# Copy the dataframe and fix its format
item_categories_cleaned = item_categories.copy()
item_categories_cleaned["feat"] = item_categories_cleaned["feat"].apply(ast.literal_eval)

# Explode item tag lists
item_categories_cleaned = item_categories_cleaned.explode("feat")

# Make it a one hot encoded vector
item_categories_cleaned = pd.crosstab(item_categories_cleaned["video_id"], item_categories_cleaned["feat"])

# Set columns' names
item_categories_cleaned.columns = [f"tag_{i}" for i in range(31)]

# Fix indexes
item_categories_cleaned = item_categories_cleaned.reset_index()

# Remove videos which lack information
item_categories_cleaned = item_categories_cleaned[item_categories_cleaned["video_id"].isin(known_video_ids)]

# Check dataframe
item_categories_cleaned.head()

Unnamed: 0,video_id,tag_0,tag_1,tag_2,tag_3,tag_4,tag_5,tag_6,tag_7,tag_8,...,tag_21,tag_22,tag_23,tag_24,tag_25,tag_26,tag_27,tag_28,tag_29,tag_30
103,103,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
109,109,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
120,120,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
122,122,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
130,130,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Social network

In [15]:
# Copy the dataframe and fix its format
social_network_cleaned = social_network.copy()
social_network_cleaned["friend_list"] = social_network_cleaned["friend_list"].apply(ast.literal_eval)

# Remove users which lack information
social_network_cleaned = social_network_cleaned[social_network_cleaned["user_id"].isin(known_video_ids)]

# Check dataframe
social_network_cleaned.head()

Unnamed: 0,user_id,friend_list
4,7087,[7117]
6,676,[5626]
10,578,"[1848, 6260]"
11,6055,"[730, 1103]"
12,670,[3732]


### User_features
- Keep all encoded features since all decoded features seem useless in our case
- Scale all encoded features

In [16]:
# Copy the dataframe while only keeping the relevant columns
user_features_cleaned = user_features.copy()
user_features_cleaned = user_features_cleaned[["user_id"] + [f'onehot_feat{i}' for i in range(18)]]

# Rescale data
for row in [f'onehot_feat{i}' for i in range(18)]:
    scaler = MinMaxScaler((-1, 1))
    user_features_cleaned[row] = scaler.fit_transform(user_features_cleaned[[row]])

# Check dataframe
user_features_cleaned.head()

Unnamed: 0,user_id,onehot_feat0,onehot_feat1,onehot_feat2,onehot_feat3,onehot_feat4,onehot_feat5,onehot_feat6,onehot_feat7,onehot_feat8,onehot_feat9,onehot_feat10,onehot_feat11,onehot_feat12,onehot_feat13,onehot_feat14,onehot_feat15,onehot_feat16,onehot_feat17
0,0,-1.0,-0.666667,0.172414,0.19141,-0.636364,-1.0,0.0,-0.73913,0.085546,1.0,0.5,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
1,1,-1.0,0.0,0.724138,0.906629,-1.0,-1.0,0.0,-0.73913,0.097345,1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
2,2,-1.0,1.0,-0.448276,-0.2493,-1.0,-1.0,-1.0,-0.913043,-0.699115,-0.333333,0.5,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
3,3,-1.0,-0.666667,-0.448276,-0.475257,-1.0,-1.0,-1.0,0.478261,0.480826,0.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
4,4,-1.0,-0.666667,-0.448276,-0.409897,-0.818182,-1.0,0.0,1.0,-0.415929,0.333333,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0


### Caption_category
- Explode first_level_category_id and create a one hot encoded vector containing the first level category id of each video

In [17]:
# Copy the dataframe while only keeping the relevant columns
caption_category_cleaned = caption_category.copy()
caption_category_cleaned = caption_category_cleaned[["video_id", "first_level_category_id"]]

# Make the first level category id a one hot encoded vector
caption_category_cleaned = pd.crosstab(caption_category_cleaned["video_id"], caption_category_cleaned["first_level_category_id"])

# Set columns' names
caption_category_cleaned.columns = [f"category_{i}" for i in range(39)]

# Fix indexes
caption_category_cleaned = caption_category_cleaned.reset_index()

# Check dataframe
caption_category_cleaned.head()

Unnamed: 0,video_id,category_0,category_1,category_2,category_3,category_4,category_5,category_6,category_7,category_8,...,category_29,category_30,category_31,category_32,category_33,category_34,category_35,category_36,category_37,category_38
0,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Save Data

In [None]:
small_matrix_cleaned.to_parquet("data/small_matrix_cleaned.parquet")
user_features.to_parquet("data/user_features.parquet")
social_network_cleaned.to_parquet("data/social_network_cleaned.parquet")