# Import necessary libraries

In [1]:
# Standard-library, third-party and project-local imports
import sys
# Extend the module search path with the parent directory before `import utils`
# below (presumably that is where the project-local utils module lives — confirm).
sys.path.append('../')
import numpy as np
import pandas as pd
from pathlib import Path
import utils

# Prepare environment

In [2]:
# --- Reproducibility -------------------------------------------------------
# Seed NumPy's legacy global RNG.
SEED = 42
np.random.seed(SEED)

# --- Tunable constants -----------------------------------------------------
# NOTE(review): neither constant is referenced in the cells visible here;
# presumably they are consumed further down the pipeline — confirm.
TRAIN_TEST_SPLIT = 0.8
CORR_THRESHOLD = 0.2

# --- Directory layout (plain strings: later cells build paths with `+`) ----
CLEANED_DATA_PATH = "../data/cleaned/"
INTERMEDIATE_SAVE_PATH = "../data/big_matrix_intermediate_saves/"
FEATURES_PATH = "../data/features/"

# Output directories are created up front; the cleaned-data directory is only
# read from, so it is not created here.
for _output_dir in (INTERMEDIATE_SAVE_PATH, FEATURES_PATH):
    Path(_output_dir).mkdir(parents=True, exist_ok=True)

# Load Data

In [3]:
def _read_cleaned(table_name):
    """Read ``<CLEANED_DATA_PATH><table_name>.parquet`` into a DataFrame."""
    return pd.read_parquet(CLEANED_DATA_PATH + table_name + ".parquet")


# One DataFrame per cleaned table; each variable is named after its file.
big_matrix_cleaned = _read_cleaned("big_matrix_cleaned")
user_features_cleaned = _read_cleaned("user_features_cleaned")
item_categories_cleaned = _read_cleaned("item_categories_cleaned")
caption_category_cleaned = _read_cleaned("caption_category_cleaned")
social_network_cleaned = _read_cleaned("social_network_cleaned")

# Check Data

In [4]:
# Preview the cleaned user/video interaction table (the rendered output is
# truncated to the first and last rows).
big_matrix_cleaned

Unnamed: 0,user_id,video_id,watch_ratio,time,like,video_length
0,0,3649,1.273397,2020-07-05 00:08:23.438,1,0
2,0,5262,0.107613,2020-07-05 00:16:06.687,-1,0
3,0,1963,0.089885,2020-07-05 00:20:26.792,-1,0
4,0,8234,0.078000,2020-07-05 00:43:05.128,-1,0
5,0,8228,1.572295,2020-07-05 01:00:25.500,1,0
...,...,...,...,...,...,...
12530799,7175,6597,1.004462,2020-09-05 06:35:01.104,0,0
12530800,7175,6630,0.313389,2020-09-05 15:00:33.379,-1,0
12530803,7175,10360,0.340597,2020-09-05 19:10:29.041,-1,0
12530804,7175,10360,0.913400,2020-09-05 19:10:36.995,0,0


In [5]:
# Preview the cleaned user feature table (rows and columns are truncated in
# the rendered output).
user_features_cleaned

Unnamed: 0,user_id,user_active_degree,is_lowactive_period,is_live_streamer,is_video_author,follow_user_num,follow_user_num_range,fans_user_num,fans_user_num_range,friend_user_num,...,onehot_feat8,onehot_feat9,onehot_feat10,onehot_feat11,onehot_feat12,onehot_feat13,onehot_feat14,onehot_feat15,onehot_feat16,onehot_feat17
0,0,0,0,0,0,5,0,0,0,0,...,0.085546,1.000000,0.5,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
1,1,1,0,0,0,386,1,4,1,2,...,0.097345,1.000000,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
2,2,1,0,0,0,27,2,0,0,0,...,-0.699115,-0.333333,0.5,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
3,3,1,0,0,0,16,2,0,0,0,...,0.480826,0.000000,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
4,4,1,0,0,0,122,3,4,1,0,...,-0.415929,0.333333,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7171,7171,1,0,0,1,52,5,1,1,0,...,0.528024,-0.666667,1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0
7172,7172,1,0,0,0,45,2,2,1,2,...,-0.935103,-0.333333,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0
7173,7173,1,0,0,0,615,4,3,1,2,...,-0.699115,-0.333333,0.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0
7174,7174,1,0,0,0,959,4,0,0,0,...,-0.368732,0.000000,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0


In [6]:
# Preview the cleaned item table: video_id, feat and video_watch_ratio_mean.
item_categories_cleaned

Unnamed: 0,video_id,feat,video_watch_ratio_mean
0,0,8,1.344039
2,2,9,1.163905
4,4,5,0.617457
5,5,6,0.862989
6,6,19,1.483494
...,...,...,...
10722,10722,5,1.167776
10723,10723,11,1.411720
10724,10724,2,1.450592
10726,10726,19,1.810170


In [7]:
# Preview the cleaned caption table: video_id and first_level_category_id.
caption_category_cleaned

Unnamed: 0,video_id,first_level_category_id
0,0,8
2,2,9
3,3,26
4,4,5
5,5,6
...,...,...
10722,10722,5
10723,10723,33
10724,10724,6
10726,10726,38


In [8]:
# Preview the cleaned social network table: user_id and friend_list.
social_network_cleaned

Unnamed: 0,user_id,friend_list
0,3371,[2975]
1,24,[2665]
2,4402,[38]
3,4295,[4694]
4,7087,[7117]
...,...,...
467,2331,[4345]
468,6163,[1332]
469,3732,[670]
470,3335,[202]


# Data preprocessing for AI

### For each user, compute their average watch_ratio and the average of the duration of the videos they watch

In [9]:
# Per-user aggregates derived from the cleaned interaction table. The
# aggregation is implemented in utils.compute_averages; the displayed result
# has columns user_id, user_watch_ratio_mean and user_video_length_mean.
data_avg_per_user = utils.compute_averages(big_matrix_cleaned)

# Display the result as a quick sanity check
data_avg_per_user

Unnamed: 0,user_id,user_watch_ratio_mean,user_video_length_mean
0,0,0.981752,-0.136428
1,1,0.902738,-0.064269
2,2,0.631369,0.017857
3,3,0.917419,-0.159442
4,4,0.807428,0.119497
...,...,...,...
6894,7171,0.793249,0.053783
6895,7172,0.951209,-0.040759
6896,7173,0.700544,0.108225
6897,7174,0.795941,-0.046276


### Merge the per-user averages and friend lists into big_matrix

In [10]:
# Combine the interaction table with the per-user averages and the friend
# lists (implemented in utils.merge_matrix_avgs_friend_list). In the displayed
# rows friend_list is [] — presumably users without an entry in
# social_network_cleaned get an empty list; confirm in utils.
enhanced_data = utils.merge_matrix_avgs_friend_list(big_matrix_cleaned, data_avg_per_user, social_network_cleaned)

# Display the result as a quick sanity check
enhanced_data

Unnamed: 0,user_id,video_id,watch_ratio,like,video_length,time,user_watch_ratio_mean,user_video_length_mean,friend_list
0,0,3649,1.273397,1,0,2020-07-05 00:08:23.438,0.981752,-0.136428,[]
1,0,5262,0.107613,-1,0,2020-07-05 00:16:06.687,0.981752,-0.136428,[]
2,0,1963,0.089885,-1,0,2020-07-05 00:20:26.792,0.981752,-0.136428,[]
3,0,8234,0.078000,-1,0,2020-07-05 00:43:05.128,0.981752,-0.136428,[]
4,0,8228,1.572295,1,0,2020-07-05 01:00:25.500,0.981752,-0.136428,[]
...,...,...,...,...,...,...,...,...,...
9727433,7175,6597,1.004462,0,0,2020-09-05 06:35:01.104,0.790256,0.006378,[]
9727434,7175,6630,0.313389,-1,0,2020-09-05 15:00:33.379,0.790256,0.006378,[]
9727435,7175,10360,0.340597,-1,0,2020-09-05 19:10:29.041,0.790256,0.006378,[]
9727436,7175,10360,0.913400,0,0,2020-09-05 19:10:36.995,0.790256,0.006378,[]


In [11]:
# Columns of the final per-(user, category) feature table.
white_list = ["user_id", "first_level_category_id", "user_category_watch_ratio_mean"]

# Attach the first-level category of each watched video to its interaction row.
# The left join keeps every interaction, matched or not.
avg_user_category = pd.merge(
    big_matrix_cleaned[["user_id", "video_id", "watch_ratio"]],
    caption_category_cleaned,
    on="video_id",
    how="left",
)

# Mean watch_ratio of each (user, category) pair, broadcast back onto every
# interaction row; any missing mean is replaced by 0.
avg_user_category["user_category_watch_ratio_mean"] = (
    avg_user_category
    .groupby(["user_id", "first_level_category_id"])["watch_ratio"]
    .transform("mean")
    .fillna(0)
)

# Collapse to one row per distinct (user, category, mean) and order the table.
avg_user_category = (
    avg_user_category[white_list]
    .drop_duplicates()
    .sort_values(["user_id", "first_level_category_id"])
    .reset_index(drop=True)
)

avg_user_category

Unnamed: 0,user_id,first_level_category_id,user_category_watch_ratio_mean
0,0,1,0.984820
1,0,2,0.824647
2,0,3,0.949826
3,0,4,0.921281
4,0,5,1.022317
...,...,...,...
251567,7175,35,1.250323
251568,7175,36,0.583211
251569,7175,37,0.576903
251570,7175,38,0.319926


In [12]:
# Columns of the final per-(user, feat) feature table.
white_list = ["user_id", "feat", "user_feat_watch_ratio_mean"]

# Attach the item-level columns (including `feat`) of each watched video to
# its interaction row. The left join keeps every interaction, matched or not.
avg_user_feat = pd.merge(
    big_matrix_cleaned[["user_id", "video_id", "watch_ratio"]],
    item_categories_cleaned,
    on="video_id",
    how="left",
)

# Mean watch_ratio of each (user, feat) pair, broadcast back onto every
# interaction row; any missing mean is replaced by 0.
avg_user_feat["user_feat_watch_ratio_mean"] = (
    avg_user_feat
    .groupby(["user_id", "feat"])["watch_ratio"]
    .transform("mean")
    .fillna(0)
)

# Collapse to one row per distinct (user, feat, mean) and order the table.
avg_user_feat = (
    avg_user_feat[white_list]
    .drop_duplicates()
    .sort_values(["user_id", "feat"])
    .reset_index(drop=True)
)

avg_user_feat

Unnamed: 0,user_id,feat,user_feat_watch_ratio_mean
0,0,0,1.053782
1,0,1,0.914152
2,0,2,0.863959
3,0,3,0.902536
4,0,4,0.762653
...,...,...,...
198842,7175,26,0.836324
198843,7175,27,0.369725
198844,7175,28,0.889533
198845,7175,29,1.380674


In [13]:
# Stage 1: friend-based features, computed by the project helper from the
# enhanced interaction table and the item table.
final_data = utils.get_friends_watch_ratio(
    enhanced_data,
    item_categories_cleaned,
)

# Stage 2: per-(user, video) lookup of the user's feat/category averages.
# NOTE(review): the shape tuples printed under this cell are not emitted by
# any statement here, so presumably they come from the utils helpers.
video_feat_category_avg_per_user = utils.get_video_feat_category_avg_per_user(
    final_data,
    avg_user_feat,
    avg_user_category,
    item_categories_cleaned,
    caption_category_cleaned,
)

# Stage 3: left-join the lookup so every interaction row is kept.
final_data = pd.merge(
    final_data,
    video_feat_category_avg_per_user,
    how="left",
    on=["user_id", "video_id"],
)

# Display the assembled table as a quick sanity check
final_data

(9727438, 2)
(8690863, 5)


Unnamed: 0,user_id,video_id,watch_ratio,like,video_length,time,user_watch_ratio_mean,user_video_length_mean,friend_watch_ratio_mean,video_watch_ratio_mean,user_feat_watch_ratio_mean,user_category_watch_ratio_mean
0,0,3649,1.273397,1,0,2020-07-05 00:08:23.438,0.981752,-0.136428,,0.784641,0.954082,1.029649
1,0,5262,0.107613,-1,0,2020-07-05 00:16:06.687,0.981752,-0.136428,,0.992569,0.975717,1.022317
2,0,1963,0.089885,-1,0,2020-07-05 00:20:26.792,0.981752,-0.136428,,0.977060,0.998407,1.029649
3,0,8234,0.078000,-1,0,2020-07-05 00:43:05.128,0.981752,-0.136428,,0.781283,0.770139,0.804945
4,0,8228,1.572295,1,0,2020-07-05 01:00:25.500,0.981752,-0.136428,,0.971532,0.987143,0.937071
...,...,...,...,...,...,...,...,...,...,...,...,...
9727433,7175,6597,1.004462,0,0,2020-09-05 06:35:01.104,0.790256,0.006378,,0.840896,0.792753,0.821723
9727434,7175,6630,0.313389,-1,0,2020-09-05 15:00:33.379,0.790256,0.006378,,0.706640,0.889533,0.865700
9727435,7175,10360,0.340597,-1,0,2020-09-05 19:10:29.041,0.790256,0.006378,,1.074987,0.792753,0.821723
9727436,7175,10360,0.913400,0,0,2020-09-05 19:10:36.995,0.790256,0.006378,,1.074987,0.792753,0.821723


In [17]:
# Share of rows whose friend_watch_ratio_mean is missing (NaN count / row count).
friend_watch_ratio_col = final_data["friend_watch_ratio_mean"]
friend_watch_ratio_col.isna().sum() / len(friend_watch_ratio_col)

np.float64(0.9670379806070211)

### Drop `friend_watch_ratio_mean` because about 97% of its values are missing

In [18]:
# friend_watch_ratio_mean is missing for ~96.7% of the rows (ratio computed in
# the preceding cell), so the column is discarded.
# errors="ignore" makes this self-referential cell idempotent: re-running it
# after the column is already gone is a no-op instead of raising KeyError.
final_data = final_data.drop(columns="friend_watch_ratio_mean", errors="ignore")

In [19]:
# Persist the assembled table and the two per-user feature tables as parquet,
# keeping each frame's index in the file.
_parquet_targets = [
    (final_data, INTERMEDIATE_SAVE_PATH + "final_data.parquet"),
    (avg_user_category, FEATURES_PATH + "avg_user_category.parquet"),
    (avg_user_feat, FEATURES_PATH + "avg_user_feat.parquet"),
]
for _frame, _destination in _parquet_targets:
    _frame.to_parquet(_destination, index=True)