## Import necessary libraries

In [1]:
# Data imports
import numpy as np
import pandas as pd
from pathlib import Path
import utils
#from utils import compute_averages, merge_matrix_avgs_friend_list, get_friends_watch_ratio

# Prepare environment

In [2]:
# --- Locations on disk -------------------------------------------------
# Both paths keep their trailing slash because file names are appended
# to them directly further down.
CLEANED_DATA_PATH = "data/cleaned/"
FEATURES_PATH = "data/features/"

# --- Tunable constants -------------------------------------------------
TRAIN_TEST_SPLIT = 0.8
CORR_THRESHOLD = 0.2
SEED = 42

# --- Environment setup -------------------------------------------------
# Make sure the features directory exists (no error if it already does).
Path(FEATURES_PATH).mkdir(parents=True, exist_ok=True)

# Seed NumPy's global random generator.
np.random.seed(SEED)

# Load Data

In [3]:
# Feature tables stored under FEATURES_PATH.
big_matrix_final = pd.read_parquet(f"{FEATURES_PATH}big_matrix_final.parquet")
avg_user_feat = pd.read_parquet(f"{FEATURES_PATH}avg_user_feat.parquet")
avg_user_category = pd.read_parquet(f"{FEATURES_PATH}avg_user_category.parquet")

# Cleaned tables stored under CLEANED_DATA_PATH.
small_matrix_cleaned = pd.read_parquet(f"{CLEANED_DATA_PATH}small_matrix_cleaned.parquet")
user_features_cleaned = pd.read_parquet(f"{CLEANED_DATA_PATH}user_features_cleaned.parquet")
social_network_cleaned = pd.read_parquet(f"{CLEANED_DATA_PATH}social_network_cleaned.parquet")
item_categories_cleaned = pd.read_parquet(f"{CLEANED_DATA_PATH}item_categories_cleaned.parquet")
caption_category_cleaned = pd.read_parquet(f"{CLEANED_DATA_PATH}caption_category_cleaned.parquet")

# Check Dataframes

In [4]:
# Display the cleaned small interaction matrix (user_id, video_id, watch_ratio, video_duration).
small_matrix_cleaned

Unnamed: 0,user_id,video_id,watch_ratio,video_duration
0,14,148,0.722103,6067
1,14,183,1.907377,6100
2,14,3649,2.063311,10867
3,14,5262,0.566388,7908
4,14,8234,0.418364,11000
...,...,...,...,...
4676370,7162,9177,0.142857,37205
4676371,7162,4987,1.234848,8167
4676372,7162,7988,1.024412,49319
4676373,7162,6533,0.273750,8000


In [5]:
# Display the big matrix with its precomputed *_watch_ratio_mean feature columns.
# Its column list is reused later to select and order the columns of final_data.
big_matrix_final

Unnamed: 0,user_id,video_id,watch_ratio,video_duration,user_watch_ratio_mean,video_watch_ratio_mean,user_feat_watch_ratio_mean,user_category_watch_ratio_mean
0,0,3649,1.273397,10867,1.087166,0.888587,1.064650,1.123552
1,0,5262,0.107613,7908,1.087166,1.155057,1.087868,1.023009
2,0,1963,1.434307,9590,1.087166,1.056692,1.102082,1.123552
3,0,8234,1.296455,11000,1.087166,0.878387,0.887297,0.944791
4,0,8228,3.113806,8576,1.087166,1.189393,1.092934,1.072425
...,...,...,...,...,...,...,...,...
8591372,7175,1776,0.174968,17809,0.928371,0.756543,1.003333,1.105059
8591373,7175,6597,1.004462,8741,0.928371,0.869321,0.926486,0.967923
8591374,7175,6630,0.313389,13855,0.928371,0.732901,1.020568,0.988005
8591375,7175,10360,1.253997,7067,0.928371,1.138773,0.926486,0.967923


In [6]:
# Display the cleaned per-user feature table (one row per user_id).
# NOTE(review): this frame is not referenced again in the cells shown here.
user_features_cleaned

Unnamed: 0,user_id,user_active_degree,is_lowactive_period,is_live_streamer,is_video_author,follow_user_num,follow_user_num_range,fans_user_num,fans_user_num_range,friend_user_num,...,onehot_feat8,onehot_feat9,onehot_feat10,onehot_feat11,onehot_feat12,onehot_feat13,onehot_feat14,onehot_feat15,onehot_feat16,onehot_feat17
0,0,0,0,0,0,5,0,0,0,0,...,0.085546,1.000000,0.5,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
1,1,1,0,0,0,386,1,4,1,2,...,0.097345,1.000000,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
2,2,1,0,0,0,27,2,0,0,0,...,-0.699115,-0.333333,0.5,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
3,3,1,0,0,0,16,2,0,0,0,...,0.480826,0.000000,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
4,4,1,0,0,0,122,3,4,1,0,...,-0.415929,0.333333,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7171,7171,1,0,0,1,52,5,1,1,0,...,0.528024,-0.666667,1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0
7172,7172,1,0,0,0,45,2,2,1,2,...,-0.935103,-0.333333,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0
7173,7173,1,0,0,0,615,4,3,1,2,...,-0.699115,-0.333333,0.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0
7174,7174,1,0,0,0,959,4,0,0,0,...,-0.368732,0.000000,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0


In [7]:
# Display the cleaned social network table (user_id with its friend_list).
social_network_cleaned

Unnamed: 0,user_id,friend_list
0,3371,[2975]
1,24,[2665]
2,4402,[38]
3,4295,[4694]
4,7087,[7117]
...,...,...
467,2331,[4345]
468,6163,[1332]
469,3732,[670]
470,3335,[202]


In [8]:
# Display the cleaned item categories (video_id, feat, video_watch_ratio_mean).
item_categories_cleaned

Unnamed: 0,video_id,feat,video_watch_ratio_mean
0,0,8,1.369886
2,2,9,1.173445
4,4,5,3.087283
5,5,6,1.294484
6,6,19,1.483494
...,...,...,...
10722,10722,5,1.167776
10723,10723,11,1.411720
10724,10724,2,1.499461
10726,10726,19,1.810170


# Data preprocessing for the model

### Merge all computed features so the result has the same columns and information as big_matrix

In [9]:
# Work on a copy so small_matrix_cleaned itself is left untouched.
final_data = small_matrix_cleaned.copy()

# Let the utils helper assemble the per-(user, video) averages from the
# precomputed user/feat and user/category tables and the item metadata.
video_feat_category_avg_per_user = utils.get_video_feat_category_avg_per_user(
    final_data,
    avg_user_feat,
    avg_user_category,
    item_categories_cleaned,
    caption_category_cleaned,
)

# Attach those averages to every interaction; the left join keeps all
# rows of final_data even when no average is available for a pair.
final_data = final_data.merge(
    video_feat_category_avg_per_user,
    on=["user_id", "video_id"],
    how="left",
)

# Pairs without a known average get 0 in both mean columns.
mean_columns = ["user_feat_watch_ratio_mean", "user_category_watch_ratio_mean"]
final_data[mean_columns] = final_data[mean_columns].fillna(0)

# Show the merged frame.
final_data

Unnamed: 0,user_id,video_id,watch_ratio,video_duration,video_watch_ratio_mean,user_feat_watch_ratio_mean,user_category_watch_ratio_mean
0,14,148,0.722103,6067,1.404884,1.189430,1.515494
1,14,183,1.907377,6100,1.496601,0.936984,0.962073
2,14,3649,2.063311,10867,0.888587,0.747969,0.962073
3,14,5262,0.566388,7908,1.155057,0.893341,1.349173
4,14,8234,0.418364,11000,0.878387,0.921278,1.204511
...,...,...,...,...,...,...,...
3830597,7162,9177,0.142857,37205,0.232026,1.347908,1.347908
3830598,7162,4987,1.234848,8167,1.152388,0.972297,0.912121
3830599,7162,7988,1.024412,49319,0.214027,1.347908,1.347908
3830600,7162,6533,0.273750,8000,1.110730,0.972297,0.912121


In [10]:
# Bring the per-user mean watch ratio over from big_matrix_final.
# Reduce big_matrix_final to one (user_id, user_watch_ratio_mean) row per
# user, restricted to users that actually appear in final_data. Filtering a
# de-duplicated frame cannot reintroduce duplicates, so one drop_duplicates
# is enough.
user_big_matrix_final = big_matrix_final[["user_id", "user_watch_ratio_mean"]].drop_duplicates()
user_big_matrix_final = user_big_matrix_final[user_big_matrix_final["user_id"].isin(final_data["user_id"])]

# Drop any previous copy of the column first so that re-running this cell
# does not create user_watch_ratio_mean_x / _y columns.
# validate="many_to_one" makes pandas raise a MergeError if a user_id occurs
# more than once on the right side, instead of silently multiplying the
# interaction rows of final_data.
final_data = (
    final_data
    .drop(columns="user_watch_ratio_mean", errors="ignore")
    .merge(user_big_matrix_final, on="user_id", how="left", validate="many_to_one")
)

In [11]:
# Sanity check after the merges: count missing values in each column of final_data.
final_data.isna().sum()

user_id                           0
video_id                          0
watch_ratio                       0
video_duration                    0
video_watch_ratio_mean            0
user_feat_watch_ratio_mean        0
user_category_watch_ratio_mean    0
user_watch_ratio_mean             0
dtype: int64

## Only keep meaningful features

### Only keep previously selected features as well as useful ones for later use

In [12]:
# Keep exactly the columns of big_matrix_final, in the same order.
# A column missing from final_data raises a KeyError here.
final_data = final_data.loc[:, big_matrix_final.columns]

# Save Data

In [13]:
# Persist the finished small matrix, index included, under FEATURES_PATH.
final_data.to_parquet(
    f"{FEATURES_PATH}small_matrix_final.parquet",
    index=True,
)