## Import necessary libraries

In [1]:
# Data imports
import sys
sys.path.append('../')
import numpy as np
import pandas as pd
from pathlib import Path
import utils

# Prepare environment

In [2]:
# Reproducibility: seed NumPy's global random generator
SEED = 42
np.random.seed(SEED)

# Parameters (not referenced by the cells visible in this notebook section)
TRAIN_TEST_SPLIT = 0.8
CORR_THRESHOLD = 0.2

# Input / output locations, kept as string prefixes that end with "/"
CLEANED_DATA_PATH = "data/cleaned/"
INTERMEDIATE_SAVE_PATH = "data/big_matrix_intermediate_saves/"  # intermediate versions of big_matrix
FEATURES_PATH = "data/features/"

# Make sure both output directories exist; already existing ones are left as they are
Path(INTERMEDIATE_SAVE_PATH).mkdir(parents=True, exist_ok=True)
Path(FEATURES_PATH).mkdir(parents=True, exist_ok=True)

# Load Data

In [3]:
# Read each cleaned table from its parquet file under CLEANED_DATA_PATH
big_matrix_cleaned = pd.read_parquet(f"{CLEANED_DATA_PATH}big_matrix_cleaned.parquet")
user_features_cleaned = pd.read_parquet(f"{CLEANED_DATA_PATH}user_features_cleaned.parquet")
item_categories_cleaned = pd.read_parquet(f"{CLEANED_DATA_PATH}item_categories_cleaned.parquet")
caption_category_cleaned = pd.read_parquet(f"{CLEANED_DATA_PATH}caption_category_cleaned.parquet")
social_network_cleaned = pd.read_parquet(f"{CLEANED_DATA_PATH}social_network_cleaned.parquet")

# Check Data

In [4]:
# Preview the interaction table (columns: user_id, video_id, watch_ratio, video_duration)
big_matrix_cleaned

Unnamed: 0,user_id,video_id,watch_ratio,video_duration
0,0,3649,1.273397,10867
2,0,5262,0.107613,7908
3,0,1963,1.434307,9590
4,0,8234,1.296455,11000
5,0,8228,3.113806,8576
...,...,...,...,...
12530798,7175,1776,0.174968,17809
12530799,7175,6597,1.004462,8741
12530800,7175,6630,0.313389,13855
12530803,7175,10360,1.253997,7067


In [5]:
# Preview the user feature table (user_id plus activity, follow/fans/friend counts and onehot_feat* columns)
user_features_cleaned

Unnamed: 0,user_id,user_active_degree,is_lowactive_period,is_live_streamer,is_video_author,follow_user_num,follow_user_num_range,fans_user_num,fans_user_num_range,friend_user_num,...,onehot_feat8,onehot_feat9,onehot_feat10,onehot_feat11,onehot_feat12,onehot_feat13,onehot_feat14,onehot_feat15,onehot_feat16,onehot_feat17
0,0,0,0,0,0,5,0,0,0,0,...,0.085546,1.000000,0.5,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
1,1,1,0,0,0,386,1,4,1,2,...,0.097345,1.000000,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
2,2,1,0,0,0,27,2,0,0,0,...,-0.699115,-0.333333,0.5,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
3,3,1,0,0,0,16,2,0,0,0,...,0.480826,0.000000,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
4,4,1,0,0,0,122,3,4,1,0,...,-0.415929,0.333333,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7171,7171,1,0,0,1,52,5,1,1,0,...,0.528024,-0.666667,1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0
7172,7172,1,0,0,0,45,2,2,1,2,...,-0.935103,-0.333333,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0
7173,7173,1,0,0,0,615,4,3,1,2,...,-0.699115,-0.333333,0.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0
7174,7174,1,0,0,0,959,4,0,0,0,...,-0.368732,0.000000,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0


In [6]:
# Preview the item table (columns: video_id, feat, video_watch_ratio_mean)
item_categories_cleaned

Unnamed: 0,video_id,feat,video_watch_ratio_mean
0,0,8,1.369886
2,2,9,1.173445
4,4,5,3.087283
5,5,6,1.294484
6,6,19,1.483494
...,...,...,...
10722,10722,5,1.167776
10723,10723,11,1.411720
10724,10724,2,1.499461
10726,10726,19,1.810170


In [7]:
# Preview the caption category table (columns: video_id, first_level_category_id)
caption_category_cleaned

Unnamed: 0,video_id,first_level_category_id
0,0,8
2,2,9
3,3,26
4,4,5
5,5,6
...,...,...
10722,10722,5
10723,10723,33
10724,10724,6
10726,10726,38


In [8]:
# Preview the social network table (columns: user_id, friend_list)
social_network_cleaned

Unnamed: 0,user_id,friend_list
0,3371,[2975]
1,24,[2665]
2,4402,[38]
3,4295,[4694]
4,7087,[7117]
...,...,...
467,2331,[4345]
468,6163,[1332]
469,3732,[670]
470,3335,[202]


# Data features computing

### For each user, compute their average watch_ratio and the average of the duration of the videos they watch

In [9]:
# Per-user averages, computed by the project helper utils.compute_averages.
# The displayed result has the columns user_id, user_watch_ratio_mean and
# user_video_duration_mean.
data_avg_per_user = utils.compute_averages(big_matrix_cleaned)

# Display the result for a visual check
data_avg_per_user

Unnamed: 0,user_id,user_watch_ratio_mean,user_video_duration_mean
0,0,1.087166,10665.762908
1,1,1.020260,13461.978317
2,2,0.658659,13827.961494
3,3,1.037487,11593.344668
4,4,0.855874,14573.280000
...,...,...,...
6894,7171,0.843794,15453.778875
6895,7172,1.042669,13753.109974
6896,7173,0.738524,16604.006289
6897,7174,0.835123,13187.083143


### Merge the previously computed per-user averages into big_matrix

In [10]:
# Attach the per-user averages to every interaction row.
# merge() returns a new DataFrame and does not modify big_matrix_cleaned, so no
# explicit copy of the (multi-million-row) interaction table is needed first.
# validate="many_to_one" makes pandas raise a MergeError if data_avg_per_user
# ever contains more than one row per user_id, instead of silently duplicating
# interaction rows.
enhanced_data = big_matrix_cleaned.merge(
    data_avg_per_user,
    on="user_id",
    how="left",
    validate="many_to_one",
)

# Display the result for a visual check
enhanced_data

Unnamed: 0,user_id,video_id,watch_ratio,video_duration,user_watch_ratio_mean,user_video_duration_mean
0,0,3649,1.273397,10867,1.087166,10665.762908
1,0,5262,0.107613,7908,1.087166,10665.762908
2,0,1963,1.434307,9590,1.087166,10665.762908
3,0,8234,1.296455,11000,1.087166,10665.762908
4,0,8228,3.113806,8576,1.087166,10665.762908
...,...,...,...,...,...,...
8591372,7175,1776,0.174968,17809,0.928371,14066.927969
8591373,7175,6597,1.004462,8741,0.928371,14066.927969
8591374,7175,6630,0.313389,13855,0.928371,14066.927969
8591375,7175,10360,1.253997,7067,0.928371,14066.927969


### For each user, compute their average watch_ratio per first_level_category

In [11]:
# Columns kept in the final per-user / per-category table
white_list = ["user_id", "first_level_category_id", "user_category_watch_ratio_mean"]

# Pipeline:
#   1. take the interaction columns needed and left-join the first level category id
#   2. compute the mean watch_ratio of each (user_id, first_level_category_id) group,
#      broadcast back onto every row, with NaN means replaced by 0
#   3. keep one row per distinct (user, category, mean) combination
#   4. sort and renumber the rows so the table is easy to read
# NOTE(review): rows whose category is missing after the left join presumably end
# up with a mean of 0 via fillna(0) -- confirm this is the intended encoding.
avg_user_category = (
    big_matrix_cleaned[["user_id", "video_id", "watch_ratio"]]
    .merge(caption_category_cleaned, on="video_id", how="left")
    .assign(
        user_category_watch_ratio_mean=lambda merged: (
            merged.groupby(["user_id", "first_level_category_id"])["watch_ratio"]
            .transform("mean")
            .fillna(0)
        )
    )[white_list]
    .drop_duplicates()
    .sort_values(["user_id", "first_level_category_id"])
    .reset_index(drop=True)
)

# Display the result for a visual check
avg_user_category

Unnamed: 0,user_id,first_level_category_id,user_category_watch_ratio_mean
0,0,1,1.120988
1,0,2,0.989576
2,0,3,0.949826
3,0,4,0.967345
4,0,5,1.023009
...,...,...,...
251344,7175,35,1.597635
251345,7175,36,0.583211
251346,7175,37,0.576903
251347,7175,38,0.479889


### For each user, compute their average watch_ratio per video feat

In [12]:
# Columns kept in the final per-user / per-feat table
white_list = ["user_id", "feat", "user_feat_watch_ratio_mean"]

# Pipeline:
#   1. take the interaction columns needed and left-join the item table (brings in feat)
#   2. compute the mean watch_ratio of each (user_id, feat) group, broadcast back
#      onto every row, with NaN means replaced by 0
#   3. keep one row per distinct (user, feat, mean) combination
#   4. sort and renumber the rows so the table is easy to read
# NOTE(review): rows whose feat is missing after the left join presumably end up
# with a mean of 0 via fillna(0) -- confirm this is the intended encoding.
avg_user_feat = (
    big_matrix_cleaned[["user_id", "video_id", "watch_ratio"]]
    .merge(item_categories_cleaned, on="video_id", how="left")
    .assign(
        user_feat_watch_ratio_mean=lambda merged: (
            merged.groupby(["user_id", "feat"])["watch_ratio"]
            .transform("mean")
            .fillna(0)
        )
    )[white_list]
    .drop_duplicates()
    .sort_values(["user_id", "feat"])
    .reset_index(drop=True)
)

# Display the result for a visual check
avg_user_feat

Unnamed: 0,user_id,feat,user_feat_watch_ratio_mean
0,0,0,1.204322
1,0,1,1.008130
2,0,2,0.903229
3,0,3,0.902536
4,0,4,0.791986
...,...,...,...
198661,7175,26,0.989632
198662,7175,27,0.492967
198663,7175,28,1.020568
198664,7175,29,1.380674


### Merge all data together

In [13]:
# Start from a copy so that enhanced_data itself is not the object handed to the helper
final_data = enhanced_data.copy()

# Project helper: receives the interactions, the per-user feat / category means and
# the two video lookup tables. Its result is joined below on (user_id, video_id);
# the displayed output gains video_watch_ratio_mean, user_feat_watch_ratio_mean
# and user_category_watch_ratio_mean.
video_feat_category_avg_per_user = utils.get_video_feat_category_avg_per_user(
    final_data,
    avg_user_feat,
    avg_user_category,
    item_categories_cleaned,
    caption_category_cleaned,
)

# Left join keeps every interaction row of final_data
final_data = final_data.merge(
    video_feat_category_avg_per_user,
    on=["user_id", "video_id"],
    how="left",
)

# Display the result for a visual check
final_data

Unnamed: 0,user_id,video_id,watch_ratio,video_duration,user_watch_ratio_mean,user_video_duration_mean,video_watch_ratio_mean,user_feat_watch_ratio_mean,user_category_watch_ratio_mean
0,0,3649,1.273397,10867,1.087166,10665.762908,0.888587,1.064650,1.123552
1,0,5262,0.107613,7908,1.087166,10665.762908,1.155057,1.087868,1.023009
2,0,1963,1.434307,9590,1.087166,10665.762908,1.056692,1.102082,1.123552
3,0,8234,1.296455,11000,1.087166,10665.762908,0.878387,0.887297,0.944791
4,0,8228,3.113806,8576,1.087166,10665.762908,1.189393,1.092934,1.072425
...,...,...,...,...,...,...,...,...,...
8591372,7175,1776,0.174968,17809,0.928371,14066.927969,0.756543,1.003333,1.105059
8591373,7175,6597,1.004462,8741,0.928371,14066.927969,0.869321,0.926486,0.967923
8591374,7175,6630,0.313389,13855,0.928371,14066.927969,0.732901,1.020568,0.988005
8591375,7175,10360,1.253997,7067,0.928371,14066.927969,1.138773,0.926486,0.967923


# Save Data

In [14]:
# Persist the enriched interaction table and the two per-user feature tables (index included)
final_data.to_parquet(f"{INTERMEDIATE_SAVE_PATH}final_data.parquet", index=True)
avg_user_category.to_parquet(f"{FEATURES_PATH}avg_user_category.parquet", index=True)
avg_user_feat.to_parquet(f"{FEATURES_PATH}avg_user_feat.parquet", index=True)