## Import necessary libraries

In [1]:
# Data imports
import numpy as np
import pandas as pd
from pathlib import Path
import utils
#from utils import compute_averages, merge_matrix_avgs_friend_list, get_friends_watch_ratio

# Prepare environment

In [2]:
# Tunable constants for this notebook
SEED = 42
TRAIN_TEST_SPLIT = 0.8
CORR_THRESHOLD = 0.2  # minimum |correlation| with watch_ratio used in the feature check below

# Data folders; keep the trailing slash because file names are appended directly to these strings
CLEANED_DATA_PATH = "data/cleaned/"
FEATURES_PATH = "data/features/"

# Seed NumPy's global random generator so repeated runs behave the same
np.random.seed(SEED)

# Create the feature folder (and any missing parents) if it does not exist yet
Path(FEATURES_PATH).mkdir(exist_ok=True, parents=True)

# Load Data

In [3]:
# Feature tables stored under FEATURES_PATH
big_matrix_final = pd.read_parquet(f"{FEATURES_PATH}big_matrix_final.parquet")
avg_user_feat = pd.read_parquet(f"{FEATURES_PATH}avg_user_feat.parquet")
avg_user_category = pd.read_parquet(f"{FEATURES_PATH}avg_user_category.parquet")

# Cleaned tables stored under CLEANED_DATA_PATH
small_matrix_cleaned = pd.read_parquet(f"{CLEANED_DATA_PATH}small_matrix_cleaned.parquet")
user_features_cleaned = pd.read_parquet(f"{CLEANED_DATA_PATH}user_features_cleaned.parquet")
social_network_cleaned = pd.read_parquet(f"{CLEANED_DATA_PATH}social_network_cleaned.parquet")
item_categories_cleaned = pd.read_parquet(f"{CLEANED_DATA_PATH}item_categories_cleaned.parquet")
caption_category_cleaned = pd.read_parquet(f"{CLEANED_DATA_PATH}caption_category_cleaned.parquet")

# Check Dataframes

In [4]:
# Preview the cleaned small-matrix interactions (one row per user_id / video_id watch event)
small_matrix_cleaned

Unnamed: 0,user_id,video_id,watch_ratio,time,like,video_length
0,14,148,0.722103,2020-07-05 05:27:48.378,0,-1
1,14,183,1.907377,2020-07-05 05:28:00.057,1,-1
2,14,3649,2.063311,2020-07-05 05:29:09.479,1,0
3,14,5262,0.566388,2020-07-05 05:30:43.285,-1,0
4,14,8234,0.418364,2020-07-05 05:35:43.459,-1,0
...,...,...,...,...,...,...
4676370,7162,9177,0.142857,2020-09-01 20:06:35.984,-1,1
4676371,7162,4987,1.234848,2020-09-02 14:44:51.342,1,0
4676372,7162,7988,1.024412,2020-09-03 08:45:01.474,0,1
4676373,7162,6533,0.273750,2020-09-04 22:56:32.021,-1,0


In [5]:
# Preview the big-matrix feature table; its column set is reused later to shape final_data
big_matrix_final

Unnamed: 0,user_id,video_id,watch_ratio,like,video_length,user_watch_ratio_mean,video_watch_ratio_mean,user_feat_watch_ratio_mean,user_category_watch_ratio_mean
0,0,3649,1.273397,1,0,0.981752,0.784641,0.954082,1.029649
1,0,5262,0.107613,-1,0,0.981752,0.992569,0.975717,1.022317
2,0,1963,0.089885,-1,0,0.981752,0.977060,0.998407,1.029649
3,0,8234,0.078000,-1,0,0.981752,0.781283,0.770139,0.804945
4,0,8228,1.572295,1,0,0.981752,0.971532,0.987143,0.937071
...,...,...,...,...,...,...,...,...,...
9727433,7175,6597,1.004462,0,0,0.790256,0.840896,0.792753,0.821723
9727434,7175,6630,0.313389,-1,0,0.790256,0.706640,0.889533,0.865700
9727435,7175,10360,0.340597,-1,0,0.790256,1.074987,0.792753,0.821723
9727436,7175,10360,0.913400,0,0,0.790256,1.074987,0.792753,0.821723


In [6]:
# Preview the cleaned per-user feature table
user_features_cleaned

Unnamed: 0,user_id,user_active_degree,is_lowactive_period,is_live_streamer,is_video_author,follow_user_num,follow_user_num_range,fans_user_num,fans_user_num_range,friend_user_num,...,onehot_feat8,onehot_feat9,onehot_feat10,onehot_feat11,onehot_feat12,onehot_feat13,onehot_feat14,onehot_feat15,onehot_feat16,onehot_feat17
0,0,0,0,0,0,5,0,0,0,0,...,0.085546,1.000000,0.5,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
1,1,1,0,0,0,386,1,4,1,2,...,0.097345,1.000000,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
2,2,1,0,0,0,27,2,0,0,0,...,-0.699115,-0.333333,0.5,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
3,3,1,0,0,0,16,2,0,0,0,...,0.480826,0.000000,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
4,4,1,0,0,0,122,3,4,1,0,...,-0.415929,0.333333,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7171,7171,1,0,0,1,52,5,1,1,0,...,0.528024,-0.666667,1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0
7172,7172,1,0,0,0,45,2,2,1,2,...,-0.935103,-0.333333,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0
7173,7173,1,0,0,0,615,4,3,1,2,...,-0.699115,-0.333333,0.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0
7174,7174,1,0,0,0,959,4,0,0,0,...,-0.368732,0.000000,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0


In [7]:
# Preview the cleaned social network table (user_id with its friend_list)
social_network_cleaned

Unnamed: 0,user_id,friend_list
0,3371,[2975]
1,24,[2665]
2,4402,[38]
3,4295,[4694]
4,7087,[7117]
...,...,...
467,2331,[4345]
468,6163,[1332]
469,3732,[670]
470,3335,[202]


In [8]:
# Preview the cleaned item categories (video_id, feat, video_watch_ratio_mean)
item_categories_cleaned

Unnamed: 0,video_id,feat,video_watch_ratio_mean
0,0,8,1.344039
2,2,9,1.163905
4,4,5,0.617457
5,5,6,0.862989
6,6,19,1.483494
...,...,...,...
10722,10722,5,1.167776
10723,10723,11,1.411720
10724,10724,2,1.450592
10726,10726,19,1.810170


# Data preprocessing for AI

### Final data cleanup and merging of all user features

In [9]:
# Work on an independent copy so small_matrix_cleaned itself is never modified
final_data = small_matrix_cleaned.copy()

# Project helper from utils; its result is joined back on (user_id, video_id) just below
video_feat_category_avg_per_user = utils.get_video_feat_category_avg_per_user(
    final_data,
    avg_user_feat,
    avg_user_category,
    item_categories_cleaned,
    caption_category_cleaned,
)

# Left join keeps every interaction row, even when the helper returned no match for it
final_data = pd.merge(
    final_data,
    video_feat_category_avg_per_user,
    how="left",
    on=["user_id", "video_id"],
)

# Interactions without a matching average get 0 instead of NaN
for mean_col in ("user_feat_watch_ratio_mean", "user_category_watch_ratio_mean"):
    final_data[mean_col] = final_data[mean_col].fillna(0)

final_data

Unnamed: 0,user_id,video_id,watch_ratio,time,like,video_length,video_watch_ratio_mean,user_feat_watch_ratio_mean,user_category_watch_ratio_mean
0,14,148,0.722103,2020-07-05 05:27:48.378,0,-1,1.274520,1.169903,1.515494
1,14,183,1.907377,2020-07-05 05:28:00.057,1,-1,1.318898,0.936984,0.962073
2,14,3649,2.063311,2020-07-05 05:29:09.479,1,0,0.784641,0.715448,0.962073
3,14,5262,0.566388,2020-07-05 05:30:43.285,-1,0,0.992569,0.840792,1.245391
4,14,8234,0.418364,2020-07-05 05:35:43.459,-1,0,0.781283,0.921278,1.204511
...,...,...,...,...,...,...,...,...,...
3857008,7162,9177,0.142857,2020-09-01 20:06:35.984,-1,1,0.219228,1.347908,1.347908
3857009,7162,4987,1.234848,2020-09-02 14:44:51.342,1,0,1.109515,0.972297,0.912121
3857010,7162,7988,1.024412,2020-09-03 08:45:01.474,0,1,0.202936,1.347908,1.347908
3857011,7162,6533,0.273750,2020-09-04 22:56:32.021,-1,0,1.062392,0.972297,0.912121


In [10]:
# Merge user calculated data from big_matrix
# Keep exactly one row per user_id: de-duplicating on the key (rather than on the
# whole row) guarantees the left merge below can never multiply interaction rows.
user_big_matrix_final = big_matrix_final[["user_id", "user_watch_ratio_mean"]].drop_duplicates(subset="user_id")

# Only users present in final_data are needed on the right-hand side of the merge
user_big_matrix_final = user_big_matrix_final[user_big_matrix_final["user_id"].isin(final_data["user_id"])]

# Drop a previously merged column first so re-running this cell is idempotent
# (otherwise pandas would create user_watch_ratio_mean_x / _y suffix columns).
# validate="many_to_one" makes pandas check that user_id is unique on the right side.
final_data = final_data.drop(columns="user_watch_ratio_mean", errors="ignore").merge(
    user_big_matrix_final, on="user_id", how="left", validate="many_to_one"
)

In [11]:
# Count missing values per column after the merges
final_data.isna().sum()

user_id                           0
video_id                          0
watch_ratio                       0
time                              0
like                              0
video_length                      0
video_watch_ratio_mean            0
user_feat_watch_ratio_mean        0
user_category_watch_ratio_mean    0
user_watch_ratio_mean             0
dtype: int64

## Only keep meaningful features

### Get correlation between watch_ratio and every other feature

In [12]:
# Build the full correlation matrix, then keep only the column for the watch_ratio target
corr = final_data.corr().loc[:, "watch_ratio"]
corr

user_id                          -0.004562
video_id                          0.002244
watch_ratio                       1.000000
time                             -0.007629
like                              0.831327
video_length                     -0.446454
video_watch_ratio_mean            0.520128
user_feat_watch_ratio_mean        0.166039
user_category_watch_ratio_mean    0.133929
user_watch_ratio_mean             0.161660
Name: watch_ratio, dtype: float64

### See which features are above the set threshold

In [13]:
# Features whose absolute correlation with watch_ratio exceeds CORR_THRESHOLD
corr.loc[corr.abs().gt(CORR_THRESHOLD)]

watch_ratio               1.000000
like                      0.831327
video_length             -0.446454
video_watch_ratio_mean    0.520128
Name: watch_ratio, dtype: float64

### Only keep previously selected features as well as useful ones for later use

In [14]:
# Align the columns (and their order) with big_matrix_final; columns that exist only
# in final_data, such as time, are dropped by this selection
final_data = final_data.loc[:, big_matrix_final.columns]

# Save Data for AI

In [15]:
# Write the final feature table next to the other feature files, keeping the index
final_data.to_parquet(f"{FEATURES_PATH}small_matrix_final.parquet", index=True)