## Import necessary libraries

In [None]:
# Data imports
import numpy as np
import pandas as pd
from pathlib import Path
from utils import compute_averages, merge_matrix_avgs_friend_list, get_friends_watch_ratio

# Prepare environment

In [None]:
# --- Tunable constants -------------------------------------------------------
SEED = 42                          # seed for NumPy's global random generator
TRAIN_TEST_SPLIT = 0.8             # not used in the cells shown here; presumably the train fraction — TODO confirm
CORR_THRESHOLD = 0.2               # correlation cut-off used for feature selection
FEATURES_PATH = "data/features/"   # directory that receives the generated feature files

# --- Side effects ------------------------------------------------------------
# Seed the legacy global NumPy RNG so any np.random.* call is repeatable.
np.random.seed(SEED)
# Create the output directory (and missing parents); no error if it already exists.
Path(FEATURES_PATH).mkdir(parents=True, exist_ok=True)

# Load Data

In [2]:
# Read the three cleaned input tables (paths are relative to the notebook's
# working directory). The tuple is evaluated left to right, so the files are
# read in the order listed.
small_matrix_cleaned, user_features, social_network_cleaned = (
    pd.read_parquet("data/small_matrix_cleaned.parquet"),
    pd.read_parquet("data/user_features.parquet"),
    pd.read_parquet("data/social_network_cleaned.parquet"),
)

# Data preprocessing for AI

### For each user, compute their average watch_ratio and the average duration of the videos they watch

In [None]:
# Summarise the cleaned interaction matrix per user with the project helper.
# Judging by the displayed result, it yields one row per user_id with the
# columns watch_ratio_mean, video_length_mean and hour.
# NOTE(review): the averages are computed over the whole matrix, before any
# train/test split — confirm this does not leak the watch_ratio target.
data_avg_per_user = compute_averages(small_matrix_cleaned)

# Display the frame as a visual sanity check
data_avg_per_user

Unnamed: 0,user_id,watch_ratio_mean,video_length_mean,hour
0,14,0.969348,-0.101666,9.684247
1,19,0.871409,-0.103580,12.129301
2,21,0.957661,-0.099688,11.624784
3,23,0.933149,-0.092044,13.066714
4,24,0.808875,-0.096466,12.034276
...,...,...,...,...
1349,7142,0.824166,-0.099370,6.089223
1350,7147,0.978413,-0.115047,4.773413
1351,7153,0.797826,-0.098170,13.257213
1352,7159,0.792772,-0.098495,15.008892


### Merge previously calculated data and friend lists into small_matrix

In [None]:
# Combine the interaction matrix with the per-user averages and the social
# network table via the project helper. In the displayed result every
# interaction row carries its user's averages plus a friend_list column
# (empty lists in the rows that are visible).
# Presumably the join key is user_id — TODO confirm in utils.
enhanced_data = merge_matrix_avgs_friend_list(small_matrix_cleaned, data_avg_per_user, social_network_cleaned)

# Display the frame as a visual sanity check
enhanced_data

Unnamed: 0,user_id,video_id,watch_ratio,video_length,time,watch_ratio_mean,video_length_mean,hour,friend_list
0,14,148,0.722103,-1,2020-07-05 05:27:48.378,0.969348,-0.101666,9.684247,[]
1,14,183,1.907377,-1,2020-07-05 05:28:00.057,0.969348,-0.101666,9.684247,[]
2,14,3649,2.063311,0,2020-07-05 05:29:09.479,0.969348,-0.101666,9.684247,[]
3,14,5262,0.566388,0,2020-07-05 05:30:43.285,0.969348,-0.101666,9.684247,[]
4,14,8234,0.418364,0,2020-07-05 05:35:43.459,0.969348,-0.101666,9.684247,[]
...,...,...,...,...,...,...,...,...,...
3857008,7162,9177,0.142857,1,2020-09-01 20:06:35.984,1.123807,-0.114988,7.869349,[]
3857009,7162,4987,1.234848,0,2020-09-02 14:44:51.342,1.123807,-0.114988,7.869349,[]
3857010,7162,7988,1.024412,1,2020-09-03 08:45:01.474,1.123807,-0.114988,7.869349,[]
3857011,7162,6533,0.273750,0,2020-09-04 22:56:32.021,1.123807,-0.114988,7.869349,[]


### Get friends' average watch ratio for current interaction's video

In [None]:
# Derive the friends-based feature from the enriched frame via the project
# helper. In the displayed result the friend_list column is gone, a
# watch_ratio_prior_mean column has been added, and the rows appear ordered by
# time rather than grouped by user.
# NOTE(review): the column name suggests only earlier interactions are
# averaged and that 0.0 stands for "no prior data" — verify against utils.
final_data = get_friends_watch_ratio(enhanced_data)

# Display the frame as a visual sanity check
final_data

Unnamed: 0,user_id,video_id,watch_ratio,video_length,time,watch_ratio_mean,video_length_mean,hour,watch_ratio_prior_mean
0,6190,9559,1.981442,0,2020-07-04 02:23:26.060,1.037910,-0.104526,9.770474,0.000000
1,6190,9553,0.964642,0,2020-07-04 04:03:18.888,1.037910,-0.104526,9.770474,0.000000
2,6190,9530,0.112044,0,2020-07-04 04:03:53.725,1.037910,-0.104526,9.770474,0.000000
3,6190,8176,0.700000,0,2020-07-04 04:35:24.528,1.037910,-0.104526,9.770474,0.000000
4,6190,8189,0.906852,0,2020-07-04 06:32:23.949,1.037910,-0.104526,9.770474,0.000000
...,...,...,...,...,...,...,...,...,...
3857008,4766,10291,0.171522,1,2020-09-05 23:52:40.419,0.937582,-0.098420,11.584411,0.273859
3857009,6682,3151,0.612817,0,2020-09-05 23:52:56.230,0.759180,-0.105393,9.305252,0.892070
3857010,6139,9112,0.451241,0,2020-09-05 23:53:50.831,0.724104,-0.104069,8.974073,0.826597
3857011,5450,9162,0.871151,0,2020-09-05 23:57:15.282,0.711956,-0.103424,12.812369,0.989261


### Final data cleanup and merging of all user features

In [6]:
# Attach the user attributes to every interaction row.
# DataFrame.merge returns a new frame and leaves its inputs untouched, so the
# multi-million-row frame does not need a defensive copy first.
# NOTE: this cell reassigns final_data from itself; run it once per kernel
# session (a second run would merge user_features again).
rows_before_merge = len(final_data)
final_data = final_data.merge(user_features, on="user_id", how="left")

# A left join keeps the row count only if user_id is unique in user_features;
# fail loudly instead of silently duplicating interactions.
assert len(final_data) == rows_before_merge, "merge changed the row count: duplicate user_id in user_features"

# Encode columns as integer codes to be easier to work with.
# pd.factorize numbers the values in order of first appearance (missing values
# become -1), so the codes are identifiers and carry no ordering.
columns_to_encode = [
    "user_active_degree",
    "follow_user_num_range",
    "fans_user_num_range",
    "friend_user_num_range",
    "register_days_range",
]
for column in columns_to_encode:
    final_data[column] = pd.factorize(final_data[column])[0]

final_data

Unnamed: 0,user_id,video_id,watch_ratio,video_length,time,watch_ratio_mean,video_length_mean,hour,watch_ratio_prior_mean,user_active_degree,...,onehot_feat8,onehot_feat9,onehot_feat10,onehot_feat11,onehot_feat12,onehot_feat13,onehot_feat14,onehot_feat15,onehot_feat16,onehot_feat17
0,6190,9559,1.981442,0,2020-07-04 02:23:26.060,1.037910,-0.104526,9.770474,0.000000,0,...,292,4,3,0,1.0,0.0,0.0,0.0,0.0,0.0
1,6190,9553,0.964642,0,2020-07-04 04:03:18.888,1.037910,-0.104526,9.770474,0.000000,0,...,292,4,3,0,1.0,0.0,0.0,0.0,0.0,0.0
2,6190,9530,0.112044,0,2020-07-04 04:03:53.725,1.037910,-0.104526,9.770474,0.000000,0,...,292,4,3,0,1.0,0.0,0.0,0.0,0.0,0.0
3,6190,8176,0.700000,0,2020-07-04 04:35:24.528,1.037910,-0.104526,9.770474,0.000000,0,...,292,4,3,0,1.0,0.0,0.0,0.0,0.0,0.0
4,6190,8189,0.906852,0,2020-07-04 06:32:23.949,1.037910,-0.104526,9.770474,0.000000,0,...,292,4,3,0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3857008,4766,10291,0.171522,1,2020-09-05 23:52:40.419,0.937582,-0.098420,11.584411,0.273859,0,...,292,4,2,0,0.0,0.0,0.0,0.0,0.0,0.0
3857009,6682,3151,0.612817,0,2020-09-05 23:52:56.230,0.759180,-0.105393,9.305252,0.892070,2,...,194,6,2,0,1.0,0.0,0.0,0.0,0.0,0.0
3857010,6139,9112,0.451241,0,2020-09-05 23:53:50.831,0.724104,-0.104069,8.974073,0.826597,2,...,14,4,4,0,0.0,0.0,0.0,0.0,0.0,0.0
3857011,5450,9162,0.871151,0,2020-09-05 23:57:15.282,0.711956,-0.103424,12.812369,0.989261,2,...,257,3,0,0,1.0,0.0,0.0,0.0,0.0,0.0


## Only keep meaningful features

### Get correlation between watch_ratio and every other feature

In [7]:
# Correlation (pandas default: Pearson) of every numeric column with the
# target, watch_ratio.
# numeric_only=True is passed explicitly: since pandas 2.0 the default is
# False, and corr() then raises on non-numeric columns such as `time`.
corr = final_data.corr(numeric_only=True)["watch_ratio"]
corr

user_id                  -0.004562
video_id                  0.002244
watch_ratio               1.000000
video_length             -0.446454
watch_ratio_mean          0.242036
video_length_mean        -0.024745
hour                      0.002531
watch_ratio_prior_mean    0.515066
user_active_degree       -0.009268
is_lowactive_period            NaN
is_live_streamer         -0.010062
is_video_author          -0.011930
follow_user_num          -0.002115
follow_user_num_range     0.000632
fans_user_num            -0.004117
fans_user_num_range      -0.006626
friend_user_num           0.000129
friend_user_num_range    -0.003166
register_days            -0.005658
register_days_range      -0.001318
onehot_feat0             -0.000525
onehot_feat1              0.003763
onehot_feat2              0.009550
onehot_feat3              0.008513
onehot_feat4             -0.009046
onehot_feat5              0.005654
onehot_feat6             -0.017054
onehot_feat7              0.011141
onehot_feat8        

### See which features are above the set threshold

In [8]:
# Show the features whose correlation with watch_ratio exceeds the threshold.
# NOTE(review): the comparison is signed, so strongly negative correlations are
# filtered out too (video_length is -0.446 in the correlation output); consider
# corr.abs() if strength regardless of sign is what is wanted.
corr.loc[corr.gt(CORR_THRESHOLD)]

watch_ratio               1.000000
watch_ratio_mean          0.242036
watch_ratio_prior_mean    0.515066
Name: watch_ratio, dtype: float64

### Only keep previously selected features as well as useful ones for later use

In [9]:
# Columns kept regardless of correlation, for later use.
key_columns = ["time", "user_id", "video_id"]

# Columns whose (signed) correlation with watch_ratio exceeds the threshold;
# watch_ratio itself is among them because its self-correlation is 1.0.
above_threshold = corr > CORR_THRESHOLD
correlated_columns = corr[above_threshold].index.to_list()

final_data = final_data.loc[:, key_columns + correlated_columns]

# Save Data for AI

In [None]:
# Write the selected features to the features directory, keeping the index.
# The path is joined with pathlib rather than string concatenation so it stays
# correct whether or not FEATURES_PATH ends with a separator.
final_data.to_parquet(Path(FEATURES_PATH) / "small_matrix_final.parquet", index=True)