## Import necessary libraries

In [None]:
# Data imports
import numpy as np
import pandas as pd
from pathlib import Path
from utils import compute_averages, merge_matrix_avgs_friend_list, get_friends_watch_ratio

# Prepare environment

In [None]:
# --- Reproducibility -------------------------------------------------------
# Seed NumPy's global RNG once, up front, so reruns draw the same numbers.
SEED = 42
np.random.seed(SEED)

# --- Tunable constants -----------------------------------------------------
# NOTE(review): neither constant is consumed in the cells visible here;
# presumably a train fraction and a correlation cut-off used downstream — confirm.
TRAIN_TEST_SPLIT = 0.8
CORR_THRESHOLD = 0.2

# --- Data locations --------------------------------------------------------
# Kept as plain strings with a trailing slash: later cells build file paths
# by concatenating a file name onto them with `+`.
CLEANED_DATA_PATH = "../data/cleaned/"
INTERMEDIATE_SAVE_PATH = "../data/big_matrix_intermediate_saves/"

# Create the output folder (and any missing parents); a no-op if it already exists.
Path(INTERMEDIATE_SAVE_PATH).mkdir(exist_ok=True, parents=True)

# Load Data

In [None]:
# Read the three cleaned parquet tables from CLEANED_DATA_PATH.
# The constant already ends with "/", so the file name is appended directly.
big_matrix_cleaned = pd.read_parquet(f"{CLEANED_DATA_PATH}big_matrix_cleaned.parquet")
# NOTE(review): user_features is not referenced by any cell visible here — confirm it is still needed.
user_features = pd.read_parquet(f"{CLEANED_DATA_PATH}user_features.parquet")
social_network_cleaned = pd.read_parquet(f"{CLEANED_DATA_PATH}social_network_cleaned.parquet")

# Data preprocessing for AI

### For each user, compute their average `watch_ratio` and the average duration of the videos they watch

In [None]:
# Run the utils helper on the cleaned interaction matrix.
# NOTE(review): compute_averages is defined in utils and not visible here; per the
# section heading it should yield per-user mean watch_ratio and mean video duration — confirm.
data_avg_per_user = compute_averages(big_matrix_cleaned)

# Bare last expression so the notebook renders the result for a visual check.
data_avg_per_user

Unnamed: 0,user_id,video_id,watch_ratio,video_length,time,watch_ratio_mean,video_length_mean,hour,friend_list
0,0,3649,1.273397,0,2020-07-05 00:08:23.438,0.981752,-0.136428,7.339897,[]
1,0,5262,0.107613,0,2020-07-05 00:16:06.687,0.981752,-0.136428,7.339897,[]
2,0,1963,0.089885,0,2020-07-05 00:20:26.792,0.981752,-0.136428,7.339897,[]
3,0,8234,0.078000,0,2020-07-05 00:43:05.128,0.981752,-0.136428,7.339897,[]
4,0,8228,1.572295,0,2020-07-05 01:00:25.500,0.981752,-0.136428,7.339897,[]
...,...,...,...,...,...,...,...,...,...
9729356,7175,6597,1.004462,0,2020-09-05 06:35:01.104,0.790256,0.006378,12.806760,[]
9729357,7175,6630,0.313389,0,2020-09-05 15:00:33.379,0.790256,0.006378,12.806760,[]
9729358,7175,10360,0.340597,0,2020-09-05 19:10:29.041,0.790256,0.006378,12.806760,[]
9729359,7175,10360,0.913400,0,2020-09-05 19:10:36.995,0.790256,0.006378,12.806760,[]


### Merge the previously calculated per-user averages and the friend lists into big_matrix

In [None]:
# Hand the cleaned interaction matrix, the per-user averages and the cleaned
# social-network table to the utils merge helper.
# NOTE(review): join keys and resulting columns are decided inside
# utils.merge_matrix_avgs_friend_list, which is not visible here — confirm.
enhanced_data = merge_matrix_avgs_friend_list(big_matrix_cleaned, data_avg_per_user, social_network_cleaned)

# Bare last expression so the notebook renders the result for a visual check.
enhanced_data

In [None]:
# Run the utils helper on the merged data.
# NOTE(review): get_friends_watch_ratio is defined in utils and not visible here;
# presumably it adds a feature derived from the watch ratios of each user's friends — confirm.
final_data = get_friends_watch_ratio(enhanced_data)

# Bare last expression so the notebook renders the result for a visual check.
final_data

In [None]:
# Persist the final frame (index included) to the intermediate-save folder.
# INTERMEDIATE_SAVE_PATH already ends with "/", so the file name is appended directly.
final_data.to_parquet(f"{INTERMEDIATE_SAVE_PATH}final_data.parquet", index=True)