# Import necessary libraries

In [1]:
# Data imports
import sys
# Add the parent directory to the module search path *before* the `utils`
# import below. Presumably utils.py lives one level above this notebook —
# confirm. The relative entry is resolved against the kernel's working
# directory, so this relies on the notebook being run from its own folder.
sys.path.append('../')
import numpy as np
import pandas as pd
from pathlib import Path
from utils import compute_averages, merge_matrix_avgs_friend_list, get_friends_watch_ratio

# Prepare environment

In [2]:
# --- Reproducibility --------------------------------------------------------
# Seed NumPy's legacy global random generator once, up front.
SEED = 42
np.random.seed(seed=SEED)

# --- Tunable parameters -----------------------------------------------------
# Neither constant is referenced in the cells visible in this notebook section;
# they are kept here so later stages can read them from one place.
TRAIN_TEST_SPLIT = 0.8
CORR_THRESHOLD = 0.2

# --- Directories (plain strings with a trailing slash) ----------------------
# Other cells build file paths with `+`, so these must remain `str`, not `Path`.
CLEANED_DATA_PATH = "../data/cleaned/"
INTERMEDIATE_SAVE_PATH = "../data/big_matrix_intermediate_saves/"

# Create the output directory (and any missing parents); no error if it exists.
Path(INTERMEDIATE_SAVE_PATH).mkdir(exist_ok=True, parents=True)

# Load Data

In [3]:
# Read the three cleaned parquet files from CLEANED_DATA_PATH, in the same
# order as the names on the left-hand side. `user_features_cleaned` is loaded
# here but not referenced by the cells visible in this notebook section.
big_matrix_cleaned, user_features_cleaned, social_network_cleaned = (
    pd.read_parquet(CLEANED_DATA_PATH + parquet_name)
    for parquet_name in (
        "big_matrix_cleaned.parquet",
        "user_features_cleaned.parquet",
        "social_network_cleaned.parquet",
    )
)

# Data preprocessing for AI

### For each user, compute their average watch_ratio and the average length of the videos they watch

In [4]:
# Aggregate the cleaned interaction matrix with the project helper
# `compute_averages` (imported from utils; its implementation is not visible
# here). Judging by the stored output, the result appears to hold one row per
# user_id with watch_ratio_mean, video_length_mean and hour columns.
data_avg_per_user = compute_averages(big_matrix_cleaned)

# Sanity check: the bare last expression makes the notebook render the frame.
data_avg_per_user

Unnamed: 0,user_id,watch_ratio_mean,video_length_mean,hour
0,0,0.981752,-0.136428,7.339897
1,1,0.902738,-0.064269,12.411630
2,2,0.631369,0.017857,13.254464
3,3,0.917419,-0.159442,10.832476
4,4,0.807428,0.119497,13.633124
...,...,...,...,...
6894,7171,0.793249,0.053783,11.670009
6895,7172,0.951209,-0.040759,8.659408
6896,7173,0.700544,0.108225,12.525253
6897,7174,0.795941,-0.046276,8.522777


### Merge the previously calculated per-user averages and the friend lists into big_matrix

In [5]:
# Combine the interaction matrix, the per-user averages and the social network
# through the project helper `merge_matrix_avgs_friend_list` (from utils).
# The stored output shows the interaction columns plus watch_ratio_mean,
# video_length_mean, hour and a friend_list column.
# NOTE(review): the stored output also contains a pandas SettingWithCopyWarning
# pointing at `merged["time"] = pd.to_datetime(merged["time"])`. That line is
# not in this notebook — presumably it is inside the helper — so the fix
# (assign on an explicit copy) belongs there, not in this cell.
enhanced_data = merge_matrix_avgs_friend_list(big_matrix_cleaned, data_avg_per_user, social_network_cleaned)

# Sanity check: the bare last expression makes the notebook render the frame.
enhanced_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged["time"] = pd.to_datetime(merged["time"])


Unnamed: 0,user_id,video_id,watch_ratio,video_length,time,watch_ratio_mean,video_length_mean,hour,friend_list
0,0,3649,1.273397,0,2020-07-05 00:08:23.438,0.981752,-0.136428,7.339897,[]
1,0,5262,0.107613,0,2020-07-05 00:16:06.687,0.981752,-0.136428,7.339897,[]
2,0,1963,0.089885,0,2020-07-05 00:20:26.792,0.981752,-0.136428,7.339897,[]
3,0,8234,0.078000,0,2020-07-05 00:43:05.128,0.981752,-0.136428,7.339897,[]
4,0,8228,1.572295,0,2020-07-05 01:00:25.500,0.981752,-0.136428,7.339897,[]
...,...,...,...,...,...,...,...,...,...
9729356,7175,6597,1.004462,0,2020-09-05 06:35:01.104,0.790256,0.006378,12.806760,[]
9729357,7175,6630,0.313389,0,2020-09-05 15:00:33.379,0.790256,0.006378,12.806760,[]
9729358,7175,10360,0.340597,0,2020-09-05 19:10:29.041,0.790256,0.006378,12.806760,[]
9729359,7175,10360,0.913400,0,2020-09-05 19:10:36.995,0.790256,0.006378,12.806760,[]


In [6]:
# Build the final table with the project helper `get_friends_watch_ratio`
# (from utils; implementation not visible here).
# In the stored output the friend_list column is no longer present, a
# watch_ratio_prior_mean column has been added, and the rows appear to be in
# ascending `time` order instead of grouped by user_id.
# NOTE(review): how watch_ratio_prior_mean is computed (and why the earliest
# rows are 0.0) is defined in utils — confirm there before relying on it.
final_data = get_friends_watch_ratio(enhanced_data)

# Sanity check: the bare last expression makes the notebook render the frame.
final_data

Unnamed: 0,user_id,video_id,watch_ratio,video_length,time,watch_ratio_mean,video_length_mean,hour,watch_ratio_prior_mean
0,5928,9541,2.555187,-1,2020-06-24 06:12:22.270,1.109535,-0.179052,9.641897,0.000000
1,2783,3533,0.595189,0,2020-06-25 11:29:44.811,0.938616,-0.002587,11.550453,0.000000
2,2783,9545,1.937264,-1,2020-06-25 11:36:13.188,0.938616,-0.002587,11.550453,0.000000
3,2783,5206,0.606315,0,2020-06-25 12:46:51.034,0.938616,-0.002587,11.550453,0.000000
4,2783,5166,0.796302,0,2020-06-25 13:09:35.018,0.938616,-0.002587,11.550453,0.000000
...,...,...,...,...,...,...,...,...,...
9729356,5702,9041,1.183844,0,2020-09-10 06:43:04.602,0.800965,-0.149668,6.650498,0.986430
9729357,5702,5751,0.197555,0,2020-09-10 06:52:44.823,0.800965,-0.149668,6.650498,0.789548
9729358,5702,4605,1.313025,0,2020-09-10 06:59:22.691,0.800965,-0.149668,6.650498,1.144480
9729359,5702,7043,0.276921,0,2020-09-10 07:28:52.192,0.800965,-0.149668,6.650498,0.908206


In [7]:
# Persist the final table (row index included) to the intermediate-save
# directory so later stages can reload it without recomputing.
final_data.to_parquet(
    f"{INTERMEDIATE_SAVE_PATH}final_data.parquet",
    index=True,
)