## Import necessary libraries

In [1]:
# Data imports
import numpy as np
import pandas as pd
import warnings
import re


# --- Notebook-wide configuration -------------------------------------------
# SEED drives the global NumPy RNG below.
# TRAIN_TEST_SPLIT / CORR_THRESHOLD are not referenced in the visible cells;
# presumably the train fraction and a correlation cut-off -- TODO confirm.
SEED, TRAIN_TEST_SPLIT, CORR_THRESHOLD = 42, 0.8, 0.2

# Seed the legacy global NumPy generator so stochastic steps are repeatable.
np.random.seed(SEED)

# NOTE(review): this silences *every* warning for the rest of the session,
# including pandas' chained-assignment diagnostics.
warnings.filterwarnings("ignore")

## Load data

In [2]:
# Read the three cleaned parquet inputs (interaction log, user features,
# social graph) in one pass; the tuple order matches the names on the left.
big_matrix_cleaned, user_features, social_network_cleaned = map(
    pd.read_parquet,
    (
        "data/big_matrix_cleaned.parquet",
        "data/user_features.parquet",
        "data/social_network_cleaned.parquet",
    ),
)

## Data preprocessing for AI

In [3]:
# Build the row-level modelling frame: one row per interaction, enriched with
# per-user averages and the user's friend list.

# Parse the timestamps once; both frames below reuse the same Series.
event_time = pd.to_datetime(big_matrix_cleaned['time'])

# Per-user averages of the numeric columns plus the mean hour-of-day.
# `.assign` returns a new frame, so `big_matrix_cleaned` is never mutated.
# NOTE(review): these means are taken over *all* of a user's rows, including
# the row they are later joined onto -- confirm this is acceptable for the
# downstream model (possible target leakage through `watch_ratio_mean`).
data_avg = (
    big_matrix_cleaned
    .assign(time=event_time, hour=event_time.dt.hour)
    .groupby("user_id")
    .mean(numeric_only=True)
    .reset_index()
    # Averages of identifiers / the `like` column are not used as features.
    .drop(columns=["video_id", "like"])
    .rename(columns={"watch_ratio": "watch_ratio_mean", "video_length": "video_length_mean"})
)

# Row-level frame. Using `.assign` on the column subset (instead of item
# assignment on a slice) avoids pandas' chained-assignment pitfall, which the
# global warning filter would otherwise hide.
# After the join, `hour` is the user's *mean* hour, not the hour of the row.
data_for_ai = (
    big_matrix_cleaned[["user_id", "video_id", "watch_ratio", "video_length", "time"]]
    .assign(time=event_time)
    .merge(data_avg, on="user_id", how="left")
    # NOTE(review): assumes one row per user_id in social_network_cleaned;
    # duplicates there would multiply interaction rows -- TODO confirm.
    .merge(social_network_cleaned, on="user_id", how="left")
)

# Users absent from the social graph get NaN from the left join; normalise
# those to an empty friend list so later list operations are uniform.
data_for_ai["friend_list"] = data_for_ai["friend_list"].apply(
    lambda x: [] if isinstance(x, float) and np.isnan(x) else x
)

data_for_ai

Unnamed: 0,user_id,video_id,watch_ratio,video_length,time,watch_ratio_mean,video_length_mean,hour,friend_list
0,0,3649,1.273397,0,2020-07-05 00:08:23.438,0.981752,-0.136428,7.339897,[]
1,0,5262,0.107613,0,2020-07-05 00:16:06.687,0.981752,-0.136428,7.339897,[]
2,0,1963,0.089885,0,2020-07-05 00:20:26.792,0.981752,-0.136428,7.339897,[]
3,0,8234,0.078000,0,2020-07-05 00:43:05.128,0.981752,-0.136428,7.339897,[]
4,0,8228,1.572295,0,2020-07-05 01:00:25.500,0.981752,-0.136428,7.339897,[]
...,...,...,...,...,...,...,...,...,...
9729356,7175,6597,1.004462,0,2020-09-05 06:35:01.104,0.790256,0.006378,12.806760,[]
9729357,7175,6630,0.313389,0,2020-09-05 15:00:33.379,0.790256,0.006378,12.806760,[]
9729358,7175,10360,0.340597,0,2020-09-05 19:10:29.041,0.790256,0.006378,12.806760,[]
9729359,7175,10360,0.913400,0,2020-09-05 19:10:36.995,0.790256,0.006378,12.806760,[]


In [4]:
# Feature: `watch_ratio_prior_mean`
#   For every interaction, the mean watch_ratio that the user's friends gave
#   the same video strictly before this interaction; when no friend has such
#   history, fall back to the mean watch_ratio of all earlier rows for that
#   video. Rows with neither kind of history stay NaN.

# Step 1: Sort by time.
# `.assign` builds a new frame, so `data_for_ai` is not mutated through an
# alias and this cell can be re-run safely. A stable sort keeps the input
# order for tied timestamps, making the shift-based prior mean reproducible.
df = (
    data_for_ai
    .assign(time=pd.to_datetime(data_for_ai['time']))
    .sort_values('time', kind='stable')
    .reset_index(drop=True)
)

# Step 2: Compute prior video mean (exclude current row using shift)
df['video_watch_ratio_prior_mean'] = (
    df
    .groupby('video_id')['watch_ratio']
    .transform(lambda x: x.shift().expanding().mean())
)

# Step 3: Explode friends -- one row per (interaction, friend).
# Only the join keys are carried along, and interactions without friends
# (NaN after explode) are dropped up front: they can never match below.
df_exploded = (
    df[['user_id', 'video_id', 'time', 'friend_list']]
    .explode('friend_list')
    .rename(columns={'friend_list': 'friend_id'})
    .dropna(subset=['friend_id'])
)

# Step 4: The same interaction log, re-labelled from the friend's point of view.
friend_history = df[['user_id', 'video_id', 'time', 'watch_ratio']].rename(
    columns={
        'user_id': 'friend_id',
        'time': 'friend_time',
        'watch_ratio': 'friend_watch_ratio',
    }
)

# Step 5: Merge and keep only friend views that happened strictly earlier.
# An inner join is sufficient: unmatched rows would have a missing
# `friend_time` and be discarded by the comparison anyway.
merged = df_exploded.merge(friend_history, on=['friend_id', 'video_id'], how='inner')
merged = merged[merged['friend_time'] < merged['time']]

# Step 6: Compute friend mean per original interaction.
friend_means = (
    merged
    .groupby(['user_id', 'video_id', 'time'])['friend_watch_ratio']
    .mean()
    .reset_index()
    .rename(columns={'friend_watch_ratio': 'friend_watch_ratio_prior_mean'})
)

# Step 7: Merge back friend means and fall back to video means.
df = df.merge(friend_means, on=['user_id', 'video_id', 'time'], how='left')
df['watch_ratio_prior_mean'] = df['friend_watch_ratio_prior_mean'].fillna(df['video_watch_ratio_prior_mean'])

# Drop the helper columns; only the combined feature is kept.
df = df.drop(columns=['friend_watch_ratio_prior_mean', 'video_watch_ratio_prior_mean', 'friend_list'])


In [5]:
# Display the engineered frame (pandas truncates the rendering to the first
# and last rows). In the output below, the earliest rows show an empty
# `watch_ratio_prior_mean`.
df

Unnamed: 0,user_id,video_id,watch_ratio,video_length,time,watch_ratio_mean,video_length_mean,hour,watch_ratio_prior_mean
0,5928,9541,2.555187,-1,2020-06-24 06:12:22.270,1.109535,-0.179052,9.641897,
1,2783,3533,0.595189,0,2020-06-25 11:29:44.811,0.938616,-0.002587,11.550453,
2,2783,9545,1.937264,-1,2020-06-25 11:36:13.188,0.938616,-0.002587,11.550453,
3,2783,5206,0.606315,0,2020-06-25 12:46:51.034,0.938616,-0.002587,11.550453,
4,2783,5166,0.796302,0,2020-06-25 13:09:35.018,0.938616,-0.002587,11.550453,
...,...,...,...,...,...,...,...,...,...
9729356,5702,9041,1.183844,0,2020-09-10 06:43:04.602,0.800965,-0.149668,6.650498,0.986430
9729357,5702,5751,0.197555,0,2020-09-10 06:52:44.823,0.800965,-0.149668,6.650498,0.789548
9729358,5702,4605,1.313025,0,2020-09-10 06:59:22.691,0.800965,-0.149668,6.650498,1.144480
9729359,5702,7043,0.276921,0,2020-09-10 07:28:52.192,0.800965,-0.149668,6.650498,0.908206


In [6]:
# Persist the final feature table to data/final_data.parquet;
# index=True asks pandas to store the DataFrame index as well.
df.to_parquet("data/final_data.parquet", index=True)