## Import necessary libraries

In [1]:
# Data imports
import numpy as np
import pandas as pd
import warnings
import re


SEED = 42
TRAIN_TEST_SPLIT = 0.8
CORR_THRESHOLD = 0.2

warnings.filterwarnings("ignore")
np.random.seed(SEED)

# Load Data

In [2]:
small_matrix_cleaned = pd.read_parquet("data/small_matrix_cleaned.parquet")
user_features = pd.read_parquet("data/user_features.parquet")
social_network_cleaned = pd.read_parquet("data/social_network_cleaned.parquet")

# Data preprocess for AI

In [3]:
data_avg = small_matrix_cleaned.copy()

data_avg['time'] = pd.to_datetime(data_avg['time'])

data_avg['hour'] = data_avg['time'].dt.hour

data_avg = data_avg.groupby("user_id").mean(numeric_only=True)
data_avg = data_avg.reset_index()
data_avg = data_avg.drop(columns=["video_id", "like"])
data_avg = data_avg.rename(columns={"watch_ratio": "watch_ratio_mean", "video_length": "video_length_mean"})

data_for_ai = small_matrix_cleaned[["user_id", "video_id", "watch_ratio", "video_length", "time"]]
data_for_ai['time'] = pd.to_datetime(data_for_ai['time'])
data_for_ai = data_for_ai.merge(data_avg, on="user_id", how="left")
data_for_ai = data_for_ai.merge(social_network_cleaned, on="user_id", how="left")
data_for_ai["friend_list"] = data_for_ai["friend_list"].apply(
    lambda x: [] if isinstance(x, float) and np.isnan(x) else x
)

data_for_ai

Unnamed: 0,user_id,video_id,watch_ratio,video_length,time,watch_ratio_mean,video_length_mean,hour,friend_list
0,14,148,0.722103,-1,2020-07-05 05:27:48.378,0.969348,-0.101666,9.684247,[]
1,14,183,1.907377,-1,2020-07-05 05:28:00.057,0.969348,-0.101666,9.684247,[]
2,14,3649,2.063311,0,2020-07-05 05:29:09.479,0.969348,-0.101666,9.684247,[]
3,14,5262,0.566388,0,2020-07-05 05:30:43.285,0.969348,-0.101666,9.684247,[]
4,14,8234,0.418364,0,2020-07-05 05:35:43.459,0.969348,-0.101666,9.684247,[]
...,...,...,...,...,...,...,...,...,...
3857008,7162,9177,0.142857,1,2020-09-01 20:06:35.984,1.123807,-0.114988,7.869349,[]
3857009,7162,4987,1.234848,0,2020-09-02 14:44:51.342,1.123807,-0.114988,7.869349,[]
3857010,7162,7988,1.024412,1,2020-09-03 08:45:01.474,1.123807,-0.114988,7.869349,[]
3857011,7162,6533,0.273750,0,2020-09-04 22:56:32.021,1.123807,-0.114988,7.869349,[]


In [6]:
df = data_for_ai

# Step 1: Sort by time
df['time'] = pd.to_datetime(df['time'])
df = df.sort_values('time').reset_index(drop=True)

# Step 2: Compute prior video mean (exclude current row using shift)
df['video_watch_ratio_prior_mean'] = (
    df
    .groupby('video_id')['watch_ratio']
    .transform(lambda x: x.shift().expanding().mean())
)

# Step 3: Explode friends
df_exploded = df.explode('friend_list').rename(columns={'friend_list': 'friend_id'})

# Step 4: Join back with historical data for that friend + video (prior to time)
friend_history = df[['user_id', 'video_id', 'time', 'watch_ratio']].copy()
friend_history.columns = ['friend_id', 'video_id', 'friend_time', 'friend_watch_ratio']

# Step 5: Merge and filter only past friend data
merged = df_exploded.merge(friend_history, on=['friend_id', 'video_id'], how='left')
merged = merged[merged['friend_time'] < merged['time']]

# Step 6: Compute friend mean
friend_means = (
    merged
    .groupby(['user_id', 'video_id', 'time'])['friend_watch_ratio']
    .mean()
    .reset_index()
    .rename(columns={'friend_watch_ratio': 'friend_watch_ratio_prior_mean'})
)

# Step 7: Merge back friend means and fallback to video means
df = df.merge(friend_means, on=['user_id', 'video_id', 'time'], how='left')
df['watch_ratio_prior_mean'] = df['friend_watch_ratio_prior_mean'].fillna(df['video_watch_ratio_prior_mean'])

# Optional cleanup
df = df.drop(columns=['friend_watch_ratio_prior_mean', 'video_watch_ratio_prior_mean'])


In [None]:
df

Unnamed: 0,user_id,video_id,watch_ratio,video_length,time,watch_ratio_mean,video_length_mean,hour,friend_list,watch_ratio_prior_mean
0,6190,9559,1.981442,0,2020-07-04 02:23:26.060,1.037910,-0.104526,9.770474,[],
1,6190,9553,0.964642,0,2020-07-04 04:03:18.888,1.037910,-0.104526,9.770474,[],
2,6190,9530,0.112044,0,2020-07-04 04:03:53.725,1.037910,-0.104526,9.770474,[],
3,6190,8176,0.700000,0,2020-07-04 04:35:24.528,1.037910,-0.104526,9.770474,[],
4,6190,8189,0.906852,0,2020-07-04 06:32:23.949,1.037910,-0.104526,9.770474,[],
...,...,...,...,...,...,...,...,...,...,...
3857008,4766,10291,0.171522,1,2020-09-05 23:52:40.419,0.937582,-0.098420,11.584411,[],0.273859
3857009,6682,3151,0.612817,0,2020-09-05 23:52:56.230,0.759180,-0.105393,9.305252,[],0.892070
3857010,6139,9112,0.451241,0,2020-09-05 23:53:50.831,0.724104,-0.104069,8.974073,[],0.826597
3857011,5450,9162,0.871151,0,2020-09-05 23:57:15.282,0.711956,-0.103424,12.812369,[],0.989261


In [7]:
def range_to_mean(val):
    if '500+' in val:
        return 500  # or a higher estimate if preferred
    match = re.match(r'\((\d+),(\d+)\]', val)
    if match:
        low, high = map(int, match.groups())
        return (low + high) / 2
    return np.nan  # fallback in case of unexpected format

def range_to_mean_bis(val):
    if '0' in val:
        return 0  # or a higher estimate if preferred
    match = re.match(r'\[(\d+),(\d+)\)', val)
    if match:
        low, high = map(int, match.groups())
        return (low + high) / 2
    return np.nan  # fallback in case of unexpected format

def days_to_mean(val):
    if '500+' in val:
        return 500  # or a higher estimate if preferred
    match = re.match(r'(\d+)-(\d+)', val)
    if match:
        low, high = map(int, match.groups())    
        return (low + high) / 2
    return np.nan  # fallback in case of unexpected format

In [None]:
final_data = df.copy()
final_data = final_data.merge(user_features, on="user_id", how="left")
#final_data = final_data.merge(videos_pca_train, on="video_id", how="left")
final_data = final_data.drop(columns=["user_id", "video_id"])
final_data['user_active_degree'] = pd.factorize(final_data['user_active_degree'])[0]

final_data['follow_user_num_range'] = final_data['follow_user_num_range'].apply(range_to_mean)
final_data['fans_user_num_range'] = final_data['fans_user_num_range'].apply(range_to_mean_bis)
final_data['friend_user_num_range'] = final_data['friend_user_num_range'].apply(range_to_mean_bis)
final_data['register_days_range'] = final_data['register_days_range'].apply(days_to_mean)
final_data = final_data.fillna(0)

## Only keep meaningful features

In [9]:
corr = final_data.corr()["watch_ratio"]
corr

watch_ratio               1.000000
video_length             -0.446454
time                     -0.007629
watch_ratio_mean          0.242036
video_length_mean        -0.024745
hour                      0.002531
watch_ratio_prior_mean    0.515066
user_active_degree       -0.009268
is_lowactive_period            NaN
is_live_streamer         -0.010062
is_video_author          -0.011930
follow_user_num          -0.002115
follow_user_num_range     0.003589
fans_user_num            -0.004117
fans_user_num_range            NaN
friend_user_num           0.000129
friend_user_num_range     0.006425
register_days            -0.005658
register_days_range      -0.002350
onehot_feat0             -0.000525
onehot_feat1              0.003763
onehot_feat2              0.009550
onehot_feat3              0.008513
onehot_feat4             -0.009046
onehot_feat5              0.005654
onehot_feat6             -0.017054
onehot_feat7              0.011141
onehot_feat8              0.021214
onehot_feat9        

In [10]:
corr[corr > CORR_THRESHOLD]

watch_ratio               1.000000
watch_ratio_mean          0.242036
watch_ratio_prior_mean    0.515066
Name: watch_ratio, dtype: float64

In [11]:
final_data = final_data[["time"] + corr[corr > CORR_THRESHOLD].index.to_list()]

## Save Data for AI

In [12]:
final_data.to_parquet("data/data.parquet", index=True)