## Import necessary libraries

In [1]:
# Data imports
import numpy as np
import pandas as pd
import warnings
import re


# Notebook-wide configuration constants.
SEED = 42  # passed to np.random.seed below
TRAIN_TEST_SPLIT = 0.8  # not referenced in the cells shown here; presumably the training fraction — TODO confirm
CORR_THRESHOLD = 0.2  # not referenced in the cells shown here; presumably a correlation cutoff — TODO confirm

# Suppresses every Python warning raised after this point (all categories).
warnings.filterwarnings("ignore")
# Seeds NumPy's legacy global random number generator.
np.random.seed(SEED)

# Load Data

In [2]:
# Read the two parquet inputs. The paths are relative, so they resolve against
# the kernel's working directory. Both frames are joined on "user_id" later.
df = pd.read_parquet("data/final_data.parquet")
user_features = pd.read_parquet("data/user_features.parquet")

# Data preprocessing for AI

In [3]:
def range_to_mean(val):
    """Convert a ``(low,high]`` bucket label into a single numeric estimate.

    Parameters
    ----------
    val : str
        Bucket label, e.g. ``"(10,50]"`` or the open-ended ``"500+"``.
        Non-string values (``None``, ``NaN``) are accepted and treated as
        missing.

    Returns
    -------
    int or float
        ``500`` when the label contains ``"500+"``; the midpoint
        ``(low + high) / 2`` when the label starts with ``(low,high]``;
        ``np.nan`` for anything else.
    """
    # A left merge can leave NaN (a float) in the column this is applied to;
    # the substring test below would raise TypeError on it, so treat every
    # non-string value as missing instead of crashing.
    if not isinstance(val, str):
        return np.nan
    if '500+' in val:
        return 500  # open-ended bucket: its lower bound is used as the estimate
    match = re.match(r'\((\d+),(\d+)\]', val)
    if match:
        low, high = map(int, match.groups())
        return (low + high) / 2
    return np.nan  # unrecognised label (this includes a bare "0")

def range_to_mean_bis(val):
    """Convert a ``[low,high)`` bucket label into a single numeric estimate.

    Parameters
    ----------
    val : str
        Bucket label, e.g. ``"[10,100)"`` or the literal ``"0"``.
        Surrounding whitespace is ignored. Non-string values (``None``,
        ``NaN``) are accepted and treated as missing.

    Returns
    -------
    int or float
        ``0`` when the label is exactly ``"0"``; the midpoint
        ``(low + high) / 2`` when the label starts with ``[low,high)`` and
        both bounds are plain integers; ``np.nan`` for anything else.
    """
    # A left merge can leave NaN (a float) in the column this is applied to;
    # treat every non-string value as missing instead of raising TypeError.
    if not isinstance(val, str):
        return np.nan
    label = val.strip()
    # Exact comparison on purpose: a substring test ('0' in val) also fires on
    # labels such as "[10,100)" or "[30,60)" and would collapse them to 0.
    if label == '0':
        return 0
    match = re.match(r'\[(\d+),(\d+)\)', label)
    if match:
        low, high = map(int, match.groups())
        return (low + high) / 2
    # NOTE(review): open-ended labels ("N+") and bounds with non-digit suffixes
    # do not match the pattern above and end up here — confirm whether the
    # data contains such labels and how they should be estimated.
    return np.nan

def days_to_mean(val):
    """Convert a ``low-high`` day-range label into a single numeric estimate.

    Parameters
    ----------
    val : str
        Range label, e.g. ``"15-30"`` or the open-ended ``"500+"``.
        Non-string values (``None``, ``NaN``) are accepted and treated as
        missing.

    Returns
    -------
    int or float
        ``500`` when the label contains ``"500+"``; the midpoint
        ``(low + high) / 2`` when the label starts with ``low-high``;
        ``np.nan`` for anything else.
    """
    # A left merge can leave NaN (a float) in the column this is applied to;
    # the substring test below would raise TypeError on it, so treat every
    # non-string value as missing instead of crashing.
    if not isinstance(val, str):
        return np.nan
    # NOTE(review): "500+" is the only open-ended label recognised; a label
    # such as "730+" does not contain it and falls through to NaN — confirm
    # this matches the values actually present in the column.
    if '500+' in val:
        return 500  # open-ended bucket: its lower bound is used as the estimate
    match = re.match(r'(\d+)-(\d+)', val)
    if match:
        low, high = map(int, match.groups())
        return (low + high) / 2
    return np.nan  # unrecognised label

In [4]:
# Attach the per-user attributes to every row of df (left join on "user_id"),
# then replace the categorical and bucketed user columns with plain numbers.
# Not done here: merging videos_pca_train on "video_id" and dropping the
# "user_id"/"video_id" columns (both steps are disabled).
final_data = (
    df.copy()
    .merge(user_features, on="user_id", how="left")
    .assign(
        # integer code per distinct value, numbered in order of first appearance
        user_active_degree=lambda frame: pd.factorize(frame['user_active_degree'])[0],
        # bucket labels -> numeric estimate, one parser per label format
        follow_user_num_range=lambda frame: frame['follow_user_num_range'].apply(range_to_mean),
        fans_user_num_range=lambda frame: frame['fans_user_num_range'].apply(range_to_mean_bis),
        friend_user_num_range=lambda frame: frame['friend_user_num_range'].apply(range_to_mean_bis),
        register_days_range=lambda frame: frame['register_days_range'].apply(days_to_mean),
    )
    # whatever is still missing after the conversions (unparsed labels,
    # unmatched users) becomes 0
    .fillna(0)
)

In [5]:
# Persist the preprocessed frame; index=True stores the DataFrame index in the
# parquet file alongside the columns.
final_data.to_parquet("data/merged.parquet", index=True)