## Import necessary libraries

In [1]:
# Data imports
import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import seaborn as sns
import ast
import re

# Fixed seed; used just below to seed NumPy's global random generator.
SEED = 42
# NOTE(review): this silences *every* warning for the rest of the session,
# including pandas diagnostics such as SettingWithCopyWarning. Consider
# narrowing the filter to specific categories/modules.
warnings.filterwarnings("ignore")
# Seed NumPy's legacy global RNG so np.random.* calls are repeatable.
np.random.seed(SEED)

# Load Data

In [2]:
# Interaction table; later cells read its user_id, video_id, watch_ratio,
# video_length, like and time columns. Path is relative to the working directory.
small_matrix_cleaned = pd.read_parquet("data/small_matrix_cleaned.parquet")
# Per-user feature table, joined on user_id further down.
# NOTE(review): the variable is named *_cleaned but the file read is
# "user_features.parquet" — confirm this is the cleaned version.
user_features_cleaned = pd.read_parquet("data/user_features.parquet")

# Data preprocessing for AI

In [3]:
# --- Per-user aggregates -----------------------------------------------------
# Work on a copy so the loaded frame is never mutated by this cell.
data_avg = small_matrix_cleaned.copy()
data_avg['time'] = pd.to_datetime(data_avg['time'])
# Hour of day (0-23) of each interaction; averaged per user below.
data_avg['hour'] = data_avg['time'].dt.hour

# Mean of every numeric column per user. `time` is a datetime column and is
# therefore excluded by numeric_only=True. The means of `video_id` and `like`
# are dropped right after.
data_avg = (
    data_avg
    .groupby("user_id")
    .mean(numeric_only=True)
    .reset_index()
    .drop(columns=["video_id", "like"])
    .rename(columns={"watch_ratio": "watch_ratio_mean", "video_length": "video_length_mean"})
)

# --- Row-level table ---------------------------------------------------------
# Take an explicit copy of the column selection: assigning into a bare
# selection of `small_matrix_cleaned` is the pattern pandas flags with
# SettingWithCopyWarning (hidden here by the global warnings filter).
data_for_ai = small_matrix_cleaned[["user_id", "watch_ratio", "video_length", "time"]].copy()
data_for_ai['time'] = pd.to_datetime(data_for_ai['time'])

# Attach the per-user aggregates to every interaction row.
# NOTE: after this merge the `hour` column holds the user's *mean* activity
# hour (a float), not the hour of the individual interaction.
data_for_ai = data_for_ai.merge(data_avg, on="user_id", how="left")

data_for_ai

Unnamed: 0,user_id,watch_ratio,video_length,time,watch_ratio_mean,video_length_mean,hour
0,14,0.722103,-1,2020-07-05 05:27:48.378,0.939212,-0.056158,9.575698
1,14,1.907377,-1,2020-07-05 05:28:00.057,0.939212,-0.056158,9.575698
2,14,2.063311,0,2020-07-05 05:29:09.479,0.939212,-0.056158,9.575698
3,14,0.566388,0,2020-07-05 05:30:43.285,0.939212,-0.056158,9.575698
4,14,0.418364,0,2020-07-05 05:35:43.459,0.939212,-0.056158,9.575698
...,...,...,...,...,...,...,...
4083324,7162,0.142857,1,2020-09-01 20:06:35.984,1.087398,-0.067249,7.822461
4083325,7162,1.234848,0,2020-09-02 14:44:51.342,1.087398,-0.067249,7.822461
4083326,7162,1.024412,1,2020-09-03 08:45:01.474,1.087398,-0.067249,7.822461
4083327,7162,0.273750,0,2020-09-04 22:56:32.021,1.087398,-0.067249,7.822461


In [4]:
def range_to_mean(val):
    """Convert a ``'(low,high]'`` range label to a single numeric value.

    Parameters
    ----------
    val : str or missing value
        Range label such as ``'(10,50]'`` or the open-ended ``'500+'``.

    Returns
    -------
    int or float
        * ``500`` when the label contains ``'500+'`` (the lower bound is used
          as the estimate for the open-ended bucket),
        * the midpoint ``(low + high) / 2`` for a ``'(low,high]'`` label,
        * ``np.nan`` for anything else, including non-string input.
    """
    # Missing values (e.g. NaN produced by a left merge) are not strings and
    # would make the substring test below raise TypeError; report them as NaN
    # so the caller can impute them.
    if not isinstance(val, str):
        return np.nan
    if '500+' in val:
        return 500  # or a higher estimate if preferred
    match = re.match(r'\((\d+),(\d+)\]', val)
    if match:
        low, high = map(int, match.groups())
        return (low + high) / 2
    return np.nan  # fallback in case of unexpected format

def range_to_mean_bis(val):
    """Convert a ``'[low,high)'`` range label to a single numeric value.

    Parameters
    ----------
    val : str or missing value
        Range label such as ``'[10,100)'``, or the literal ``'0'``.

    Returns
    -------
    int or float
        * ``0`` when the label is exactly ``'0'`` (ignoring surrounding
          whitespace),
        * the midpoint ``(low + high) / 2`` for a ``'[low,high)'`` label whose
          bounds are plain integers,
        * ``np.nan`` for anything else, including non-string input.
    """
    # Missing values (e.g. NaN produced by a left merge) are not strings.
    if not isinstance(val, str):
        return np.nan
    # Compare for equality, not containment: a substring test would also fire
    # for every label that merely contains the digit 0, such as '[10,100)',
    # and collapse those buckets to 0.
    if val.strip() == '0':
        return 0
    match = re.match(r'\[(\d+),(\d+)\)', val)
    if match:
        low, high = map(int, match.groups())
        return (low + high) / 2
    return np.nan  # fallback in case of unexpected format

def days_to_mean(val):
    """Convert a ``'low-high'`` day-range label to a single numeric value.

    Parameters
    ----------
    val : str or missing value
        Range label such as ``'15-30'`` or an open-ended ``'N+'`` label
        (e.g. ``'500+'``).

    Returns
    -------
    int or float
        * ``N`` for an open-ended ``'N+'`` label (the lower bound is used as
          the estimate); ``'500+'`` therefore still yields ``500``,
        * the midpoint ``(low + high) / 2`` for a ``'low-high'`` label,
        * ``np.nan`` for anything else, including non-string input.
    """
    # Missing values (e.g. NaN produced by a left merge) are not strings.
    if not isinstance(val, str):
        return np.nan
    # Accept any open-ended bucket rather than only the literal '500+', so a
    # differently-capped top bucket is not silently turned into NaN.
    open_ended = re.search(r'(\d+)\+', val)
    if open_ended:
        return int(open_ended.group(1))
    match = re.match(r'(\d+)-(\d+)', val)
    if match:
        low, high = map(int, match.groups())
        return (low + high) / 2
    return np.nan  # fallback in case of unexpected format

In [5]:
# Build the model-ready table in one pass:
#   1. attach the per-user features to every interaction row (left join, so
#      rows whose user has no feature record are kept),
#   2. drop the join key,
#   3. integer-encode `user_active_degree` and turn each bucketed range label
#      into a number with the matching parser,
#   4. replace whatever is still missing with 0.
# `merge` returns a new frame, so `data_for_ai` itself is left untouched.
final_data = (
    data_for_ai
    .merge(user_features_cleaned, on="user_id", how="left")
    .drop(columns=["user_id"])
    .assign(
        user_active_degree=lambda frame: pd.factorize(frame['user_active_degree'])[0],
        follow_user_num_range=lambda frame: frame['follow_user_num_range'].apply(range_to_mean),
        fans_user_num_range=lambda frame: frame['fans_user_num_range'].apply(range_to_mean_bis),
        friend_user_num_range=lambda frame: frame['friend_user_num_range'].apply(range_to_mean_bis),
        register_days_range=lambda frame: frame['register_days_range'].apply(days_to_mean),
    )
    .fillna(0)
)

## Save Data for AI

In [6]:
# Persist the processed table; index=True also writes the DataFrame index
# into the parquet file. The path is relative to the working directory.
final_data.to_parquet("data/data.parquet", index=True)