# Imports

In [1]:
import pandas as pd
import os

# Loading the data

In [2]:
# Location of the raw KuaiRec 2.0 files. Not referenced by any cell visible in this section.
root = "data_final_project/KuaiRec 2.0/data/"
# Train / test splits read from ./exports (presumably written by an earlier notebook step -- verify).
train_data = pd.read_parquet("./exports/train_data.pq")
test_data = pd.read_parquet("./exports/test_data.pq")

In [3]:
def clean_and_format(df: pd.DataFrame) -> pd.DataFrame:
    """Normalise column types of an interaction frame.

    - ``timestamp`` is parsed with ``pd.to_datetime``.
    - every entry of ``friend_list`` becomes a plain ``list`` of ``int``.
    - ``user_active_degree`` is dropped when present.

    Note: the two column conversions are assigned onto the frame that was
    passed in, so the caller's object is modified as a side effect. This
    avoids copying the whole frame.

    Args:
        df: frame with at least ``timestamp`` and ``friend_list`` columns.

    Returns:
        The frame without ``user_active_degree``.
    """
    df["timestamp"] = pd.to_datetime(df["timestamp"])
    df["friend_list"] = df["friend_list"].apply(lambda friends: [int(friend) for friend in friends])
    # TODO: this drop belongs in the 01 step. errors="ignore" keeps this cell
    # working once the column has been removed upstream.
    df = df.drop(columns=["user_active_degree"], errors="ignore")
    return df

# Clean both splits with the same routine and rebind the names to the results.
train_data = clean_and_format(train_data)
test_data = clean_and_format(test_data)

In [4]:
# Columns are pruned this early because of limited RAM.
# Columns needed by the feature computations below are always kept; any other
# column is dropped when its absolute correlation with watch_ratio, measured on
# the training split, is below 0.1. The same columns are dropped from both splits.
usefull_cols_for_computations = ["user_id", "video_id", "timestamp", "friend_list", "watch_ratio", "first_level_category_id"]
to_remove = []
for col in train_data.columns:
    if col in usefull_cols_for_computations:
        continue
    if abs(train_data["watch_ratio"].corr(train_data[col])) < 0.1:
        to_remove.append(col)
train_data = train_data.drop(columns=to_remove)
test_data = test_data.drop(columns=to_remove)

In [5]:
def friends_watch_ratio(df):
    """Add ``friend_watch_ratio``: the mean watch ratio of a user's friends on the same video.

    For every row, the friends listed in ``friend_list`` are looked up in ``df``
    itself. Only friend interactions with the same ``video_id`` and a timestamp
    not later than the row's own ``timestamp`` are averaged. Rows without any
    such friend interaction fall back to ``video_cum_avg_watch_ratio``.

    The average is computed per input row. When a user has several
    interactions with the same video, each of them only sees the friend
    watches that happened up to its own timestamp.

    Args:
        df: frame with ``user_id``, ``video_id``, ``timestamp``, ``friend_list``,
            ``watch_ratio`` and ``video_cum_avg_watch_ratio`` columns.

    Returns:
        A new frame (input is not modified) in the same row order, with a
        fresh ``RangeIndex`` and the extra ``friend_watch_ratio`` column.

    Raises:
        KeyError: if ``video_cum_avg_watch_ratio`` is missing.
    """
    # Fail before the expensive explode/merge rather than after it.
    if 'video_cum_avg_watch_ratio' not in df.columns:
        raise KeyError(
            "friends_watch_ratio needs the 'video_cum_avg_watch_ratio' column; "
            "compute it before calling this function"
        )

    rows = df.reset_index(drop=True)

    # One line per (row, friend). The row position is kept in '_row' so the
    # aggregation below can be done per original row.
    exploded = (
        rows[['user_id', 'video_id', 'timestamp', 'friend_list']]
        .explode('friend_list')
        .rename(columns={'friend_list': 'friend_id'})
        .dropna(subset=['friend_id'])  # empty friend lists explode to NaN
        .rename_axis('_row')
        .reset_index()
    )
    # explode() leaves an object column; align it with the user_id key dtype.
    exploded['friend_id'] = exploded['friend_id'].astype(rows['user_id'].dtype)

    friend_watches = rows[['user_id', 'video_id', 'watch_ratio', 'timestamp']].rename(
        columns={'user_id': 'friend_id', 'watch_ratio': 'friend_watch_ratio', 'timestamp': 'friend_timestamp'}
    )

    # Unmatched pairs would be discarded by the timestamp filter anyway, so an
    # inner join gives the same rows without materialising them first.
    merged = exploded.merge(friend_watches, on=['friend_id', 'video_id'], how='inner')
    merged = merged[merged['friend_timestamp'] <= merged['timestamp']]

    mean_ratios = merged.groupby('_row')['friend_watch_ratio'].mean()
    rows['friend_watch_ratio'] = mean_ratios.reindex(rows.index)
    rows['friend_watch_ratio'] = rows['friend_watch_ratio'].fillna(rows['video_cum_avg_watch_ratio'])
    return rows

def _add_expanding_group_mean(df, group_cols, out_col, include_current=True):
    """Sort by ``group_cols`` + ``timestamp`` and add a running mean of ``watch_ratio`` per group.

    Args:
        df: frame with the ``group_cols``, ``timestamp`` and ``watch_ratio`` columns.
        group_cols: list of column names defining the groups.
        out_col: name of the column to add.
        include_current: when True (default) the running mean of a row includes
            that row's own ``watch_ratio``. When False only strictly earlier
            rows of the group are averaged, so the first row of every group
            gets NaN.

    Returns:
        A sorted copy of ``df`` (original index labels kept) with ``out_col`` added.
    """
    df = df.sort_values([*group_cols, 'timestamp'])
    group_levels = list(range(len(group_cols)))
    if include_current:
        running = df.groupby(group_cols)['watch_ratio'].expanding().mean()
    else:
        # Shift inside each group so a row never sees its own value.
        prior = df.groupby(group_cols)['watch_ratio'].shift()
        running = prior.groupby([df[col] for col in group_cols]).expanding().mean()
    # Drop the group levels so the result aligns with df on the original index.
    df[out_col] = running.reset_index(level=group_levels, drop=True)
    return df


def mean_by_first_level_cat(df, include_current=True):
    """Add ``cumulative_avg_watch_ratio``: running mean of ``watch_ratio`` per (user, first-level category).

    The returned frame is sorted by ``user_id``, ``first_level_category_id``, ``timestamp``.

    With the default ``include_current=True`` the value of a row includes the
    row's own ``watch_ratio`` (for the first row of a group it equals it).
    Pass ``include_current=False`` to average only earlier interactions.
    """
    return _add_expanding_group_mean(
        df, ['user_id', 'first_level_category_id'], 'cumulative_avg_watch_ratio', include_current
    )


def add_video_cumulative_avg_watch_ratio(df, include_current=True):
    """Add ``video_cum_avg_watch_ratio``: running mean of ``watch_ratio`` per video.

    The returned frame is sorted by ``video_id``, ``timestamp``.

    With the default ``include_current=True`` the value of a row includes the
    row's own ``watch_ratio``. Pass ``include_current=False`` to average only
    earlier interactions with the video.
    """
    return _add_expanding_group_mean(df, ['video_id'], 'video_cum_avg_watch_ratio', include_current)


def add_user_cumulative_avg_watch_ratio(df, include_current=True):
    """Add ``user_cum_avg_watch_ratio``: running mean of ``watch_ratio`` per user.

    The returned frame is sorted by ``user_id``, ``timestamp``.

    With the default ``include_current=True`` the value of a row includes the
    row's own ``watch_ratio``. Pass ``include_current=False`` to average only
    the user's earlier interactions.
    """
    return _add_expanding_group_mean(df, ['user_id'], 'user_cum_avg_watch_ratio', include_current)

In [6]:
def generate_features(df: pd.DataFrame) -> pd.DataFrame:
    """Run every feature builder on ``df`` and drop the raw ``friend_list`` column.

    The builders are applied in a fixed order. ``friends_watch_ratio`` runs
    last because it reads the ``video_cum_avg_watch_ratio`` column produced by
    ``add_video_cumulative_avg_watch_ratio``.

    Args:
        df: cleaned interaction frame.

    Returns:
        The frame with the engineered columns added and ``friend_list`` removed.
    """
    feature_steps = (
        add_user_cumulative_avg_watch_ratio,
        add_video_cumulative_avg_watch_ratio,
        mean_by_first_level_cat,
        friends_watch_ratio,
    )
    for step in feature_steps:
        df = step(df)
    return df.drop(columns=["friend_list"])

def get_columns_to_remove(df: pd.DataFrame, threshold: float = 0.1, target: str = "watch_ratio") -> list[str]:
    """List the columns whose correlation with ``target`` is weaker than ``threshold``.

    ``user_id``, ``video_id`` and the target itself are never listed, and no
    correlation is computed for them. A column whose correlation is NaN
    (e.g. a constant column) is not listed either, because the comparison
    with the threshold is False.

    Args:
        df: frame containing ``target`` and the candidate columns.
        threshold: columns with ``abs(corr) < threshold`` are returned.
        target: name of the column the candidates are correlated against.

    Returns:
        Column names to drop, in the order they appear in ``df``.
    """
    protected = {"user_id", "video_id", target}
    target_values = df[target]
    return [
        col
        for col in df.columns
        # Check membership first so no correlation is computed for protected columns.
        if col not in protected and abs(target_values.corr(df[col])) < threshold
    ]

def keep_high_correlated_features(df: pd.DataFrame, cols_to_remove: list[str]) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Drop the given columns and compute the correlation matrix of what is left.

    The list of dropped columns is printed before dropping.

    Args:
        df: frame to reduce.
        cols_to_remove: names of the columns to drop.

    Returns:
        ``(reduced_frame, correlation_matrix)`` where the matrix is
        ``reduced_frame.corr()``.
    """
    print(f"Removing columns:\n{cols_to_remove}")
    reduced = df.drop(columns=cols_to_remove)
    correlations = reduced.corr()
    return reduced, correlations


# Build the engineered features. Each split is processed on its own, so the
# running averages of the test split are computed from test rows only.
train_data = generate_features(train_data)
test_data = generate_features(test_data)

# Weakly correlated columns are selected on the training split only ...
cols_to_remove = get_columns_to_remove(train_data)

# ... and the same selection is then dropped from both splits.
train_data, train_corr_matrix = keep_high_correlated_features(train_data, cols_to_remove)
test_data, test_corr_matrix = keep_high_correlated_features(test_data, cols_to_remove)


# Display the correlation matrix of the training split as the cell output.
train_corr_matrix

Removing columns:
['timestamp', 'first_level_category_id']
Removing columns:
['timestamp', 'first_level_category_id']


Unnamed: 0,user_id,video_id,video_duration,watch_ratio,feat_12,show_cnt,valid_play_cnt,follow_cnt,cancel_follow_cnt,user_cum_avg_watch_ratio,video_cum_avg_watch_ratio,cumulative_avg_watch_ratio,friend_watch_ratio
user_id,1.0,0.000562,-0.001526,0.002031,0.00026,0.001105,0.001098,0.000458,2.1e-05,0.013342,0.002095,0.008114,0.002035
video_id,0.000562,1.0,-0.009821,0.006368,0.024002,0.029717,0.020589,0.042038,0.031149,-0.002094,0.01544,0.002968,0.014855
video_duration,-0.001526,-0.009821,1.0,-0.267192,0.232497,0.194772,0.175116,0.283237,0.329867,-0.055172,-0.589097,-0.237551,-0.569473
watch_ratio,0.002031,0.006368,-0.267192,1.0,-0.100013,-0.110675,-0.100796,-0.111318,-0.108867,0.25768,0.452108,0.420108,0.439766
feat_12,0.00026,0.024002,0.232497,-0.100013,1.0,0.044266,0.044647,0.013601,0.036248,-0.031792,-0.221864,-0.199992,-0.214593
show_cnt,0.001105,0.029717,0.194772,-0.110675,0.044266,1.0,0.984116,0.64831,0.547975,-0.032526,-0.237248,-0.088248,-0.22941
valid_play_cnt,0.001098,0.020589,0.175116,-0.100796,0.044647,0.984116,1.0,0.592763,0.504342,-0.030662,-0.214703,-0.080609,-0.207675
follow_cnt,0.000458,0.042038,0.283237,-0.111318,0.013601,0.64831,0.592763,1.0,0.605649,-0.021757,-0.243612,-0.10455,-0.235401
cancel_follow_cnt,2.1e-05,0.031149,0.329867,-0.108867,0.036248,0.547975,0.504342,0.605649,1.0,-0.021082,-0.237053,-0.100289,-0.229153
user_cum_avg_watch_ratio,0.013342,-0.002094,-0.055172,0.25768,-0.031792,-0.032526,-0.030662,-0.021757,-0.021082,1.0,0.087147,0.640139,0.09007


In [7]:
# Remove every row that still has a missing value in any column.
# NOTE(review): which columns can still contain NaN at this point is not visible here -- verify how many rows this drops.
train_data = train_data.dropna()
test_data = test_data.dropna()

In [8]:
# Write the feature-engineered splits next to the input exports.
export_dir = "./exports"
# exist_ok=True creates the directory only when needed, without a separate
# existence check that could race with another process.
os.makedirs(export_dir, exist_ok=True)
train_data.to_parquet(os.path.join(export_dir, "train_data_feature_engineered.pq"))
test_data.to_parquet(os.path.join(export_dir, "test_data_feature_engineered.pq"))
# Display the exported training frame as the cell output.
train_data

Unnamed: 0,user_id,video_id,video_duration,watch_ratio,feat_12,show_cnt,valid_play_cnt,follow_cnt,cancel_follow_cnt,user_cum_avg_watch_ratio,video_cum_avg_watch_ratio,cumulative_avg_watch_ratio,friend_watch_ratio
0,0,6789,13267,0.175398,0,0.0,0.0,0.0,0.0,0.648667,0.615807,0.175398,0.615807
1,0,3657,10734,0.078070,0,0.0,0.0,0.0,0.0,0.958032,0.754661,0.126734,0.754661
2,0,9591,11312,0.077263,0,0.0,0.0,0.0,0.0,0.895423,0.686390,0.110243,0.686390
3,0,5233,10751,1.340061,0,0.0,0.0,0.0,0.0,0.950207,0.749486,0.417698,0.749486
4,0,6843,7700,0.110519,0,0.0,0.0,0.0,0.0,0.924762,0.383420,0.356262,0.383420
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9784939,7175,4518,13351,0.400494,0,19846697.0,16002015.0,38513.0,3.0,0.789764,0.673301,0.319926,0.673301
9784940,7175,9775,52440,1.167086,0,430171.0,302905.0,1483.0,0.0,0.806899,0.190515,1.167086,0.190515
9784941,7175,883,131684,0.029183,0,0.0,0.0,0.0,0.0,0.812518,0.100969,0.598135,0.100969
9784942,7175,3929,9443,0.429313,0,642530.0,407165.0,1189.0,0.0,0.809559,0.892350,0.541861,0.892350
