# Imports

In [None]:
import pandas as pd
import os

# Loading the data

In [None]:
# Path string pointing at the KuaiRec 2.0 data folder (not referenced in
# the visible cells).
root = "data_final_project/KuaiRec 2.0/data/"

# Read the two parquet splits. Unpacking drives the lazy ``map``, so the
# train file is read first and the test file second.
train_data, test_data = map(
    pd.read_parquet,
    ("./exports/train_data.pq", "./exports/test_data.pq"),
)

In [None]:
def clean_and_format(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of an interaction table.

    The ``timestamp`` column is parsed with ``pd.to_datetime`` and every
    ``friend_list`` entry is converted to a list of ``int``. The
    ``user_active_degree`` column is removed when it is present.

    The conversions are applied to the frame returned by ``drop`` rather
    than to ``df`` itself, so the caller's frame is not modified.

    Args:
        df: Frame with ``timestamp`` and ``friend_list`` columns; each
            ``friend_list`` entry must be an iterable of values accepted
            by ``int``.

    Returns:
        A new frame with the converted columns and without
        ``user_active_degree``.
    """
    # TODO: drop ``user_active_degree`` in the 01 step instead of here.
    # ``errors="ignore"`` keeps this function working once that is done
    # (and when it is re-applied to an already cleaned frame).
    cleaned = df.drop(columns=["user_active_degree"], errors="ignore")
    cleaned["timestamp"] = pd.to_datetime(cleaned["timestamp"])
    cleaned["friend_list"] = cleaned["friend_list"].apply(
        lambda friends: [int(friend) for friend in friends]
    )
    return cleaned

# Run both splits through the same cleaning routine, one after the other
# so each old frame is released as soon as its name is rebound.
train_data = train_data.pipe(clean_and_format)
test_data = test_data.pipe(clean_and_format)

In [None]:
# These columns are dropped this early only because the machine runs out
# of RAM otherwise.
usefull_cols_for_computations = [
    "user_id",
    "video_id",
    "timestamp",
    "friend_list",
    "watch_ratio",
    "first_level_category_id",
]
# A column is discarded when it is not in the list above and its absolute
# correlation with watch_ratio is below 0.1. The selection is computed on
# the training split and then applied to both splits.
to_remove = [
    col
    for col in train_data.columns
    if col not in usefull_cols_for_computations
    and abs(train_data["watch_ratio"].corr(train_data[col])) < 0.1
]
train_data = train_data.drop(to_remove, axis=1)
test_data = test_data.drop(to_remove, axis=1)

In [None]:
def friends_watch_ratio(df):
    """Add ``friend_watch_ratio``: the friends' mean watch ratio per interaction.

    For every row, the ids in ``friend_list`` are looked up as ``user_id``
    in the same frame. Only friend interactions with the same ``video_id``
    and a timestamp not later than the row's own timestamp are averaged.
    Rows without any such friend interaction fall back to the row's
    ``video_cum_avg_watch_ratio``.

    The average is keyed on ``(user_id, video_id, timestamp)``. Keying on
    ``(user_id, video_id)`` alone would hand an earlier interaction the
    friend watches that only became visible to a later re-watch of the
    same video.

    Args:
        df: Frame with ``user_id``, ``video_id``, ``timestamp``,
            ``friend_list``, ``watch_ratio`` and
            ``video_cum_avg_watch_ratio`` columns.

    Returns:
        A new frame (row order of ``df``, fresh index from the merge) with
        the extra ``friend_watch_ratio`` column.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    interaction_keys = ['user_id', 'video_id', 'timestamp']

    # One row per (interaction, friend).
    exploded = (
        df[interaction_keys + ['friend_list']]
        .explode('friend_list')
        .rename(columns={'friend_list': 'friend_id'})
    )
    # The same interactions, seen from the friend's side.
    friend_watches = df[['user_id', 'video_id', 'watch_ratio', 'timestamp']].rename(
        columns={
            'user_id': 'friend_id',
            'watch_ratio': 'friend_watch_ratio',
            'timestamp': 'friend_timestamp',
        }
    )
    # Unmatched pairs would be discarded by the timestamp filter below
    # anyway (a missing friend_timestamp never compares <=), so an inner
    # join gives the same result without materialising them.
    merged = exploded.merge(friend_watches, on=['friend_id', 'video_id'], how='inner')
    merged = merged[merged['friend_timestamp'] <= merged['timestamp']]

    mean_ratios = (
        merged.groupby(interaction_keys)['friend_watch_ratio'].mean().reset_index()
    )
    df = df.merge(mean_ratios, on=interaction_keys, how='left')
    df['friend_watch_ratio'] = df['friend_watch_ratio'].fillna(df['video_cum_avg_watch_ratio'])
    return df

def _add_expanding_group_mean(df, group_cols, out_col):
    """Sort by ``group_cols`` + ``timestamp`` and add a running mean of ``watch_ratio``.

    Args:
        df: Frame with ``watch_ratio``, ``timestamp`` and every column
            named in ``group_cols``.
        group_cols: List of column names that define a group.
        out_col: Name of the column that receives the running mean.

    Returns:
        A sorted copy of ``df`` (original index labels kept) with
        ``out_col`` added. For each row the value is the mean of
        ``watch_ratio`` over the rows of its group up to and including
        the row itself.
    """
    df = df.sort_values([*group_cols, 'timestamp'])
    running_mean = df.groupby(group_cols)['watch_ratio'].expanding().mean()
    # The expanding result is indexed by (group keys..., original index).
    # Strip the leading group levels so the assignment aligns on the
    # original index labels.
    group_levels = list(range(len(group_cols)))
    df[out_col] = running_mean.reset_index(level=group_levels, drop=True)
    return df


# NOTE(review): all three features below include the current row's own
# watch_ratio in the mean. If watch_ratio is the prediction target, these
# columns leak it (the first row of a group equals the target) -- confirm
# whether a shifted, "previous rows only" mean is wanted.


def mean_by_first_level_cat(df):
    """Add ``cumulative_avg_watch_ratio`` per (user, first-level category).

    Returns a copy of ``df`` sorted by ``user_id``,
    ``first_level_category_id`` and ``timestamp`` in which each row holds
    the running mean of ``watch_ratio`` within its user/category group,
    current row included.
    """
    return _add_expanding_group_mean(
        df, ['user_id', 'first_level_category_id'], 'cumulative_avg_watch_ratio'
    )


def add_video_cumulative_avg_watch_ratio(df):
    """Add ``video_cum_avg_watch_ratio`` per video.

    Returns a copy of ``df`` sorted by ``video_id`` and ``timestamp`` in
    which each row holds the running mean of ``watch_ratio`` over the
    video's rows, current row included.
    """
    return _add_expanding_group_mean(df, ['video_id'], 'video_cum_avg_watch_ratio')


def add_user_cumulative_avg_watch_ratio(df):
    """Add ``user_cum_avg_watch_ratio`` per user.

    Returns a copy of ``df`` sorted by ``user_id`` and ``timestamp`` in
    which each row holds the running mean of ``watch_ratio`` over the
    user's rows, current row included.
    """
    return _add_expanding_group_mean(df, ['user_id'], 'user_cum_avg_watch_ratio')

In [None]:
def generate_features(df: pd.DataFrame) -> pd.DataFrame:
    """Run every feature step on ``df`` and drop ``friend_list`` afterwards.

    The steps run in a fixed order: user running mean, video running mean,
    per-category running mean, then the friends' watch ratio. The friends
    step comes last because it reads ``video_cum_avg_watch_ratio``, which
    the video step creates.

    Args:
        df: Interaction frame accepted by each of the feature steps.

    Returns:
        The frame produced by the last step, without ``friend_list``.
    """
    feature_steps = (
        add_user_cumulative_avg_watch_ratio,
        add_video_cumulative_avg_watch_ratio,
        mean_by_first_level_cat,
        friends_watch_ratio,
    )
    for step in feature_steps:
        df = step(df)
    return df.drop(columns=["friend_list"])

def get_columns_to_remove(df: pd.DataFrame, threshold: float = 0.1) -> list[str]:
    """List the columns that carry too little linear signal about ``watch_ratio``.

    A column is listed when the absolute value of its correlation with
    ``watch_ratio`` (as computed by ``Series.corr``) is below ``threshold``,
    or when that correlation is undefined (NaN), which is what a constant
    or all-null column produces. ``user_id`` and ``watch_ratio`` itself are
    never listed.

    Args:
        df: Frame that contains a ``watch_ratio`` column.
        threshold: Minimum absolute correlation a column needs to be kept.

    Returns:
        Column names to drop, in the column order of ``df``.
    """
    protected = {"user_id", "watch_ratio"}
    target = df["watch_ratio"]
    to_remove = []
    for col in df.columns:
        # Checked first so no correlation is computed for protected columns.
        if col in protected:
            continue
        corr = target.corr(df[col])
        # ``abs(nan) < threshold`` is False, so NaN needs its own test;
        # otherwise constant/all-null columns would silently survive.
        if pd.isna(corr) or abs(corr) < threshold:
            to_remove.append(col)
    return to_remove

def keep_high_correlated_features(df: pd.DataFrame, cols_to_remove: list[str]) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Drop ``cols_to_remove`` and return the reduced frame with its correlation matrix.

    The list of columns is printed before anything is dropped.

    Args:
        df: Frame to reduce; it is not modified.
        cols_to_remove: Names of the columns to drop; each must exist in
            ``df``.

    Returns:
        ``(reduced, corr)`` where ``reduced`` is ``df`` without the listed
        columns and ``corr`` is ``reduced.corr()``.
    """
    print(f"Removing columns:\n{cols_to_remove}")
    reduced = df.drop(columns=cols_to_remove)
    correlation_matrix = reduced.corr()
    return reduced, correlation_matrix


# Build the engineered features for each split independently.
train_data = train_data.pipe(generate_features)
test_data = test_data.pipe(generate_features)

# The columns to discard are chosen on the training split only, and the
# same selection is then applied to both splits.
cols_to_remove = get_columns_to_remove(train_data)

train_data, train_corr_matrix = train_data.pipe(keep_high_correlated_features, cols_to_remove)
test_data, test_corr_matrix = test_data.pipe(keep_high_correlated_features, cols_to_remove)


# Bare expression: shown as the cell output when run in a notebook.
train_corr_matrix

In [None]:
# Discard every row that still holds a missing value in any column.
train_data = train_data.dropna(axis=0, how="any")
test_data = test_data.dropna(axis=0, how="any")

In [None]:
# Write the engineered splits to parquet files in the export directory.
export_dir = "./exports"
# ``exist_ok=True`` creates the directory only when needed, without the
# check-then-create race of a separate ``os.path.exists`` test.
os.makedirs(export_dir, exist_ok=True)
train_data.to_parquet(os.path.join(export_dir, "train_data_feature_engineered.pq"))
test_data.to_parquet(os.path.join(export_dir, "test_data_feature_engineered.pq"))