In [17]:
import os
import sys

import pandas as pd

sys.path.append(os.path.abspath(".."))

from config import (
    INT_FILTERED_DATA_PATH,
    TRANSFORMED_DATA_PATH,
    MIN_PLAYS_PER_USER,
)

This notebook is used to investigate different train-test split strategies.  

NB! This notebook was written before the global user split was included in the filtering script, which is why the summary statistics are not accurate for the data currently in use.  

In [18]:
# Read both datasets; the config paths are resolved from the parent directory.
transformed_df = pd.read_parquet("../" + TRANSFORMED_DATA_PATH)
int_filtered_df = pd.read_parquet("../" + INT_FILTERED_DATA_PATH)

# Baseline totals of the interaction-filtered data: number of rows and number
# of distinct user ids. The split summaries below are reported relative to these.
n_interactions = int_filtered_df.shape[0]
n_users = int_filtered_df["user_id"].unique().size

#### Temporal User Split

This splitting strategy defines a training set and a test set for each user according to a splitting percentage; e.g., 80% training data and 20% test data is recommended in the literature.  

Investigating the consequences of implementing an 80-20 split, which implies that a user must have a total of at least 3 interactions.

In [19]:
# Minimum number of interactions a user needs to be kept for the per-user
# 80-20 split. Named once so the filter and the printed message cannot drift.
MIN_INTERACTIONS_USPLIT = 3

# `int_filtered_df`, `n_users` and `n_interactions` are reused from the loading
# cell above; re-reading the same parquet file here would only repeat the I/O.

# grouping by user_id and counting the number of (non-null) prd_numbers per user
df_grouped = (
    int_filtered_df.groupby("user_id")["prd_number"]
    .count()
    .reset_index()
    .rename(columns={"prd_number": "prd_count"})
)

# keeping only the users that reach the threshold
df_grouped = df_grouped[df_grouped["prd_count"] >= MIN_INTERACTIONS_USPLIT]

# number of users left
users_set = set(df_grouped["user_id"])
n_users_usplit = len(users_set)

# number of interactions left (rows of int_filtered_df belonging to kept users)
df_filtered = int_filtered_df[int_filtered_df["user_id"].isin(users_set)]
n_interactions_usplit = len(df_filtered)

# printing the results
print(f"Number of users with at least {MIN_INTERACTIONS_USPLIT} prd_numbers: {n_users_usplit} ({n_users_usplit/n_users:.1%} of users are kept)")
print(f"Number of interactions left: {n_interactions_usplit} ({n_interactions_usplit/n_interactions:.1%} of interactions are kept)")

Number of users with at least 3 prd_numbers: 96286 (66.8% of users are kept)
Number of interactions left: 2919846 (97.8% of interactions are kept)


#### Temporal Global Split
Defines two global timestamps that act as the boundaries between the train and validation data and between the validation and test data, for all users.

I want to make the cuts on the `date` attribute, where the selected date is a Monday. In this way entire weeks are contained in the training and test data. This might be an appropriate decision since podcast listening and publication have weekly patterns. 

Assessing the proportion between the training, validation, and test data for various split dates:

In [20]:
train_val_split_dates = ["2024-09-23 00:00:00", "2024-10-07 00:00:00", "2024-10-21 00:00:00", "2024-11-04 00:00:00"]
val_test_split_dates = ["2024-10-28 00:00:00", "2024-11-04 00:00:00", "2024-11-11 00:00:00", "2024-11-18 00:00:00"]

# Parsed copies of the cut dates, used only for validation and for the labels
# in the summary table; the original strings are still what is compared
# against `date_time` below.
train_val_ts = [pd.Timestamp(d) for d in train_val_split_dates]
val_test_ts = [pd.Timestamp(d) for d in val_test_split_dates]

# Sanity checks: the two lists are paired one-to-one, each cut falls on a Monday
# (dayofweek == 0) and the train/val cut precedes the val/test cut.
assert len(train_val_ts) == len(val_test_ts), "split date lists must have equal length"
assert all(ts.dayofweek == 0 for ts in train_val_ts + val_test_ts), "split dates must be Mondays"
assert all(t1 < t2 for t1, t2 in zip(train_val_ts, val_test_ts)), "train/val cut must precede val/test cut"

# dict to store the results; the date labels and the number of weeks between
# the two cuts are derived from the split dates so they cannot get out of sync
data = {"train_val_date": [ts.strftime("%Y-%m-%d") for ts in train_val_ts],
        "val_test_date": [ts.strftime("%Y-%m-%d") for ts in val_test_ts],
        "val_test_weeks": [(t2 - t1).days // 7 for t1, t2 in zip(train_val_ts, val_test_ts)],
        "train%": [],
        "val%": [],
        "test%": [],
        "%users": [],
        "%interactions": []}

# the timestamp column is the same for every candidate split
date_time = int_filtered_df["date_time"]

for date1, date2 in zip(train_val_split_dates, val_test_split_dates):
    # train: before date1, validation: [date1, date2), test: from date2 onwards
    train_df = int_filtered_df[date_time < date1]
    val_df = int_filtered_df[(date_time >= date1) & (date_time < date2)]
    test_df = int_filtered_df[date_time >= date2]

    # filtering away users below threshold for number of plays per user
    # (the threshold is applied to the training period only)
    grp_train_users = train_df.groupby('user_id')['prd_number'].count()
    filtered_train_df = train_df[train_df['user_id']
                                 .isin(grp_train_users[grp_train_users >= MIN_PLAYS_PER_USER].index)]

    # common users across the three sets
    train_users = set(filtered_train_df["user_id"])
    val_users = set(val_df["user_id"])
    test_users = set(test_df["user_id"])
    common_users = train_users.intersection(val_users, test_users)
    n_common_users = len(common_users)

    # filtering the three dataframes according to the common users
    train_df = train_df[train_df["user_id"].isin(common_users)]
    val_df = val_df[val_df["user_id"].isin(common_users)]
    test_df = test_df[test_df["user_id"].isin(common_users)]

    # number of interactions in each of the three sets
    train_interactions = len(train_df)
    val_interactions = len(val_df)
    test_interactions = len(test_df)
    total_interactions = train_interactions + val_interactions + test_interactions

    # proportions; if no user is present in all three sets the shares are
    # undefined, so report NaN instead of failing with a ZeroDivisionError
    if total_interactions:
        train_perc = train_interactions / total_interactions * 100
        val_perc = val_interactions / total_interactions * 100
        test_perc = test_interactions / total_interactions * 100
    else:
        train_perc = val_perc = test_perc = float("nan")
    user_perc = n_common_users / n_users * 100
    int_perc = total_interactions / n_interactions * 100

    # saving to data dict
    data["train%"].append(train_perc)
    data["val%"].append(val_perc)
    data["test%"].append(test_perc)
    data["%users"].append(user_perc)
    data["%interactions"].append(int_perc)

df = pd.DataFrame(data)
print(df)

  train_val_date val_test_date  val_test_weeks     train%       val%  \
0     2024-09-23    2024-10-28               5  24.612396  38.272929   
1     2024-10-07    2024-11-04               4  38.615077  31.152292   
2     2024-10-21    2024-11-11               3  51.918967  24.572539   
3     2024-11-04    2024-11-18               2  66.724009  16.707840   

       test%     %users  %interactions  
0  37.114675  27.823714      75.685220  
1  30.232631  30.400100      78.821276  
2  23.508494  30.552142      79.256457  
3  16.568151  28.156957      76.462709  
