In [29]:
import pandas as pd
from scipy.sparse import coo_matrix

MIN_PLAYS_PER_EPISODE = 10
SPLIT_DATE = "2024-11-11"
MIN_PLAYS_PER_USER = 2
TRANSFORMED_DATA_PATH = "..\data\podcast_data_transformed.parquet"
COLUMNS_TO_KEEP = ["user_id", "prd_number", "completion_rate"]

In [22]:
# loading the transformed data
transformed_df = pd.read_parquet(TRANSFORMED_DATA_PATH)

# grouping by prd_number and counting the number of plays for each episode
prd_grp_df = transformed_df.groupby('prd_number')['user_id'].count()

# filtering away episodes below threshold fo number of plays per episode
filtered_df = transformed_df[transformed_df['prd_number'].isin(prd_grp_df[prd_grp_df >= MIN_PLAYS_PER_EPISODE].index)]

# applying the global user split
int_train_df = filtered_df[filtered_df['date'] < SPLIT_DATE]
int_test_df = filtered_df[filtered_df['date'] >= SPLIT_DATE]

# number of unique users both in the intermediary train and test data
common_users = set(int_train_df['user_id']).intersection(set(int_test_df['user_id']))

# filter df according to the common users
train_df_common = int_train_df[int_train_df['user_id'].isin(common_users)]
test_df_common = int_test_df[int_test_df['user_id'].isin(common_users)]

# grouping by user_id and counting the number of prd_numbers for each user in the train data
df_grouped_train = train_df_common.groupby('user_id')['prd_number'].count()

# filtering away users below threshold for number of plays per user
train_df = train_df_common[train_df_common['user_id'].isin(df_grouped_train[df_grouped_train >= MIN_PLAYS_PER_USER].index)]
test_df = test_df_common[test_df_common['user_id'].isin(df_grouped_train[df_grouped_train >= MIN_PLAYS_PER_USER].index)]

# only keeping relevant columns
train_df = train_df[COLUMNS_TO_KEEP]
test_df = test_df[COLUMNS_TO_KEEP]

In [None]:
print(train_df)

: 

In [7]:
print(test_df)

                                                   user_id   prd_number  \
27       00018423377c14104870be7deda65daaa9dbc7b97f89a8...  11032421463   
28       00018423377c14104870be7deda65daaa9dbc7b97f89a8...  11042408466   
29       00018423377c14104870be7deda65daaa9dbc7b97f89a8...  11162220516   
36       00018423377c14104870be7deda65daaa9dbc7b97f89a8...  11802401187   
82       00051e4d15267dd420ebac00a1382db78d74e027c55d7f...  11032462201   
...                                                    ...          ...   
2984815  fffe6e1be686be369dfe0943282e377f7d1a51c0eae35a...  13332496441   
2984841  fffe6e1be686be369dfe0943282e377f7d1a51c0eae35a...  13652450213   
2984851  fffe6e1be686be369dfe0943282e377f7d1a51c0eae35a...  15452448047   
2984857  fffe6e1be686be369dfe0943282e377f7d1a51c0eae35a...  16122413462   
2984859  fffe6e1be686be369dfe0943282e377f7d1a51c0eae35a...  16122415486   

                       series_title  \
27                 Brinkmanns briks   
28              Ditle

In [24]:
# load train data
train_df = pd.read_parquet("..\data\podcast_data_train.parquet")

# Create the matrix
user_item_matrix = train_df.pivot(index='user_id', columns='prd_number', values='completion_rate')

# Fill NaN with 0 for missing user-item pairs
user_item_matrix = user_item_matrix.fillna(0)
user_item_matrix.shape

(56969, 10631)

In [30]:
matrix_values = user_item_matrix.values
sparse_matrix = coo_matrix(matrix_values)
sparse_matrix

<COOrdinate sparse matrix of dtype 'float64'
	with 1889646 stored elements and shape (56969, 10631)>

In [None]:
print("test")

In [None]:
from lightfm import LightFM
import os
import sys
sys.path.append(os.path.abspath(".."))

import utils

: 

In [None]:
# Set the number of threads; you can increase this
# if you have more physical cores available.
NUM_COMPONENTS = 30
NUM_EPOCHS = 3
ITEM_ALPHA = 1e-6
TRAIN_DATA_PATH = "..\data\podcast_data_train.parquet"
train_df = pd.read_parquet(TRAIN_DATA_PATH)

# preparing the interaction matrix
train = utils.prep_interaction_matrix(
    df=train_df,
    user_col="user_id",
    item_col="prd_number",
    rating_col="completion_rate",
)

# Let's fit a WARP model: these generally have the best performance.
model = LightFM(loss='warp',
                item_alpha=ITEM_ALPHA,
                no_components=NUM_COMPONENTS)

# Run 3 epochs and time it.
%time model = model.fit(interactions=train, epochs=NUM_EPOCHS)

: 