In [3]:
import os
import sys

import pandas as pd

sys.path.append(os.path.abspath(".."))

from config import (
    TRANSFORMED_DATA_PATH,
)

In [9]:
# option to view entire pandas outputs 
# pd.set_option('display.max_rows', None)

# line of code to cancel the above display setting
# pd.reset_option('all')

### Main Exploratory Data Analysis (EDA)

In [7]:
# read the transformed dataset; the configured path is resolved one directory up
transformed_data_file = f"../{TRANSFORMED_DATA_PATH}"
transformed_df = pd.read_parquet(transformed_data_file)

# show which columns are available for the analysis
transformed_df.columns

Index(['user_id', 'prd_number', 'series_title', 'unique_title', 'platform',
       'device_type', 'pub_date', 'episode_duration', 'genre',
       'branding_channel', 'mother_channel', 'category', 'content_time_spent',
       'date', 'time', 'completion_rate'],
      dtype='object')

In [8]:
# distinct users (dropna=False: a missing id would count as one value)
n_users = transformed_df["user_id"].nunique(dropna=False)
print(f"Number of unique users: {n_users}")

# distinct shows, identified by series title
n_shows = transformed_df["series_title"].nunique(dropna=False)
print(f"Number of unique shows: {n_shows}")

# distinct episodes, identified by prd_number
n_episodes = transformed_df["prd_number"].nunique(dropna=False)
print(f"Number of unique episodes: {n_episodes}")

Number of unique users: 142344
Number of unique shows: 538
Number of unique episodes: 11918


### Threshold for number of plays per episode
Checking how many unique episodes and interactions will be filtered away for different thresholds.

NB! The results below cannot be reproduced, since the analysis was performed before the filter was added to `01_filter.py`.

In [None]:
# read the filtered interaction data; each row is one interaction
df = pd.read_parquet('../data/podcast_data_filtered.parquet')
n_interactions = df.shape[0]

# plays per episode: non-null user_id entries for each prd_number, least-played first
prd_grp_df = df.groupby('prd_number')['user_id'].count().sort_values(ascending=True)
print(prd_grp_df.head(10))

# NOTE(review): this rebinds n_episodes, which the unique-episodes cell above
# computed from transformed_df — confirm no later cell expects the earlier value
n_episodes = prd_grp_df.shape[0]
print(f"Number of unique episodes: {n_episodes}")

# candidate minimum number of plays an episode needs in order to be kept
thresholds = [5, 10, 20, 50]

# share of episodes / interactions that survive each threshold
episode_shares = []
interaction_shares = []
for min_plays in thresholds:
    # episodes whose play count reaches the threshold
    kept_episodes = prd_grp_df[prd_grp_df.ge(min_plays)].index

    # boolean row mask instead of a filtered copy of the frame
    kept_rows = df['prd_number'].isin(kept_episodes)

    episode_shares.append(
        df.loc[kept_rows, 'prd_number'].nunique(dropna=False) / n_episodes
    )
    interaction_shares.append(int(kept_rows.sum()) / n_interactions)

# one row per threshold with the retained fractions
episode_threshold_data = {"threshold": thresholds,
                          "episode%": episode_shares,
                          "interaction%": interaction_shares,
                          }
episode_threshold_df = pd.DataFrame(episode_threshold_data)
print(episode_threshold_df)

prd_number
19388840413    1
13332195418    1
14202412454    1
14202412456    1
14202410465    1
13332195424    1
13332195425    1
13332195426    1
13332195427    1
14202410481    1
Name: user_id, dtype: int64
Number of unique episodes: 22596
   threshold  episode%  interaction%
0          5  0.648212      0.995035
1         10  0.527438      0.988822
2         20  0.382767      0.973514
3         50  0.210878      0.932505


It might be reasonable to filter out episodes with fewer than 10 plays: this keeps more than half of the episodes (52.7%) in the training data while still retaining 98.9% of the interactions.