In [None]:
# Render matplotlib figures inline in the notebook output
%matplotlib inline
# Load (or reload) the autoreload extension and use mode 2, so that edits to
# imported modules are picked up without restarting the kernel
%reload_ext autoreload
%autoreload 2
# Ask the inline backend for high-resolution ("retina") figures
%config InlineBackend.figure_format = 'retina'

In [None]:
import itertools
import os
import pickle
import re
import shutil
import string
import time

import feedparser
import html_text
import numpy as np
import pandas as pd
import requests
from nltk import sent_tokenize
from pandas.io.json import json_normalize

In [None]:
import sys
# Make the local `utils` directory importable so that `preprocess` can be found
sys.path.append('utils')
# NOTE(review): the star import hides where names come from. `clean_text`,
# `tokenize_sents`, `remove_duplicate_summaries`, `countries` and `genre_ids`
# are used below but never defined in this notebook — presumably they come
# from `preprocess`; confirm and import them explicitly.
from preprocess import *

In [None]:
# Show full cell contents (no truncation) and up to 300 rows when displaying frames
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    # Older pandas only accepts integers here; -1 was its "no limit" sentinel
    pd.set_option('display.max_colwidth', -1)
pd.options.display.max_rows = 300

## Data schema

Schema for `podcasts` dataframe:

| column                      | type   | description                                            |
|-----------------------------|--------|--------------------------------------------------------|
| podcast_id                  | string | Podcast ID                                             |
| im_name_label               | string | Podcast name                                           |
| im_artist_label             | string | Author name                                            |
| category_attributes_term    | string | Category                                               |
| link_attributes_href        | string | iTunes link                                            |
| country                     | string | Country code                                           |
| country_fullname            | string | Country full name                                      |
| feedurl                     | string | Feed URL                                               |
| artwork                     | string | Artwork URL                                            |
| summary_label               | list   | Summary in the original language, split into sentences |
| summary_label_en            | list   | Summary in English, split into sentences               |
| summary_label_en_cleaned    | list   | Cleaned English summary                                |
| summary_episodes_en_cleaned | list   | Tokenized summary and episodes (using spaCy)           |

Schema for `episodes` dataframe:

| column                     | type   | description                                                |
|----------------------------|--------|------------------------------------------------------------|
| episode_id                 | string | Episode ID (the concatenation of date and title)           |
| date                       | string | Publication date of the episode                            |
| link                       | string | URL of the episode's audio file                            |
| title                      | list   | Title in the original language, split into sentences       |
| title_en                   | list   | Title in English, split into sentences                     |
| title_en_cleaned           | list   | Cleaned English title                                      |
| summary                    | list   | Summary in the original language, split into sentences     |
| summary_en                 | list   | Summary in English, split into sentences                   |
| summary_en_cleaned         | list   | Cleaned English summary                                    |
| summary_en_cleaned_deduped | list   | Cleaned English summary with duplicated sentences removed |

## Pull top podcasts from iTunes

In [None]:
def pull_top_podcasts(country, genre_id, limit=50,
                      categories=('Society & Culture', 'Personal Journals'), timeout=30):
    """Fetch the top audio podcasts of one country and genre from the iTunes RSS feed.

    Twice `limit` entries are requested because the category filter discards
    some of them; at most `limit` rows are returned.

    Parameters
    ----------
    country : str
        Country code used in the iTunes URL and stored in the `country` column.
    genre_id : int or str
        iTunes genre ID used in the URL.
    limit : int, default 50
        Maximum number of podcasts to return.
    categories : iterable of str
        Values of `category_attributes_term` to keep.
    timeout : float, default 30
        Seconds to wait for the iTunes server before giving up.

    Returns
    -------
    pandas.DataFrame
        Columns `podcast_id`, `im_name_label`, `im_artist_label`, `summary_label`,
        `category_attributes_term`, `link_attributes_href` and `country`. Rows
        with any missing value are dropped. Empty when the feed has no entries.

    Raises
    ------
    requests.HTTPError
        If iTunes answers with an error status.
    """
    url = 'https://itunes.apple.com/{}/rss/topaudiopodcasts/genre={}/limit={}/explicit=true/json'.format(country, genre_id, limit*2)
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    columns = ['id_attributes_im_id', 'im_name_label', 'im_artist_label', 'summary_label', 'category_attributes_term', 'link_attributes_href']

    # A chart without any podcast has no `entry` key at all
    entries = response.json().get('feed', {}).get('entry')
    if not entries:
        return pd.DataFrame(columns=['podcast_id'] + columns[1:] + ['country'])

    # Read json into a dataframe
    podcasts = json_normalize(entries)
    podcasts.columns = [re.sub('[^a-z0-9]', '_', col.lower()) for col in podcasts]

    # A field that no entry provides yields no column; add it as missing so the
    # selection below cannot fail (such rows are removed by `dropna` anyway)
    for col in columns:
        if col not in podcasts.columns:
            podcasts[col] = np.nan

    # Build the result as a chain of new frames instead of mutating a filtered slice
    podcasts = (
        podcasts[columns]
        .loc[lambda df: df['category_attributes_term'].isin(list(categories))]
        .rename(columns={'id_attributes_im_id': 'podcast_id'})
        .assign(country=country)
        .dropna()
    )

    return podcasts.head(limit)

In [None]:
# One frame per (country, genre) pair, countries in the outer position
podcasts = [
    pull_top_podcasts(country, genre_id)
    for country, genre_id in itertools.product(countries, genre_ids)
]

In [None]:
# Stack the per-country/genre frames and renumber the rows from 0
podcasts = pd.concat(podcasts, ignore_index=True)
podcasts.shape

In [None]:
# Keep only podcasts whose stripped summary is longer than one character
has_summary = podcasts['summary_label'].str.strip().str.len().gt(1)
podcasts = podcasts.loc[has_summary].copy()
podcasts.shape

In [None]:
# Translate country codes into full names through the `countries` mapping
podcasts = podcasts.assign(country_fullname=podcasts['country'].map(countries))
podcasts['country_fullname'].value_counts()

## Look up `feedUrl`

In [None]:
# Extract `feedUrl`
def extract_feedurl(podcast_id):
    """Look up a podcast's feed URL and artwork URL through the iTunes lookup API.

    Parameters
    ----------
    podcast_id : str
        iTunes podcast ID.

    Returns
    -------
    tuple
        `(feedurl, artwork)`, or `(None, None)` when the lookup has no result,
        no `feedUrl`, or no `artworkUrl*` key.
    """
    response = requests.get('https://itunes.apple.com/lookup', params={'id': podcast_id}, timeout=30)
    podcast = response.json()

    # Only the "expected field is absent" cases are treated as "not found";
    # anything else (including KeyboardInterrupt) propagates
    try:
        result = podcast['results'][0]
        feedurl = result['feedUrl']
    except (KeyError, IndexError, TypeError):
        return None, None

    artwork_keys = [key for key in result if key.startswith('artworkUrl')]
    if not artwork_keys:
        return None, None

    def _artwork_size(key):
        # Keys look like `artworkUrl600`; the numeric suffix is the image size
        suffix = key[len('artworkUrl'):]
        return int(suffix) if suffix.isdecimal() else -1

    # Pick the largest artwork explicitly instead of relying on JSON key order
    artwork = result[max(artwork_keys, key=_artwork_size)]
    return feedurl, artwork

In [None]:
# The same podcast can be listed under several countries, so query iTunes once
# per distinct ID and reuse the answer for every row
lookup_by_id = {podcast_id: extract_feedurl(podcast_id) for podcast_id in podcasts['podcast_id'].unique()}
podcasts['feedurl'] = podcasts['podcast_id'].map(lambda podcast_id: lookup_by_id[podcast_id][0])
podcasts['artwork'] = podcasts['podcast_id'].map(lambda podcast_id: lookup_by_id[podcast_id][1])

In [None]:
# Drop podcasts with no `feedUrl` (any row with a missing value) and renumber the rows
podcasts = podcasts.dropna().reset_index(drop=True)
podcasts.shape

In [None]:
# Map each podcast ID to its `feedUrl` for easy processing later.
# A podcast listed under several countries collapses into a single entry (which is fine)
podcast_id_to_feedurl = dict(
    podcasts[['podcast_id', 'feedurl']].itertuples(index=False, name=None)
)
len(podcast_id_to_feedurl)

## Pull episodes from feeds

In [None]:
def _extract_episode_fields(episode):
    """Collect the date, title, summary and audio link of a single feed entry.

    Returns a dict holding only the fields that were found, inserted in the
    order date, title, summary, link.
    """
    fields = {}

    for field, source in (('date', 'published'), ('title', 'title')):
        if source in episode and len(episode[source]) > 0:
            fields[field] = episode[source]

    # Prefer the `content` body and fall back to `summary_detail`
    if 'content' in episode and len(episode['content']) > 0:
        fields['summary'] = episode['content'][0]['value']
    elif 'summary_detail' in episode and len(episode['summary_detail']) > 0:
        fields['summary'] = episode['summary_detail']['value']

    # Keep the href of the link whose type mentions audio; if several match, the last one wins
    if 'links' in episode:
        for link in episode['links']:
            if 'href' in link and 'type' in link and 'audio' in link['type']:
                fields['link'] = link['href']

    return fields


def parse_feed(url, n_episodes=40):
    """Parse a podcast feed and return its most recent episodes.

    Parameters
    ----------
    url : str
        Feed URL handed to `feedparser.parse`.
    n_episodes : int, default 40
        Maximum number of episodes to keep, most recent first.

    Returns
    -------
    pandas.DataFrame or None
        Columns `date` (UTC timestamp), `title`, `summary` and `link`, sorted
        by date in descending order. `None` when the feed has no entry
        carrying all four fields and a parseable date.
    """
    feed = feedparser.parse(url)
    required_fields = ('date', 'title', 'summary', 'link')

    # Do not keep an episode if any of the needed attributes is missing
    episodes = [
        fields
        for fields in map(_extract_episode_fields, feed['entries'])
        if len(fields) == len(required_fields)
    ]
    if not episodes:
        return None

    # Concatenate into a dataframe
    episodes = pd.DataFrame(episodes)

    # Parse each date on its own so that one malformed value cannot fail the
    # whole feed; unparseable dates become NaT and those episodes are dropped
    episodes['date'] = pd.to_datetime(
        episodes['date'].map(lambda value: pd.to_datetime(value, utc=True, errors='coerce')),
        utc=True,
    )
    episodes = episodes.dropna(subset=['date'])
    if episodes.empty:
        return None

    # Sort by date and pick the most recent N episodes
    return episodes.sort_values('date', ascending=False).head(n_episodes).reset_index(drop=True)

In [None]:
# Parse every feed, allowing up to three attempts with a 5-second pause before each retry
podcast_id_to_episodes = {}
for podcast_id, feedurl in podcast_id_to_feedurl.items():
    for attempt in range(3):
        if attempt > 0:
            time.sleep(5)
        episodes = parse_feed(feedurl)
        if episodes is not None:
            podcast_id_to_episodes[podcast_id] = episodes
            break

In [None]:
# Number of podcasts for which episodes could be parsed
len(podcast_id_to_episodes)

In [None]:
# List the podcasts that yielded no usable episodes (ID, name, feed URL)
excluded_ids = [pid for pid in podcast_id_to_feedurl if pid not in podcast_id_to_episodes]
for pid in excluded_ids:
    name = podcasts.loc[podcasts['podcast_id'] == pid, 'im_name_label'].iloc[0]
    print(pid, name, podcast_id_to_feedurl[pid])

In [None]:
# Keep only the podcasts that have parsed episodes
has_episodes = podcasts['podcast_id'].isin(podcast_id_to_episodes)
podcasts = podcasts.loc[has_episodes].copy()
podcasts.shape

In [None]:
# Save intermediary data
podcast_data = {
    'podcasts': podcasts,
    'podcast_id_to_episodes': podcast_id_to_episodes
}

# Use a context manager so the file is flushed and closed before it is read back
with open('data/podcast_data.pkl', 'wb') as f:
    pickle.dump(podcast_data, f)

## Clean text data

In [None]:
# Reload the intermediary data written above.
# NOTE: unpickling executes arbitrary code — only load files produced by this notebook.
with open('data/podcast_data.pkl', 'rb') as f:
    podcast_data = pickle.load(f)
podcasts, podcast_id_to_episodes = podcast_data['podcasts'], podcast_data['podcast_id_to_episodes']
podcasts.shape, len(podcast_id_to_episodes)

In [None]:
# Run `clean_text` over every podcast summary, normalizing Chinese punctuation and URLs
podcasts['summary_label'] = podcasts['summary_label'].apply(
    lambda text: clean_text(text, normalize_cn_punct=True, normalize_url=True)
)

In [None]:
# Apply the same cleaning to the title and the summary of every episode
clean_kwargs = {'normalize_cn_punct': True, 'normalize_url': True}
for episodes in podcast_id_to_episodes.values():
    for column in ('title', 'summary'):
        episodes[column] = episodes[column].apply(clean_text, **clean_kwargs)

## Segment sentences

In [None]:
# Replace each podcast summary with the result of `tokenize_sents`
podcasts = podcasts.assign(summary_label=podcasts['summary_label'].apply(tokenize_sents))

In [None]:
# Segment sentences for episodes
for episodes in podcast_id_to_episodes.values():
    # Titles are tokenized first, so the call below receives the tokenized titles
    episodes['title'] = episodes['title'].apply(tokenize_sents)
    # NOTE(review): summaries are passed as cleaned but not yet tokenized values;
    # presumably `remove_duplicate_summaries` segments them and drops repeated
    # sentences — confirm against `preprocess`.
    episodes['summary'] = remove_duplicate_summaries(episodes['summary'].tolist(), episodes['title'].tolist())

## Remove podcasts with too little description

In [None]:
# Average number of characters per episode
def count_characters(episodes):
    """Return the mean length of title plus summary over the episodes of one podcast.

    Each `title` and `summary` value is joined with single spaces before its
    length is measured. Returns 0 when `episodes` has no rows.
    """
    n_episodes = len(episodes)
    if n_episodes == 0:
        return 0

    total_chars = sum(
        episodes[column].str.join(' ').str.len().sum()
        for column in ('title', 'summary')
    )
    return total_chars / n_episodes

In [None]:
# Average characters per episode for every podcast, then the 0%–90% quantiles
n_chars_per_podcast = pd.Series(
    [count_characters(podcast_episodes) for podcast_episodes in podcast_id_to_episodes.values()],
    index=podcast_id_to_episodes.keys(),
)
quantile_levels = np.arange(0, 1, .1)
n_chars_per_podcast.quantile(quantile_levels)

In [None]:
# Keep the podcasts whose average episode text reaches the cutoff
n_chars_cutoff = 100
is_long_enough = n_chars_per_podcast >= n_chars_cutoff
podcast_id_to_keep = n_chars_per_podcast.loc[is_long_enough].index
len(podcast_id_to_keep)

In [None]:
# Restrict the podcast table to the kept IDs and renumber the rows
is_kept = podcasts['podcast_id'].isin(podcast_id_to_keep)
podcasts = podcasts.loc[is_kept].reset_index(drop=True)
podcasts.shape

In [None]:
# Number of podcasts left per country after the length filter
podcasts['country_fullname'].value_counts()

In [None]:
# Drop the episode tables of podcasts that did not pass the length filter
podcast_id_to_episodes = {
    podcast_id: podcast_episodes
    for podcast_id, podcast_episodes in podcast_id_to_episodes.items()
    if podcast_id in podcast_id_to_keep
}
len(podcast_id_to_episodes)

## Create `episode_id`

In [None]:
# Episode ID = calendar date + '-' + the title sentences concatenated without a separator
for episodes in podcast_id_to_episodes.values():
    date_part = episodes['date'].dt.date.astype(str)
    title_part = episodes['title'].str.join('')
    episodes['episode_id'] = date_part + '-' + title_part

In [None]:
# Back up the previous processed data, if there is any, before it gets overwritten.
# Pure Python instead of `!cp`, so it also works on Windows and on a first run.
if os.path.exists('data/podcast_data_proc.pkl'):
    shutil.copyfile('data/podcast_data_proc.pkl', 'data/podcast_data_proc-OLD.pkl')

In [None]:
# Save
podcast_data = {
    'podcasts': podcasts,
    'podcast_id_to_episodes': podcast_id_to_episodes
}

# Use a context manager so the file is flushed and closed deterministically
with open('data/podcast_data_proc.pkl', 'wb') as f:
    pickle.dump(podcast_data, f)