In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Reload imported modules automatically before each cell runs, so edits to local modules are picked up.
%reload_ext autoreload
%autoreload 2
# Ask the inline backend for high-resolution ("retina") figures.
%config InlineBackend.figure_format = 'retina'

In [None]:
import numpy as np
import pandas as pd
import pickle
import itertools
import spacy
from collections import Counter, defaultdict
import io
from sklearn.preprocessing import normalize
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import plotly
import plotly.graph_objs as go
from IPython.display import IFrame 

In [None]:
# Make the local `utils` directory importable.
import sys
sys.path.append('utils')
# NOTE(review): the star import hides where names come from. `countries`, used further
# down when filtering by country, is presumably defined in `preprocess` -- confirm.
from preprocess import *

In [None]:
# Show full cell contents: `None` means "no truncation". The legacy `-1` sentinel
# was deprecated in pandas 1.0 in favour of `None`.
pd.set_option('display.max_colwidth', None)
pd.options.display.max_rows = 300

## Load tokenized data

In [None]:
# NOTE(review): unpickling can execute arbitrary code -- only load files you trust.
# The context manager closes the file handle as soon as loading finishes.
with open('data/podcast_data_translated.pkl', 'rb') as f:
    podcast_data = pickle.load(f)
podcasts, podcast_id_to_episodes = podcast_data['podcasts'], podcast_data['podcast_id_to_episodes']
podcasts.shape, len(podcast_id_to_episodes)

In [None]:
# Extract tokens
def extract_tokens(sent, ents_to_rm=None, rm_stop=False, rm_punct=False):
    """Return the lower-cased text of the tokens in `sent` that pass the filters.

    Args:
        sent: iterable of token objects exposing `text`, `ent_type_`,
            `is_stop` and `is_punct`.
        ents_to_rm: optional collection of entity labels; tokens whose
            `ent_type_` is in it are dropped.
        rm_stop: drop tokens flagged `is_stop` when true.
        rm_punct: drop tokens flagged `is_punct` when true.

    Returns:
        list of str: lower-cased token texts, in their original order.
    """
    def keep(tok):
        # Each filter is only consulted when it has been switched on.
        if ents_to_rm and tok.ent_type_ in ents_to_rm:
            return False
        if rm_stop and tok.is_stop:
            return False
        if rm_punct and tok.is_punct:
            return False
        return True

    return [tok.text.lower() for tok in sent if keep(tok)]

In [None]:
podcast_tokens = podcasts['summary_episodes_en_cleaned'].apply(lambda sents: [extract_tokens(sent, ents_to_rm=['GPE', 'NORP', 'PERSON'], rm_stop=True, rm_punct=True) for sent in sents]).tolist()

In [None]:
# Concatenate tokens from the same podcast
podcast_tokens = [list(itertools.chain(*podcast)) for podcast in podcast_tokens]
len(podcast_tokens)

## Create vocabulary

In [None]:
def create_vocab_mapper(toks, max_vocab=100000, min_freq=2, UNK='_unk_'):
    """Build token <-> index lookups from a flat iterable of tokens.

    Args:
        toks: iterable of hashable tokens (the whole corpus, flattened).
        max_vocab: only the `max_vocab` most frequent tokens are considered.
        min_freq: tokens seen fewer than `min_freq` times are dropped.
        UNK: placeholder token stored at index 0; pass a falsy value to omit it.

    Returns:
        (stoi, itos): `itos` lists the vocabulary tokens by index and `stoi`
        maps token -> index. When `UNK` is set, `stoi` is a `defaultdict`
        that yields 0 (the UNK index) for unseen tokens; note that a
        defaultdict stores every missing key it is asked for. Otherwise
        `stoi` is a plain dict and unseen tokens raise `KeyError`.
    """
    toks_freq = Counter(toks)
    # Skip a corpus token that equals UNK: it would otherwise appear twice in
    # `itos` and move the UNK entry of `stoi` away from index 0.
    itos = [
        tok for tok, count in toks_freq.most_common(max_vocab)
        if count >= min_freq and (not UNK or tok != UNK)
    ]

    if UNK:
        itos.insert(0, UNK)  # Note the index for UNK is 0
        # `int()` returns 0 and, unlike a lambda, keeps the mapping picklable.
        stoi = defaultdict(int, {tok: ix for ix, tok in enumerate(itos)})
    else:
        stoi = {tok: ix for ix, tok in enumerate(itos)}

    return stoi, itos

In [None]:
# Flatten the per-podcast token lists into one corpus-wide list of tokens
podcast_tokens_combined = list(itertools.chain.from_iterable(podcast_tokens))
len(podcast_tokens_combined)

In [None]:
# Build the token <-> index lookups with the default vocabulary limits.
stoi, itos = create_vocab_mapper(podcast_tokens_combined)
len(stoi)

In [None]:
# Peek at the first vocabulary entries (index 0 is the UNK placeholder).
itos[:10]

## Map to pretrained word embeddings

In [None]:
# Process fasttext embeddings
# https://fasttext.cc/docs/en/english-vectors.html
def load_vectors(fname):
    """Read a fastText-style text `.vec` file into a dict of word -> vector.

    Args:
        fname: path to the file. The first line must hold two integers
            (read here as word count and dimension); each following line is a
            word followed by its space-separated float components.

    Returns:
        dict mapping each word (str) to a 1-D numpy float array.

    Raises:
        ValueError: if the header line does not consist of exactly two integers.
    """
    data = {}
    # The context manager guarantees the file is closed, even if parsing fails.
    with io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        # The header values are not needed afterwards; parsing them validates the format.
        _n_words, _dim = map(int, fin.readline().split())
        for line in fin:
            tokens = line.rstrip().split(' ')
            data[tokens[0]] = np.array([float(value) for value in tokens[1:]])
    return data

In [None]:
# Load the pretrained fastText vectors; every line of the file ends up in an in-memory dict.
pretrained_word_embed = load_vectors('data/wiki-news-300d-1M-subword.vec')

In [None]:
# Map each word to the pre-trained word vectors
# Take the dimensionality from the loaded vectors instead of hard-coding it; fall back
# to 300 (the size implied by the "300d" file name) if nothing was loaded.
embed_dim = len(next(iter(pretrained_word_embed.values()))) if pretrained_word_embed else 300
itoe = np.zeros((len(itos), embed_dim))
not_found = []

for i, token in enumerate(itos):
    if token in pretrained_word_embed:
        itoe[i] = pretrained_word_embed[token]
    else:
        # Tokens without a pretrained vector keep their all-zero row.
        not_found.append(token)

# The label says "%", so print a real percentage; max(..., 1) avoids dividing by
# zero when the vocabulary is empty.
print('% of tokens not found in the pretrained embeddings: {:.2%}'.format(len(not_found) / max(len(itos), 1)))

In [None]:
# Inspect a sample of the vocabulary tokens that had no pretrained vector.
not_found[:10]

In [None]:
# Normalize
# Rescale every row (one word vector each) with sklearn's `normalize` defaults,
# i.e. to unit L2 norm; all-zero rows stay zero.
itoe = normalize(itoe)
itoe.shape

## Compute podcast embeddings

In [None]:
def compute_podcast_embeddings(podcast, stoi, itoe):
    """Average the word vectors of a podcast's tokens.

    Args:
        podcast: iterable of tokens belonging to one podcast.
        stoi: mapping token -> row index into `itoe`.
        itoe: 2-D array-like holding one word vector per row.

    Returns:
        1-D numpy array: the mean of the looked-up rows. A podcast without
        any tokens yields an all-zero vector, so every podcast returns a
        vector of the same length (the plain mean of nothing would be NaN).
    """
    itoe = np.asarray(itoe)

    # Map to indices
    podcast_ix = np.asarray([stoi[token] for token in podcast], dtype=np.intp)

    if podcast_ix.size == 0:
        return np.zeros(itoe.shape[1])

    # Map to word embeddings in one fancy-indexing step, then average them
    return itoe[podcast_ix].mean(axis=0)

In [None]:
# Embed every podcast and collect the vectors into one array (one entry per podcast).
podcast_embeddings = np.array([
    compute_podcast_embeddings(tokens, stoi, itoe)
    for tokens in podcast_tokens
])
podcast_embeddings.shape

In [None]:
# Normalize
# Rescale every podcast vector with sklearn's `normalize` defaults (unit L2 norm per row).
podcast_embeddings = normalize(podcast_embeddings)
podcast_embeddings.shape

## Visualize embeddings

In [None]:
def reduce_embed_to_2d(embeddings, algorithm='tsne'):
    """Project embeddings to two dimensions and return them as a DataFrame.

    Args:
        embeddings: 2-D array of shape (n_samples, n_features).
        algorithm: 'pca' or 'tsne'. Only consulted when the input has more
            than two columns.

    Returns:
        pandas.DataFrame with columns 'x' and 'y', one row per sample.

    Raises:
        ValueError: if a reduction is needed and `algorithm` is neither
            'pca' nor 'tsne' (previously this surfaced as a confusing
            shape error from the DataFrame constructor).
    """
    if embeddings.shape[1] > 2:
        if algorithm == 'pca':
            embeddings = PCA(n_components=2, random_state=0).fit_transform(embeddings)
        elif algorithm == 'tsne':
            embeddings = TSNE(n_components=2, init='pca', random_state=0).fit_transform(embeddings)
        else:
            raise ValueError("Unknown algorithm {!r}; expected 'pca' or 'tsne'".format(algorithm))

    # Convert to dataframe
    return pd.DataFrame(embeddings, columns=['x', 'y'])

In [None]:
# Project the podcast embeddings to 2-D with the default algorithm ('tsne').
podcast_embeddings_2d = reduce_embed_to_2d(podcast_embeddings)
podcast_embeddings_2d.shape

In [None]:
# Add podcast attributes
# NOTE(review): `podcast_embeddings_2d` was built from a bare array, so it has a default
# 0..n-1 row index. This assignment presumably aligns on that index, which assumes
# `podcasts` uses the same default index in the same row order -- confirm.
podcast_embeddings_2d[['country_fullname', 'podcast_id', 'im_name_label', 'summary_label', 'summary_label_en_cleaned', 'link_attributes_href', 'feedurl', 'artwork']] = podcasts[['country_fullname', 'podcast_id', 'im_name_label', 'summary_label', 'summary_label_en_cleaned', 'link_attributes_href', 'feedurl', 'artwork']].copy()

# Only keep the first few sentences of each description
def cut_sents(sents, max_len=150):
    """Join the leading sentences of a description and append ' ...'.

    Sentences are kept, in order, up to and including the first one at which
    the running total of sentence lengths reaches `max_len`; if the total
    never reaches it, all sentences are kept.

    Args:
        sents: sequence of sentence strings (may be empty).
        max_len: character budget, counted over sentence lengths only
            (the joining spaces are not counted).

    Returns:
        str: the kept sentences joined by single spaces, followed by ' ...'.
        An empty input yields just ' ...' instead of raising IndexError.
    """
    kept = []
    total_len = 0
    for sent in sents:
        kept.append(sent)
        total_len += len(sent)
        if total_len >= max_len:
            break
    return ' '.join(kept) + ' ...'

# Add hover text
# Shorten each description, wrap it at 50 characters and turn the line breaks into HTML ones.
podcast_embeddings_2d['summary_label_en_cleaned_brief'] = (
    podcast_embeddings_2d['summary_label_en_cleaned']
    .map(cut_sents)
    .str.wrap(50)
    .str.replace('\n', '<br>')
)
# The hover box shows the podcast name on the first line, then the shortened description.
podcast_embeddings_2d['hover_text'] = (
    podcast_embeddings_2d['im_name_label']
    + '<br>'
    + podcast_embeddings_2d['summary_label_en_cleaned_brief']
)

# Clean up summaries
# Collapse each list of sentences into one space-separated string.
for summary_col in ('summary_label', 'summary_label_en_cleaned'):
    podcast_embeddings_2d[summary_col] = podcast_embeddings_2d[summary_col].str.join(' ')

In [None]:
# Make sure the same podcast has the same (x, y)
# Average the coordinates per podcast_id ...
podcast_embeddings_2d_unique = (
    podcast_embeddings_2d
    .groupby('podcast_id')[['x', 'y']]
    .mean()
    .reset_index()
)
# ... then join them back onto every row. The old row position is kept in an
# 'index' column so the original row order can be restored after the merge.
podcast_embeddings_2d = pd.merge(
    podcast_embeddings_2d.drop(columns=['x', 'y']).reset_index(),
    podcast_embeddings_2d_unique,
    on='podcast_id',
    how='inner',
).sort_values('index')
podcast_embeddings_2d.shape

In [None]:
# Remove podcasts that are not in the country list
# `countries` is not defined in this notebook; it is presumably provided by the
# `preprocess` star import -- verify.
in_country_list = podcast_embeddings_2d['country_fullname'].isin(countries.values())
podcast_embeddings_2d = podcast_embeddings_2d.loc[in_country_list].copy()
podcast_embeddings_2d.shape

In [None]:
# Only keep the top N podcasts per country
# `groupby(...).head(n)` keeps the first n rows of each country in the current row order.
# NOTE(review): "top" therefore assumes the rows are already ordered by rank -- confirm.
n_podcasts = 30
podcast_embeddings_2d = podcast_embeddings_2d.groupby('country_fullname').head(n_podcasts)
podcast_embeddings_2d.shape

In [None]:
# Save plot data
# index=False: the DataFrame's row labels are not written out as an extra column.
podcast_embeddings_2d.to_csv('data/podcast_embeddings_2d.csv', index=False)

In [None]:
# Extract podcast descriptions
# One row per podcast_id; when a podcast appears under several countries, the
# 'United States' row is preferred (it sorts first within each podcast_id).
podcast_id_to_desc = (
    podcast_embeddings_2d[['podcast_id', 'im_name_label', 'summary_label', 'summary_label_en_cleaned', 'link_attributes_href', 'feedurl', 'artwork']]
    .assign(is_us=podcast_embeddings_2d['country_fullname'] == 'United States')
    .sort_values(['podcast_id', 'is_us'], ascending=[True, False])
    .drop('is_us', axis=1)
    .drop_duplicates(subset='podcast_id', keep='first')
)
podcast_id_to_desc.shape

In [None]:
# Convert to dict
# Result shape: {podcast_id: {column name: value, ...}}
podcast_id_to_desc = podcast_id_to_desc.set_index('podcast_id').to_dict('index')
# Drop the translated summary wherever it is identical to the original one.
for desc in podcast_id_to_desc.values():
    is_untranslated = desc['summary_label'] == desc['summary_label_en_cleaned']
    if is_untranslated:
        del desc['summary_label_en_cleaned']

In [None]:
# Save
# The context manager flushes and closes the file deterministically.
# protocol=2 is an older pickle format that Python 2 can also read.
with open('data/podcast_id_to_desc.pkl', 'wb') as f:
    pickle.dump(podcast_id_to_desc, f, protocol=2)

## Clean up episodes data for display

In [None]:
def pair_summary_w_en(episode):
    """Pair each summary sentence of an episode with its translation.

    Args:
        episode: mapping with the keys 'summary' and 'summary_en_cleaned',
            two parallel sequences of sentences. If their lengths differ,
            the extra items of the longer one are ignored.

    Returns:
        list of dicts, one per sentence pair: always an 'original' entry,
        plus a 'translated' entry only when it differs from the original.
    """
    pairs = []
    # Plain zip instead of a throwaway DataFrame per episode: same output, far less overhead.
    for original, translated in zip(episode['summary'], episode['summary_en_cleaned']):
        pair = {'original': original}
        if not (original == translated):
            pair['translated'] = translated
        pairs.append(pair)
    return pairs

In [None]:
podcast_id_to_episodes_cleaned = {}
n_episodes = 30
episode_columns = ['date', 'title', 'title_en_cleaned', 'summary', 'summary_en_cleaned', 'link']
for podcast_id in podcast_embeddings_2d['podcast_id'].unique():
    # Keep only the first `n_episodes` rows and the display columns. Copying *after*
    # slicing gives an independent frame for the column assignments below (no
    # SettingWithCopyWarning) and avoids copying the podcast's whole history.
    # NOTE(review): "recent" assumes the episodes are stored newest-first -- confirm.
    episodes = podcast_id_to_episodes[podcast_id].head(n_episodes)[episode_columns].copy()

    # Clean up: dates become 'YYYY-MM-DD' strings (UTC), titles become single strings
    episodes['date'] = pd.to_datetime(episodes['date'], utc=True).dt.date.astype('str')
    episodes['title'] = episodes['title'].str.join(' ')
    episodes['title_en_cleaned'] = episodes['title_en_cleaned'].str.join(' ')

    # Convert to dict
    episodes = episodes.to_dict('records')

    # Pair summary and its translations
    for episode in episodes:
        # Drop the translated title when it is identical to the original
        if episode['title'] == episode['title_en_cleaned']:
            episode.pop('title_en_cleaned')

        # Replace the two parallel summary lists with one list of sentence pairs
        episode['summary_w_en'] = pair_summary_w_en(episode)
        episode.pop('summary')
        episode.pop('summary_en_cleaned')

    podcast_id_to_episodes_cleaned[podcast_id] = episodes

len(podcast_id_to_episodes_cleaned)

In [None]:
# Save
# The context manager flushes and closes the file deterministically.
# protocol=2 is an older pickle format that Python 2 can also read.
with open('data/podcast_id_to_episodes_cleaned.pkl', 'wb') as f:
    pickle.dump(podcast_id_to_episodes_cleaned, f, protocol=2)