In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Re-import edited local modules automatically before each cell runs.
%reload_ext autoreload
%autoreload 2
# Ask the inline backend for high-resolution ("retina") figures.
%config InlineBackend.figure_format = 'retina'

In [None]:
import numpy as np
import pandas as pd
import pickle
import itertools
import spacy
from sklearn.feature_extraction import stop_words
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import re
import string
import math
import plotly
import plotly.graph_objs as go
import colorlover as cl
from IPython.display import IFrame 

In [None]:
import sys
# Make the project-local helper modules under ./utils importable.
sys.path.append('utils')
# NOTE(review): the star import hides which names come from `preprocess`.
# `countries`, used later to filter podcasts, is presumably defined there — confirm.
from preprocess import *

In [None]:
# Show full cell contents instead of truncating long text columns.
# `None` is the supported "no limit" value: `-1` was deprecated in pandas 1.0
# and is rejected outright by pandas 2.x. Very old releases only accept an int,
# so fall back to the legacy sentinel there.
try:
    pd.set_option('display.max_colwidth', None)
except (ValueError, TypeError):
    pd.set_option('display.max_colwidth', -1)

# Allow long frames/series (e.g. value counts) to be displayed in full.
pd.options.display.max_rows = 300

## Load translated data

In [None]:
# NOTE: unpickling can execute arbitrary code — only load files produced by this project.
# The context manager guarantees the file handle is closed after reading.
with open('data/podcast_data_translated.pkl', 'rb') as f:
    podcast_data = pickle.load(f)

# The pickle holds a dict with the podcast table and a mapping of podcast id -> episodes.
podcasts, podcast_id_to_episodes = podcast_data['podcasts'], podcast_data['podcast_id_to_episodes']
podcasts.shape, len(podcast_id_to_episodes)

In [None]:
# Sanity check: number of podcasts per country.
podcasts['country_fullname'].value_counts()

## Combine podcast descriptions and episodes

In [None]:
# First, combine episode titles and summaries.
# For every podcast: add each episode's title to its summary element-wise
# (presumably both columns hold lists — confirm), then flatten all episodes
# of that podcast into one list.
podcast_id_to_episodes_combined = {
    pid: list(
        itertools.chain.from_iterable(
            (eps['title_en_cleaned'] + eps['summary_en_cleaned_deduped']).tolist()
        )
    )
    for pid, eps in podcast_id_to_episodes.items()
}

len(podcast_id_to_episodes_combined)

In [None]:
# Then, combine episodes with podcast summaries.
# Look up each podcast's flattened episode content by podcast id ...
episodes_en_cleaned = podcasts['podcast_id'].map(podcast_id_to_episodes_combined)
# ... and append it, row by row, to the podcast's own cleaned summary.
podcast_summary_en_cleaned = podcasts['summary_label_en_cleaned']
summary_episodes_en_cleaned = podcast_summary_en_cleaned + episodes_en_cleaned
summary_episodes_en_cleaned.shape[0]

## Tokenize combined podcast summaries

In [None]:
# Load spaCy's small English pipeline once; the tokenizing helper below reads this global.
spacy_en = spacy.load('en_core_web_sm')

In [None]:
def tokenize_list_of_sents_w_spacy(sents):
    """Run every sentence in ``sents`` through the global ``spacy_en`` pipeline.

    Args:
        sents: Iterable of sentence strings.

    Returns:
        list: One processed object per input sentence, in input order, exactly
        as yielded by ``spacy_en.pipe`` (an empty list for empty input).
    """
    # `n_threads` is deliberately not passed: spaCy 2.1 deprecated it (the value
    # was ignored) and spaCy 3 rejects it with a TypeError.
    return list(spacy_en.pipe(sents))

In [None]:
# Tokenize combined episodes with podcast summaries.
# Each row (presumably a list of sentence strings — confirm) is replaced by the list of
# objects returned by the spaCy pipeline and stored as a new column on `podcasts`.
podcasts['summary_episodes_en_cleaned'] = summary_episodes_en_cleaned.apply(tokenize_list_of_sents_w_spacy)

In [None]:
# Save
podcast_data = {
    'podcasts': podcasts,
    'podcast_id_to_episodes': podcast_id_to_episodes
}

# NOTE: this overwrites the same file the notebook loaded its input from.
# The context manager ensures the data is flushed and the handle closed.
with open('data/podcast_data_translated.pkl', 'wb') as f:
    pickle.dump(podcast_data, f)

## Compare noun phrases

In [None]:
# Reload the saved data so this section can be run without re-tokenizing.
# NOTE: unpickling can execute arbitrary code — only load files produced by this project.
with open('data/podcast_data_translated.pkl', 'rb') as f:
    podcast_data = pickle.load(f)

podcasts, podcast_id_to_episodes = podcast_data['podcasts'], podcast_data['podcast_id_to_episodes']
podcasts.shape, len(podcast_id_to_episodes)

In [None]:
# Remove podcasts that are not in the country list.
# Build the membership mask first, then take an independent copy of the matching rows.
in_country_list = podcasts['country'].isin(countries)
podcasts = podcasts.loc[in_country_list].copy()
podcasts.shape

In [None]:
# Number of podcasts per country after restricting to the country list.
podcasts['country_fullname'].value_counts()

In [None]:
# Extract noun phrases
def extract_noun_phrases(sent):
    """Return the noun-only, lemmatized form of every noun chunk in ``sent``.

    Args:
        sent: Object exposing ``noun_chunks``, an iterable of chunks whose
            tokens have ``lemma_`` and ``pos_`` attributes.

    Returns:
        list[str]: One space-joined string of lower-cased noun lemmas per chunk,
        in chunk order. Chunks containing no ``NOUN`` token are skipped.
    """
    noun_phrases = []
    for chunk in sent.noun_chunks:
        # Keep only the nouns of the chunk, as lower-cased lemmas.
        noun_lemmas = [tok.lemma_.lower() for tok in chunk if tok.pos_ == 'NOUN']
        if noun_lemmas:
            noun_phrases.append(' '.join(noun_lemmas))
    return noun_phrases

In [None]:
# For each podcast, gather the noun phrases of all its sentences into one flat list.
summary_episodes_en_cleaned_phrases = podcasts['summary_episodes_en_cleaned'].apply(
    lambda sents: [phrase for sent in sents for phrase in extract_noun_phrases(sent)]
)

In [None]:
# Count noun phrases
# (despite the variable name, the fitted values are TF-IDF weights, not raw counts).
def _identity_tokenizer(phrases):
    """Hand the already-split list of phrases back unchanged."""
    return phrases


tfidf_vectorizer = TfidfVectorizer(
    tokenizer=_identity_tokenizer,
    lowercase=False,
    min_df=5,
    stop_words="english",
)
phrase_count = tfidf_vectorizer.fit_transform(summary_episodes_en_cleaned_phrases)
phrase_count.shape

In [None]:
# Clean up
# `get_feature_names()` was removed in scikit-learn 1.2; use its replacement
# `get_feature_names_out()` when the installed version provides it.
_get_feature_names = getattr(tfidf_vectorizer, 'get_feature_names_out', None) or tfidf_vectorizer.get_feature_names

# Densify the sparse matrix into a frame with one column per phrase.
phrase_count = phrase_count.toarray()
phrase_count = pd.DataFrame(phrase_count, columns=list(_get_feature_names()))

# Drop phrases containing any of these substrings, and columns that are just punctuation.
patterns_to_rm = ['issue', 'episode', 'podcast', 'program', 'trailer', 'floor', 'section', 'addition', 'guest', 'listener', 'host', 'first order', 'tune', 'editor', 'discount']
# Compile the alternation once instead of rebuilding it for every column.
pattern_to_rm_re = re.compile('|'.join(patterns_to_rm))
cols_to_keep = [col for col in phrase_count if not pattern_to_rm_re.search(col) and col not in string.punctuation]
# `.copy()` makes the column subset an independent frame, so adding `podcast_id`
# below does not trigger pandas' SettingWithCopyWarning.
phrase_count = phrase_count[cols_to_keep].copy()

# Rows are in the same order as `podcasts`; attach ids positionally and keep one row per podcast.
phrase_count['podcast_id'] = podcasts['podcast_id'].tolist()
phrase_count = phrase_count.drop_duplicates(subset='podcast_id')
phrase_count.shape

In [None]:
# Reshape to long: one row per (podcast, phrase) pair, ordered by podcast
# and then by descending TF-IDF weight.
phrase_count_lng = (
    phrase_count
    .melt(id_vars='podcast_id', var_name='phrase', value_name='tfidf')
    .sort_values(['podcast_id', 'tfidf'], ascending=[True, False])
)
phrase_count_lng.shape

In [None]:
# Keep the top ones
top_n = 10
# The long table has a row for every (podcast, phrase) pair, including phrases
# with a TF-IDF of 0, i.e. phrases the podcast does not contain. Drop those
# first; otherwise podcasts with fewer than `top_n` phrases get padded with
# unrelated phrases. Relies on the rows already being sorted by descending
# TF-IDF within each podcast.
phrase_count_top = phrase_count_lng[phrase_count_lng['tfidf'] > 0].groupby('podcast_id').head(top_n)

In [None]:
# Convert to dict: podcast id -> list of its top phrases (in ranked order).
phrases_by_podcast = phrase_count_top.groupby('podcast_id')['phrase']
podcast_id_to_phrase_count = phrases_by_podcast.apply(lambda grp: grp.tolist()).to_dict()

In [None]:
# Add colors
def add_colors(n, offset=3):
    """Build ``n`` hex color strings from colorlover's 9-class sequential 'Blues' scale.

    The scale is interpolated to ``n + offset`` colors and reversed; the last
    ``offset`` colors of the reversed list are then discarded.

    Args:
        n: Number of colors to return.
        offset: Number of extra interpolated colors to generate and then drop
            from the end of the reversed scale. ``0`` keeps the whole scale.

    Returns:
        list[str]: ``n`` colors formatted as ``'#rrggbb'``.
    """
    colors = cl.scales['9']['seq']['Blues']
    colors = cl.interp(colors, n + offset)
    colors = cl.to_numeric(cl.to_rgb(colors))
    # Truncate each numeric channel to an int and format as two hex digits.
    colors = ['#%02x%02x%02x' % tuple(int(c) for c in color) for color in colors]
    # Slice by count instead of `[:-offset]`: with offset=0 that slice is `[:0]`
    # and would silently return no colors at all.
    return colors[::-1][:max(n, 0)]

In [None]:
# Pair every phrase with a color; each podcast gets as many colors as it has phrases.
podcast_id_to_phrase_count = {
    pid: list(zip(top_phrases, add_colors(len(top_phrases))))
    for pid, top_phrases in podcast_id_to_phrase_count.items()
}

len(podcast_id_to_phrase_count)

In [None]:
# Save
# Pickle protocol 2 is kept as in the original call.
# The context manager ensures the data is flushed and the handle closed.
with open('data/podcast_id_to_phrase_count.pkl', 'wb') as f:
    pickle.dump(podcast_id_to_phrase_count, f, protocol=2)