In [None]:
# Render matplotlib figures inline in the notebook output
%matplotlib inline
# (Re)load the autoreload extension and reload imported modules before each
# cell runs, so edits to local modules are picked up without a kernel restart
%reload_ext autoreload
%autoreload 2
# Ask the inline backend for high-resolution ('retina') figures
%config InlineBackend.figure_format = 'retina'

In [None]:
import os
import pickle
import shutil

import numpy as np
import pandas as pd
import requests
from google.cloud import translate
# Alias that stays bound to the module even after the notebook defines its own
# `translate()` function further down
from google.cloud import translate as google_translate
from polyglot.detect import Detector
from polyglot.detect.base import UnknownLanguage

os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'cred/translate-cred.json'

In [None]:
import sys
sys.path.append('utils')
from preprocess import *

In [None]:
# Show full cell contents and up to 300 rows when displaying DataFrames.
# `None` is the documented "no limit" value for `max_colwidth`; the legacy `-1`
# sentinel was deprecated in pandas 1.0 and is rejected by newer releases.
pd.set_option('display.max_colwidth', None)
pd.options.display.max_rows = 300

## Load processed data

In [None]:
# Load the processed podcast data. The context manager closes the file handle
# as soon as the pickle has been read.
# NOTE: unpickling can execute arbitrary code -- only load files produced by this project.
with open('data/podcast_data_proc.pkl', 'rb') as pkl_file:
    podcast_data = pickle.load(pkl_file)
podcasts, podcast_id_to_episodes = podcast_data['podcasts'], podcast_data['podcast_id_to_episodes']
podcasts.shape, len(podcast_id_to_episodes)

## Full-join with the previously-translated data

In [None]:
# Load previously-translated data. The context manager closes the file handle
# as soon as the pickle has been read.
# NOTE: unpickling can execute arbitrary code -- only load files produced by this project.
with open('data/podcast_data_translated.pkl', 'rb') as pkl_file:
    podcast_data_translated = pickle.load(pkl_file)
podcasts_translated, podcast_id_to_episodes_translated = podcast_data_translated['podcasts'], podcast_data_translated['podcast_id_to_episodes']
podcasts_translated.shape, len(podcast_id_to_episodes_translated)

In [None]:
def combine_two_dfs(df_1, df_2, id_col, sort_cols, ascending=True):
    """Full-join two DataFrames on `id_col`, preferring the values of `df_1`.

    Rows are matched on `id_col`. Where both frames have a row, non-null values
    of `df_1` win and its nulls are filled from `df_2`; rows that exist in only
    one frame are kept. Neither input is modified.

    Two helper columns are available to `sort_cols` while sorting and are
    dropped from the result:
      * `source` -- 0 for rows present in `df_1`, 1 for rows only in `df_2`
      * `index`  -- the row label the record had in its originating frame

    Args:
        df_1: Primary frame. Every one of its columns must also exist in `df_2`.
        df_2: Secondary frame; duplicated `id_col` values keep their first row.
        id_col: Name of the column identifying a record.
        sort_cols: Column(s) to sort the combined frame by.
        ascending: Sort direction(s), forwarded to `DataFrame.sort_values`.

    Returns:
        A DataFrame with a fresh integer index whose columns are `id_col`, the
        remaining columns of `df_1`, then the columns found only in `df_2`.

    Raises:
        AssertionError: If `df_1` has a column that `df_2` lacks.
    """
    # `df_2` must carry every column of `df_1`
    shared_cols = np.intersect1d(df_1.columns, df_2.columns)
    assert np.array_equal(sorted(df_1.columns), sorted(shared_cols))

    # Give `df_1` an empty placeholder for each column that only `df_2` has
    primary = df_1.copy()
    for extra_col in np.setdiff1d(df_2.columns, df_1.columns):
        primary[extra_col] = np.nan

    # Keep the original row labels as an `index` column, then key both frames by ID
    primary = primary.reset_index().set_index(id_col)
    secondary = df_2.drop_duplicates(id_col).reset_index().set_index(id_col)

    # Tag where each row comes from
    primary['source'] = 0
    secondary['source'] = 1

    # Combine, sort, then strip the helper columns again
    combined = primary.combine_first(secondary).sort_values(sort_cols, ascending=ascending)
    return combined[primary.columns].drop(columns=['index', 'source']).reset_index()

In [None]:
# Combine podcasts: within each country, the current records come first
# (source 0), followed by the ones that only exist in the translated data
podcasts = combine_two_dfs(
    podcasts,
    podcasts_translated,
    id_col='podcast_id',
    sort_cols=['country', 'source', 'index'],
)
podcasts.shape

In [None]:
# Combine episodes
translation_cols = ['title_en', 'title_en_cleaned', 'summary_en', 'summary_en_cleaned', 'summary_en_cleaned_deduped']

for podcast_id in podcasts['podcast_id'].unique():
    has_current = podcast_id in podcast_id_to_episodes
    has_translated = podcast_id in podcast_id_to_episodes_translated

    if has_current and has_translated:
        # Merge the fresh episodes with their earlier translations, newest first
        podcast_id_to_episodes[podcast_id] = combine_two_dfs(
            podcast_id_to_episodes[podcast_id],
            podcast_id_to_episodes_translated[podcast_id],
            id_col='episode_id',
            sort_cols=['date'],
            ascending=False,
        )
    elif has_translated:
        # If a podcast is not found among the top anymore, do not update or translate its episodes
        podcast_id_to_episodes[podcast_id] = podcast_id_to_episodes_translated[podcast_id].copy()
    elif has_current:
        # Never translated before: add the translation columns as placeholders.
        # Existing columns are kept, because the checkpoint written by the
        # translation step stores `*_en` columns in the same file this
        # notebook loads its input from.
        episodes = podcast_id_to_episodes[podcast_id]
        for col in translation_cols:
            if col not in episodes.columns:
                episodes[col] = np.nan
    # A podcast with no episodes in either mapping has nothing to combine, so it
    # is skipped instead of raising a KeyError.

len(podcast_id_to_episodes)

## Translate

In [None]:
# Create the client through the module alias rather than the bare name
# `translate`: this notebook later defines a `translate()` function that rebinds
# that name, so `translate.Client()` raises AttributeError whenever this cell is
# re-run after that definition.
translate_client = google_translate.Client()

In [None]:
def translate_using_google(text):
    """Translate `text` into English with the module-level `translate_client`.

    The request targets English with the 'nmt' model and leaves the source
    language for the service to detect.

    Returns:
        A `(detected_source_language, translated_text)` tuple read from the
        `detectedSourceLanguage` and `translatedText` fields of the response.
    """
    # NOTE(review): the service may return HTML-escaped text by default --
    # confirm the downstream cleaning step copes with entities.
    response = translate_client.translate(text, target_language='en', model='nmt')
    source_lang = response['detectedSourceLanguage']
    translated_text = response['translatedText']
    return source_lang, translated_text

def translate(text, detect_lang=False):
    """Translate `text` to English, calling Google only when it seems necessary.

    The language is first detected offline with polyglot's `Detector`:
      * reliably detected English -> the text is returned unchanged;
      * any other language, or an unreliable detection -> Google translation;
      * `UnknownLanguage` from the detector -> Google translation;
      * any other exception raised in the first step (including one raised by
        the Google call made there) -> the text is returned unchanged and
        labelled 'en' (best-effort fallback).

    NOTE: this definition rebinds the notebook-level name `translate`, which
    was bound to the imported `google.cloud` module before this cell ran.

    Args:
        text: The text to translate.
        detect_lang: When true, return the source language as well.

    Returns:
        `(source_language, translated_text)` if `detect_lang` is true,
        otherwise just the translated text.
    """
    untranslated = ('en', text)

    try:
        # Detect the language offline before spending an API call
        detection = Detector(text)
        lang_code, is_reliable = detection.language.code, detection.reliable

        if is_reliable and lang_code == 'en':
            result = untranslated
        else:
            result = translate_using_google(text)

    except UnknownLanguage:
        result = translate_using_google(text)

    except Exception:
        result = untranslated

    return result if detect_lang else result[1]

In [None]:
def translate_sents(sents, detect_lang=False):
    """Translate a list of sentences one by one with `translate`.

    Args:
        sents: The sentences to translate; a falsy value (empty list, None)
            means there is nothing to translate.
        detect_lang: When true, also report the primary source language, i.e.
            the language detected for the largest number of sentences.

    Returns:
        The list of translated sentences, or `(primary_language, sentences)`
        if `detect_lang` is true. For empty input this is `[]` or
        `(np.nan, [])` respectively.
    """
    # Nothing to translate
    if not sents:
        return (np.nan, []) if detect_lang else []

    results = [translate(sent, detect_lang) for sent in sents]
    if not detect_lang:
        return results

    # Separate the detected languages from the translations
    langs, translated_sents = zip(*results)

    # The primary language is the most frequent one
    primary_lang = pd.Series(langs).value_counts().index[0]

    return primary_lang, list(translated_sents)

In [None]:
def translate_df(df, cols_to_translate, detect_langs):
    """Translate list-of-sentence columns of `df` into new `<col>_en` columns.

    Only rows whose `<col>_en` value is still null are passed to
    `translate_sents`, so existing translations are kept and not re-requested.

    Args:
        df: The frame to translate; it is copied, not modified.
        cols_to_translate: Names of the columns to translate.
        detect_langs: One flag per column in `cols_to_translate`. When true,
            the primary language reported by `translate_sents` for that column
            is also stored in a `src_lang` column.

    Returns:
        A copy of `df` with the `<col>_en` (and, if requested, `src_lang`)
        columns added or completed.
    """
    df = df.copy()

    for col, detect_lang in zip(cols_to_translate, detect_langs):
        target_col = col + '_en'

        # The cells hold Python lists / strings, so the columns must be
        # object-typed up front: writing lists into a float NaN placeholder
        # relies on silent upcasting, which newer pandas releases reject.
        needed_cols = [target_col, 'src_lang'] if detect_lang else [target_col]
        for name in needed_cols:
            if name not in df:
                df[name] = pd.Series(np.nan, index=df.index, dtype=object)
            elif df[name].dtype != object:
                df[name] = df[name].astype(object)

        # Only translate the records that have not been translated
        pending = df[target_col].isnull()
        if not pending.any():
            continue

        if detect_lang:
            # Each result is a `(src_lang, sentences)` tuple. The two parts are
            # written column by column: assigning the list of tuples to both
            # columns at once makes NumPy coerce ragged nested sequences,
            # which fails on recent NumPy versions.
            results = df.loc[pending, col].apply(lambda sents: translate_sents(sents, True))
            df.loc[pending, 'src_lang'] = results.apply(lambda res: res[0])
            df.loc[pending, target_col] = results.apply(lambda res: res[1])
        else:
            df.loc[pending, target_col] = df.loc[pending, col].apply(translate_sents)

    return df

In [None]:
# Translate podcast summaries (no source-language detection needed here)
podcasts = translate_df(
    podcasts,
    cols_to_translate=['summary_label'],
    detect_langs=[False],
)

In [None]:
# Translate episode titles and summaries; the source language is detected
# from the summaries only
for podcast_id, episodes in podcast_id_to_episodes.items():
    podcast_id_to_episodes[podcast_id] = translate_df(
        episodes,
        cols_to_translate=['title', 'summary'],
        detect_langs=[False, True],
    )

In [None]:
# Save the translations made so far.
# NOTE(review): this overwrites the same file the notebook loads its input
# from -- presumably an intentional checkpoint of the translation step; confirm.
podcast_data = {
    'podcasts': podcasts,
    'podcast_id_to_episodes': podcast_id_to_episodes
}

# The context manager flushes and closes the file once the dump is complete
with open('data/podcast_data_proc.pkl', 'wb') as pkl_file:
    pickle.dump(podcast_data, pkl_file)

## Clean translations

In [None]:
def clean_sents(sents):
    """Apply `clean_text` to every sentence and return the results as a list."""
    cleaned = []
    for sent in sents:
        cleaned.append(clean_text(sent))
    return cleaned

In [None]:
# Clean podcast summaries: run `clean_sents` over each translated summary and
# store the result next to it in `summary_label_en_cleaned`
podcasts['summary_label_en_cleaned'] = podcasts['summary_label_en'].apply(clean_sents)

In [None]:
# Clean episode titles and summaries (each frame is updated in place)
for episodes in podcast_id_to_episodes.values():
    episodes['title_en_cleaned'] = episodes['title_en'].apply(clean_sents)
    episodes['summary_en_cleaned'] = episodes['summary_en'].apply(clean_sents)

In [None]:
# Remove duplicate summaries among the episodes of each podcast; the cleaned
# titles are passed along with the cleaned summaries
for episodes in podcast_id_to_episodes.values():
    episodes['summary_en_cleaned_deduped'] = remove_duplicate_summaries(
        episodes['summary_en_cleaned'].tolist(),
        episodes['title_en_cleaned'].tolist(),
        dedupe_within_summaries=True,
        need_sent_tokenization=False,
    )

In [None]:
# Back up the previous translations before they are overwritten below.
# `shutil.copy` works on every platform and raises if the copy fails, so the
# overwrite never happens without a backup in place.
shutil.copy('data/podcast_data_translated.pkl', 'data/podcast_data_translated-OLD.pkl')

In [None]:
# Save the translated and cleaned data
podcast_data = {
    'podcasts': podcasts,
    'podcast_id_to_episodes': podcast_id_to_episodes
}

# The context manager flushes and closes the file once the dump is complete
with open('data/podcast_data_translated.pkl', 'wb') as pkl_file:
    pickle.dump(podcast_data, pkl_file)