In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from ast import literal_eval

# Using https://huggingface.co/sentence-transformers
from sentence_transformers import SentenceTransformer

from nltk import sent_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [9]:
df_prepro = pd.read_csv("../data/ted_talks_preprocessed.csv")
df_prepro.head(2)

Unnamed: 0,talk_id,title,speaker,occupations,about_speaker,views,recorded_date,published_date,event,available_lang,comments,duration,topics,related_talks,description,transcript
0,1,Averting the climate crisis,Al Gore,['climate advocate'],Nobel Laureate Al Gore focused the world’s att...,3523392,2006-02-25,2006-06-27,TED2006,"['ar', 'bg', 'cs', 'de', 'el', 'en', 'es', 'fa...",272,977,"['alternative energy', 'cars', 'climate change...","[243, 547, 2093, 54715, 29968, 2339]",With the same humor and humanity he exuded in ...,"Thank you so much, Chris. And it's truly a gre..."
1,2,Simple designs to save a life,Amy Smith,"['inventor', 'engineer']","Amy Smith designs cheap, practical fixes for t...",1724438,2006-02-24,2006-08-15,TED2006,"['ar', 'bg', 'ca', 'de', 'el', 'en', 'es', 'fa...",101,906,"['MacArthur grant', 'alternative energy', 'des...","[1561, 1072, 1184, 1406, 767, 285]",Fumes from indoor cooking fires kill more than...,"In terms of invention, I'd like to tell you th..."


In [10]:
df_prepro.occupations = df_prepro.occupations.apply(literal_eval)
df_prepro.available_lang = df_prepro.available_lang.apply(literal_eval)
df_prepro.topics = df_prepro.topics.apply(literal_eval)
df_prepro.related_talks = df_prepro.related_talks.apply(literal_eval)

df_prepro.recorded_date = pd.to_datetime(df_prepro.recorded_date)
df_prepro.published_date = pd.to_datetime(df_prepro.published_date)

## Occupations

In [4]:
df_prepro.occupations[df_prepro.occupations.map(len) > 1]

1                                [inventor, engineer]
5                       [biologist, genetics pioneer]
7                    [architect, experience designer]
9                                 [physician, author]
10                  [primatologist, environmentalist]
                            ...                      
3917                             [author, journalist]
3939                 [classicist, political theorist]
3945    [psychotherapist, psychosomatic psychiatrist]
3949                            [artist, storyteller]
3953                 [epidemiologist, philanthropist]
Name: occupations, Length: 719, dtype: object

In [5]:
def boolean_df(item_lists, unique_items):
    bool_dict = {}
    
    for item in unique_items:
        bool_dict[item] = item_lists.apply(lambda x: item in x)
    
    return pd.DataFrame(bool_dict)

In [6]:
occupations_one_hot = boolean_df(df_prepro.occupations, df_prepro.occupations.explode().unique())
occupations_one_hot

Unnamed: 0,climate advocate,inventor,engineer,president-elect of afghanistan,aircraft engineer,car designer,biologist,genetics pioneer,technology columnist,architect,...,social designer,neurodiversity advocate,digital marketing consultant,farmer,infectious disease scientist,interdisciplinary researcher,classicist,law researcher,psychosomatic psychiatrist,political strategist
0,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,True,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3952,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3953,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3954,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3955,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


## Discussion rate

In [7]:
df_prepro["discussion_rate"] = df_prepro.comments / df_prepro.views

## Country

In [8]:
event_country_mapping = pd.read_csv("../data/event_country_mapping.csv")

df_prepro = pd.merge(df_prepro, event_country_mapping, on="event")
df_prepro.drop("event", axis=1, inplace=True)

In [9]:
pd.get_dummies(df_prepro.country)

Unnamed: 0,Africa,Argentina,Australia,Austria,Austrich,Belgium,Bulgaria,Canada,China,Denmark,...,Scotland,Singapore,Slovakia,South Africa,Spain,Sweden,Switzerland,USA,United Arab Emirates,United States
0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3940,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3941,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3942,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3943,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


# Description

In [None]:
# TODO: remove (Applause), etc.

In [4]:
bert = SentenceTransformer('bert-base-nli-mean-tokens')

In [12]:
desc_embeds = bert.encode(df_prepro.description)
desc_embeds.shape

In [15]:
np.save("../data/desc_embeddings.npy", desc_embeds)

# About speaker

In [37]:
speak_embeds = bert.encode(df_prepro.about_speaker)
speak_embeds.shape

(3957, 768)

In [38]:
np.save("../data/embeddings/speak_embeddings.npy", speak_embeds)

# Transcript

In [18]:
analyser = SentimentIntensityAnalyzer()

def compound_mean(transcript):
    sentences = sent_tokenize(transcript)
    return np.mean([analyser.polarity_scores(sentence)['compound'] for sentence in sentences])

In [19]:
tqdm.pandas()

sentiments = df_prepro.transcript.progress_apply(compound_mean)

100%|██████████| 3957/3957 [02:56<00:00, 22.48it/s]


In [31]:
# np.save("../data/embeddings/sentiments.npy", sentiments)
sentiments = np.load("../data/embeddings/sentiments.npy")

In [7]:
np.argmin(sentiments)

3241

# Dates

In [15]:
df_prepro.published_date.apply(lambda date: date.timestamp())

0       1.151366e+09
1       1.155600e+09
2       1.161130e+09
3       1.161734e+09
4       1.175731e+09
            ...     
3952    1.587600e+09
3953    1.587686e+09
3954    1.587946e+09
3955    1.588205e+09
3956    1.588205e+09
Name: published_date, Length: 3957, dtype: float64

In [27]:
df_prepro.apply(lambda elt: elt.views / (pd.to_datetime("2020-05-01") - elt.published_date).days, axis=1)

0          696.735614
1          344.336661
2          198.608414
3          491.795422
4          204.917906
            ...      
3952     27396.375000
3953    108709.857143
3954     76796.750000
3955     56582.000000
3956         0.000000
Length: 3957, dtype: float64