In [1]:
import pandas as pd
import numpy as np

import re

In [3]:
data = pd.read_csv('articles_eng_2020.csv').dropna()

In [7]:
data

Unnamed: 0.1,Unnamed: 0,id,date,headline,content
0,0,bnl-newyorktimes528-20200216-489baa4a905a,2020-02-16 00:00:00+00:00,Questioning CPR as a Default Response,<i><b>DR. MONIQUE STARKS</b>&nbsp;DUKE UNIVERS...
1,1,bnl-chicagotribune-20200219-89419fb1,2020-02-19 00:00:00+00:00,‘Taking Sexy Back’ a fantastic and timely book,"If I didn’t know better, I would think Alexand..."
2,2,bnl-atavist-20200226-5e52a7d36b667,2020-02-26 00:00:00+00:00,Deliverance,<b>Devilry of the kind</b> necessary to kill a...
3,3,bnl-economist-20200131-7f87b05bd7e,2020-01-31 00:00:00+00:00,Whendunnit?,Since the first use of fingerprints to identif...
4,4,bnl-fastcompany-20200201-ba11405d1ed,2020-02-01 00:00:00+00:00,EXPERIENCE MATTERS,<strong>IN DIGITAL</strong> With support from ...
...,...,...,...,...,...
157746,157746,bnl-areweeurope-20201224-f2b212e39c7,2020-12-24 00:00:00+00:00,The Black Curriculum,<strong>AUTHOR'S MOTIVATION</strong> <em>I rec...
157747,157747,bnl-time-20200613-c8d5928f67d,2020-06-13 00:00:00+00:00,Which countries have handled COVID-19 best?,The coronavirus crisis has shifted To The amer...
157748,157748,bnl-newyorktimes528-20200312-100000007026228,2020-03-12 00:00:00+00:00,What Does the Coronavirus Do to the Body?,Here’s what scientists have learned about how ...
157749,157749,bnl-newyorktimes528-20200529-100000007160839,2020-05-29 00:00:00+00:00,Playing by the Rules: Dutch Leader Offers a So...,At a time when populists are overturning socia...


In [8]:
headlines = data.headline.tolist()

In [10]:
print(f'There are {len(headlines)} headlines.')

There are 157678 headlines.


In [13]:
headlines[0]

'Questioning CPR as a Default Response'

In [14]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('distilbert-base-nli-mean-tokens')
embeddings = model.encode(headlines, show_progress_bar=True)

Batches:   0%|          | 0/4928 [00:00<?, ?it/s]

In [15]:
import umap
umap_embeddings = umap.UMAP(n_neighbors=15, 
                            n_components=5, 
                            metric='cosine').fit_transform(embeddings)

In [16]:
import hdbscan
cluster = hdbscan.HDBSCAN(min_cluster_size=15,
                          metric='euclidean',                      
                          cluster_selection_method='eom').fit(umap_embeddings)

In [19]:
docs_df = pd.DataFrame(headlines, columns=["Doc"])
docs_df['Topic'] = cluster.labels_
docs_df['Doc_ID'] = range(len(docs_df))
docs_per_topic = docs_df.groupby(['Topic'], as_index = False).agg({'Doc': ' '.join})

In [21]:
docs_per_topic

Unnamed: 0,Topic,Doc
0,-1,Questioning CPR as a Default Response ‘Taking ...
1,0,Crossword Crossword Crossword Crossword Crossw...
2,1,Quote of the Day Quote of the Day Quote of the...
3,2,Your Thursday Briefing Your Thursday Briefing ...
4,3,Contributors Contributors Contributors CONTRIB...
...,...,...
625,624,"Citing security, Texas governor limits countie..."
626,625,"UN DEFEATED Two Classes, Unequal Disappearing ..."
627,626,Condé Nast to Limit the Use of NDAs Ban on Ass...
628,627,"For Hilary Mantel, There’s No Time Like the Pa..."


In [22]:
from sklearn.feature_extraction.text import CountVectorizer

def c_tf_idf(documents, m, ngram_range=(1, 1)):
    count = CountVectorizer(ngram_range=ngram_range, stop_words="english").fit(documents)
    t = count.transform(documents).toarray()
    w = t.sum(axis=1)
    tf = np.divide(t.T, w)
    sum_t = t.sum(axis=0)
    idf = np.log(np.divide(m, sum_t)).reshape(-1, 1)
    tf_idf = np.multiply(tf, idf)

    return tf_idf, count
  
tf_idf, count = c_tf_idf(docs_per_topic.Doc.values, m=len(data))

In [23]:
def extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=20):
    words = count.get_feature_names()
    labels = list(docs_per_topic.Topic)
    tf_idf_transposed = tf_idf.T
    indices = tf_idf_transposed.argsort()[:, -n:]
    top_n_words = {label: [(words[j], tf_idf_transposed[i][j]) for j in indices[i]][::-1] for i, label in enumerate(labels)}
    return top_n_words

def extract_topic_sizes(df):
    topic_sizes = (df.groupby(['Topic'])
                     .Doc
                     .count()
                     .reset_index()
                     .rename({"Topic": "Topic", "Doc": "Size"}, axis='columns')
                     .sort_values("Size", ascending=False))
    return topic_sizes

top_n_words = extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=20)
topic_sizes = extract_topic_sizes(docs_df); topic_sizes.head(10)

Unnamed: 0,Topic,Size
0,-1,83522
612,611,4396
315,314,3637
568,567,2893
170,169,2660
177,176,2634
628,627,2309
557,556,1309
545,544,1140
464,463,1063


In [30]:
print(f'Topic sizes: {len(topic_sizes)}')
print(f'Top n words: {len(top_n_words)}')

Topic sizes: 630
Top n words: 630


In [24]:
top_n_words[611][:10]

[('trump', 0.48526907351595616),
 ('biden', 0.06457523900924203),
 ('says', 0.03086407005228113),
 ('election', 0.026731044584863164),
 ('campaign', 0.025171083849548435),
 ('president', 0.01951832166083292),
 ('virus', 0.018665354260638693),
 ('donald', 0.018071206131971964),
 ('republicans', 0.017973013070429214),
 ('court', 0.017913051675852096)]

In [25]:
top_n_words[314][:10]

[('million', 0.15543460155763014),
 ('000', 0.12306688549852283),
 ('billion', 0.10252409506396486),
 ('100', 0.04248442463507839),
 ('cases', 0.03836327682747241),
 ('trillion', 0.03279847589564103),
 ('millions', 0.03141715866489043),
 ('pay', 0.03134589737079452),
 ('coronavirus', 0.029698749127208865),
 ('virus', 0.027464525448558933)]

In [26]:
top_n_words[567][:10]

[('good', 0.0625891394040202),
 ('love', 0.0548250077601877),
 ('right', 0.04816663317955727),
 ('big', 0.04596380277270793),
 ('time', 0.0346619806031849),
 ('real', 0.032555224733677786),
 ('perfect', 0.02534380059934966),
 ('power', 0.023499638835091857),
 ('hope', 0.02235139533888178),
 ('great', 0.02164111839715918)]

In [28]:
top_n_words[176][:10]

[('dies', 0.3995326471110104),
 ('dead', 0.11086330233323792),
 ('90', 0.036863478136321993),
 ('88', 0.03651675244158001),
 ('91', 0.03628405469817675),
 ('84', 0.033675504675345636),
 ('92', 0.030893269112689033),
 ('81', 0.030353007295590168),
 ('86', 0.029165247260634276),
 ('83', 0.029134598663700884)]

In [31]:
from sklearn.metrics.pairwise import cosine_similarity
for i in range(20):
    # Calculate cosine similarity
    similarities = cosine_similarity(tf_idf.T)
    np.fill_diagonal(similarities, 0)

    # Extract label to merge into and from where
    topic_sizes = docs_df.groupby(['Topic']).count().sort_values("Doc", ascending=False).reset_index()
    topic_to_merge = topic_sizes.iloc[-1].Topic
    topic_to_merge_into = np.argmax(similarities[topic_to_merge + 1]) - 1

    # Adjust topics
    docs_df.loc[docs_df.Topic == topic_to_merge, "Topic"] = topic_to_merge_into
    old_topics = docs_df.sort_values("Topic").Topic.unique()
    map_topics = {old_topic: index - 1 for index, old_topic in enumerate(old_topics)}
    docs_df.Topic = docs_df.Topic.map(map_topics)
    docs_per_topic = docs_df.groupby(['Topic'], as_index = False).agg({'Doc': ' '.join})

    # Calculate new topic words
    m = len(data)
    tf_idf, count = c_tf_idf(docs_per_topic.Doc.values, m)
    top_n_words = extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=20)

topic_sizes = extract_topic_sizes(docs_df); topic_sizes.head(10)

Unnamed: 0,Topic,Size
0,-1,83537
592,591,4396
311,310,3637
548,547,2893
169,168,2660
176,175,2634
608,607,2309
537,536,1309
525,524,1140
448,447,1063


In [32]:
len(topic_sizes)

610

In [33]:
top_n_words[591][:10]

[('trump', 0.48526907351595616),
 ('biden', 0.06457523900924203),
 ('says', 0.03086407005228113),
 ('election', 0.026731044584863164),
 ('campaign', 0.025171083849548435),
 ('president', 0.01951832166083292),
 ('virus', 0.018665354260638693),
 ('donald', 0.018071206131971964),
 ('republicans', 0.017973013070429214),
 ('court', 0.017913051675852096)]

In [34]:
top_n_words[310][:10]

[('million', 0.15543460155763014),
 ('000', 0.12306688549852283),
 ('billion', 0.10252409506396486),
 ('100', 0.04248442463507839),
 ('cases', 0.03836327682747241),
 ('trillion', 0.03279847589564103),
 ('millions', 0.03141715866489043),
 ('pay', 0.03134589737079452),
 ('coronavirus', 0.029698749127208865),
 ('virus', 0.027464525448558933)]

In [35]:
top_n_words[547][:10]

[('good', 0.0625891394040202),
 ('love', 0.0548250077601877),
 ('right', 0.04816663317955727),
 ('big', 0.04596380277270793),
 ('time', 0.0346619806031849),
 ('real', 0.032555224733677786),
 ('perfect', 0.02534380059934966),
 ('power', 0.023499638835091857),
 ('hope', 0.02235139533888178),
 ('great', 0.02164111839715918)]

In [36]:
top_n_words[168][:10]

[('china', 0.5050655855140077),
 ('chinese', 0.14310070693761606),
 ('hong', 0.12579089890496906),
 ('kong', 0.12503083305055837),
 ('beijing', 0.07495979382782324),
 ('coronavirus', 0.05132064771140595),
 ('taiwan', 0.042010767429444394),
 ('virus', 0.03613170017110871),
 ('trade', 0.027113655625539704),
 ('new', 0.02526858319372172)]