In [1]:
import pandas as pd
from top2vec import Top2Vec

In [2]:
# Read the article dump and convert the `date` column to datetime in one chain.
df = (
    pd.read_csv('./Data/covid19_articles_20201231.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
)

# Summarise columns, dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 369047 entries, 0 to 369046
Data columns (total 7 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   author      181799 non-null  object        
 1   date        369047 non-null  datetime64[ns]
 2   domain      369047 non-null  object        
 3   title       368962 non-null  object        
 4   url         369047 non-null  object        
 5   content     369047 non-null  object        
 6   topic_area  369047 non-null  object        
dtypes: datetime64[ns](1), object(6)
memory usage: 19.7+ MB


In [33]:
# Remove rows whose `content` repeats that of another row, then renumber the rows from 0.
num_articles_pre_drop = len(df)
df = df.drop_duplicates(subset='content').reset_index(drop=True)

# Report how many rows were removed and how many remain.
num_articles_post_drop = len(df)
print('no. articles dropped: ', num_articles_pre_drop - num_articles_post_drop)
print('no. of articles: ', num_articles_post_drop)

no. articles dropped:  127
no of articles:  368920


In [30]:
# Preview the first three rows to sanity-check the loaded columns.
df.head(3)

Unnamed: 0,author,date,domain,title,url,content,topic_area
0,Thomas Hughes,2020-01-02,marketbeat,Three Industrial Giants You Should Own In 2020,https://www.marketbeat.com/originals/three-ind...,With the end of the year just around the corne...,business
1,Thomas Hughes,2020-01-03,marketbeat,Labor Stocks Are Going To Break Out In 2020,https://www.marketbeat.com/originals/labor-sto...,The labor markets were one of the most closely...,business
2,Steve Anderson,2020-01-03,marketbeat,"Tesla (TSLA) Breaks Shipment Record, Beats Est...",https://www.marketbeat.com/originals/teal-brea...,"It could be forgiven, that some might think th...",business


In [34]:
# Keep only the articles dated 2020-01-01 through 2020-03-31.
# A narrower, January-only window was also tried: between('2020-01-01', '2020-01-31')
in_window = df['date'].between('2020-01-01', '2020-03-31')
df_top = df[in_window]
print('no. of articles: ', len(df_top))

# Drop articles where the text is longer than 1,000,000 characters.
short_enough = df_top['content'].map(len) <= 1000000
# Renumber rows from 0: the model's document ids are later used as row labels via df_top.loc[doc_id].
df_top = df_top[short_enough].reset_index(drop=True)
print('no. of articles after dropping long articles: ', len(df_top))

no. of articles:  48778
no. of articles after dropping long articles:  48776


### top2vec parameters

https://top2vec.readthedocs.io/en/latest/api.html  
`class top2vec.Top2Vec.Top2Vec(documents, min_count=50, embedding_model='doc2vec', embedding_model_path=None, speed='learn', use_corpus_file=False, document_ids=None, keep_documents=True, workers=None, tokenizer=None, use_embedding_model_tokenizer=False, umap_args=None, hdbscan_args=None, verbose=True)`

- `speed`: `fast-learn` (fastest, lowest quality), `learn` (the default), or `deep-learn` (best quality, but takes significantly longer)
- `workers`: number of worker threads to use during training

Considerations
- Whether to use a pre-trained embedding model (via `embedding_model`) instead of training the default `doc2vec` model on this corpus

In [88]:
# Train Top2Vec on the article bodies with 8 worker threads; every other setting keeps its default.
# NOTE: in the recorded run the joint document/word embedding step alone took about three hours.
corpus = df_top['content'].tolist()
model = Top2Vec(corpus, workers=8)

2021-03-13 16:13:59,090 - top2vec - INFO - Pre-processing documents for training
2021-03-13 16:15:24,585 - top2vec - INFO - Creating joint document/word embedding
2021-03-13 19:19:32,456 - top2vec - INFO - Creating lower dimension embedding of documents
2021-03-13 19:20:19,468 - top2vec - INFO - Finding dense areas of documents
2021-03-13 19:20:21,938 - top2vec - INFO - Finding topics


In [89]:
# Save trained model
filepath = './Data/top2vec_model'
model.save(file = filepath)

In [8]:
# Load trained model
filepath = './Data/top2vec_model'
model = Top2Vec.load(filepath)

In [9]:
# Number of topics found
num_topics = model.get_num_topics()
num_topics

424

In [10]:
# Topic sizes - number of documents most similar to each topic
topic_sizes, topic_nums = model.get_topic_sizes()
print(sum(topic_sizes))
model.get_topic_sizes()

48776


(array([633, 550, 548, 484, 453, 452, 412, 407, 402, 393, 388, 387, 387,
        376, 374, 368, 359, 357, 356, 321, 317, 312, 308, 298, 296, 291,
        286, 285, 281, 278, 275, 273, 268, 266, 263, 258, 256, 245, 244,
        244, 241, 238, 235, 231, 231, 230, 229, 223, 222, 220, 219, 217,
        217, 214, 209, 209, 207, 207, 206, 204, 199, 198, 197, 197, 193,
        188, 187, 186, 183, 181, 179, 178, 176, 172, 171, 170, 169, 168,
        167, 166, 158, 157, 157, 156, 156, 156, 155, 152, 152, 150, 150,
        149, 148, 148, 147, 147, 146, 146, 146, 144, 144, 144, 144, 143,
        143, 142, 142, 142, 140, 139, 139, 138, 138, 135, 133, 133, 132,
        131, 130, 130, 130, 129, 128, 126, 126, 126, 125, 125, 125, 125,
        123, 123, 121, 120, 120, 119, 119, 119, 119, 119, 119, 118, 118,
        118, 118, 117, 117, 117, 116, 116, 116, 116, 116, 115, 115, 115,
        114, 113, 113, 112, 112, 112, 110, 109, 108, 106, 104, 103, 103,
        103, 103, 103, 103, 101, 101,  99,  98,  98

### Get topics
- `topic_words`: the top 50 words for each topic
- `word_scores`: cosine similarity scores of those words to the topic
- `topic_nums`: the index of each topic

In [11]:
# Get topics
topic_words, word_scores, topic_nums = model.get_topics(num_topics)
print(len(topic_words))
model.get_topics(num_topics)

424


(array([['airline', 'airlines', 'iata', ..., 'bookings', 'rucinski',
         'attendants'],
        ['duchess', 'royals', 'prince', ..., 'commonwealth', 'eugenie',
         'invictus'],
        ['opec', 'bpd', 'barrels', ..., 'moscow', 'oversupply', 'dmitry'],
        ...,
        ['arora', 'nigam', 'chart', ..., 'amd', 'nvidia', 'amzn'],
        ['plaintiff', 'nclh', 'norwegian', ..., 'lie', 'rosen', 'action'],
        ['finablr', 'travelex', 'nmc', ..., 'saumyadeb', 'attackers',
         'branches']], dtype='<U15'),
 array([[0.7667269 , 0.73207396, 0.6437674 , ..., 0.435865  , 0.43551034,
         0.43436494],
        [0.79871535, 0.79680306, 0.78776336, ..., 0.540224  , 0.5389048 ,
         0.5316409 ],
        [0.79281396, 0.7460102 , 0.7294325 , ..., 0.4632243 , 0.46132773,
         0.454877  ],
        ...,
        [0.8479747 , 0.7927296 , 0.59192365, ..., 0.31721187, 0.3144494 ,
         0.31423053],
        [0.6849759 , 0.6231576 , 0.60474974, ..., 0.29230383, 0.29055232,
    

### Search for similar topics by word

In [18]:
# Find the 5 topics most similar to the keyword "supply".
# Other keyword lists tried in this cell: ["strategy", "digitalisation"], ["digitalisation"], ["supply chain"]
topic_words, word_scores, topic_scores, topic_nums = model.search_topics(
    keywords=["supply"],
    num_topics=5,
)

# Print the matching topic numbers, their scores, and how many topics came back (one item per line).
print(topic_nums, topic_scores, len(topic_words), sep="\n")

# Show the words of each matching topic.
topic_words

[213 174 285 237 329]
[0.37212503 0.35659454 0.29771573 0.27559688 0.27439119]
5


[array(['generic', 'apis', 'drugmakers', 'pharmaceutical', 'ingredients',
        'drugs', 'medicines', 'generics', 'formulations', 'mylan', 'api',
        'drug', 'ingredient', 'drugmaker', 'teva', 'shortages',
        'manufacturers', 'medications', 'chloroquine', 'prescription',
        'dosage', 'orally', 'paracetamol', 'novartis', 'tablets', 'lupus',
        'kaletra', 'hahn', 'pharmaceuticals', 'antibiotics', 'fda',
        'medication', 'biosimilar', 'malaria', 'prescribing', 'phosphate',
        'gsk', 'dua', 'manufacturing', 'pfizer', 'malarial', 'pharma',
        'pharmacist', 'supply', 'antimalarial', 'rheumatoid', 'raw',
        'india', 'actemra', 'supplier'], dtype='<U15'),
 array(['respirator', 'respirators', 'surgical', 'gowns', 'shields',
        'supplies', 'shortage', 'protective', 'filter', 'ffp', 'mask',
        'goggles', 'shortages', 'ppe', 'stockpile', 'facemasks', 'gear',
        'visors', 'gloves', 'manufacturers', 'masks', 'reuse', 'scrubs',
        'homemade

### Search for documents by topic
- `documents`: the documents in this topic
- `document_scores`: semantic similarity of each document to the topic
- `document_ids`: unique ids of the documents. If ids were not given, the index of the document in the original corpus.

In [19]:
# Get topics
topic_words, word_scores, topic_nums = model.get_topics(num_topics)
print(len(topic_words))
print(topic_words[22])

424
['employed' 'sunak' 'rishi' 'claimants' 'backdated' 'exchequer' 'hmrc'
 'chancellor' 'dwp' 'ssp' 'universal' 'scheme' 'incomes' 'freelancers'
 'pensions' 'employer' 'wages' 'penalised' 'abolished' 'jobseeker' 'wage'
 'casuals' 'sickness' 'claimant' 'pay' 'hardship' 'employment' 'renters'
 'redundant' 'ineligible' 'mcdonnell' 'eligible' 'generous' 'unemployed'
 'newstart' 'self' 'pmqs' 'eligibility' 'mps' 'manifesto' 'thinktank'
 'tories' 'pensioners' 'furlough' 'freelance' 'tory' 'enrolment' 'claim'
 'retention' 'disability']


In [35]:
# Search specific topic number
documents, document_scores, document_ids = model.search_documents_by_topic(topic_num=22, num_docs=5)

for doc, score, doc_id in zip(documents, document_scores, document_ids):
    print(f"Document: {doc_id}, Score: {score}")
    print(f"Document title: {df_top.loc[doc_id]['title']}")
    print("-----------")
    print(doc)
    print("-----------")
    print()

Document: 36663, Score: 0.7675349116325378
Document title: Self employed people: What can you claim from the government amid coronavirus lockdown?
-----------
-----------

Document: 24887, Score: 0.7663061618804932
Document title: Universal Credit: What does coronavirus mean for Universal Credit?
-----------
 Prime Minister Boris Johnson has advised UK citizens to work from home where possible due to the coronavirus outbreak, but what about those who are unable to do this? Many have found themselves out of work, or forced to self-isolate, leaving rent and bills impossible to pay. So, what does coronavirus mean for Universal Credit? Universal Credit is paid monthly to those who need it to help with living costs. If you are on a low income or unable to work, you may be entitled to Universal Credit. In light of the changes to our day-to-day lives due to coronavirus, Citizens Advice has announced it is calling for measures to ensure that vulnerable people and low-income families have enoug

### Semantic Search Documents by Keywords
Search documents for content semantically similar to given words

In [40]:
# Fetch the 5 documents the model returns for the keywords "digitalization" and "strategy".
documents, document_scores, document_ids = model.search_documents_by_keywords(
    keywords=["digitalization", "strategy"], num_docs=5
)

divider = "-----------"
for text, similarity, doc_id in zip(documents, document_scores, document_ids):
    # One report per document: id and score, then the text framed by divider lines, then a blank line.
    print(
        f"Document: {doc_id}, Score: {similarity}",
        divider,
        text,
        divider,
        "",
        sep="\n",
    )

Document: 46778, Score: 0.5461277365684509
-----------
HONG KONG, Mar 31, 2020 - (ACN Newswire) -  - China Communications Services Corporation Limited ("China Comservice" or the "Company"), and its subsidiaries (the "Group") (HKSE code: 552), today announced its audited annual results for the year ended 31 December 2019. HIGHLIGHTS: - Driven by the businesses related to digital services in the domestic non-telecom operator market, the OPEX business and the industry-oriented smart application businesses in the domestic telecommunications operator market, the Group's overall results maintained steady growth, with business value further manifested - Domestic non-operator market became the largest customer group for the first time, the development quality of such market was enhanced and became increasingly significant to the profit growth of the Group - The Group built up its strength in 5G technologies and capabilities while proactively satisfying 5G network construction demand of the dom

### Similar word search

In [41]:
# List the 20 words the model considers most similar to "stock", each with its score.
words, word_scores = model.similar_words(
    keywords=["stock"],
    keywords_neg=[],
    num_words=20,
)
for term, similarity in zip(words, word_scores):
    print(f"{term} {similarity}")

shares 0.7457133546083534
price 0.6392959463913955
trading 0.6349838424953756
outstanding 0.590467215322046
purchase 0.5795192426850122
share 0.5589785884506661
capitalization 0.5511721735341057
issuance 0.5359981618966084
equity 0.5346677554616316
common 0.5313826903701732
stocks 0.516772261299242
market 0.5161071533407926
investors 0.5134534870347027
repurchase 0.5098550224374023
traded 0.5086884339643466
reverse 0.5070340076614549
below 0.5067046325108504
selling 0.5000769239149925
sell 0.49458048772107904
preferred 0.49250113439350157


In [43]:
# Inspect the full metadata row for one document, looked up by row label.
# NOTE(review): presumably 31096 is a document id returned by one of the searches above — it does not
# appear in the visible output, so confirm where it came from.
df_top.loc[31096]

author                                                      NaN
date                                        2020-03-20 00:00:00
domain                                           marketscreener
title                                 Dürr : Annual Report 2019
url           https://www.marketscreener.com/DURR-436002/new...
content       DRIVING CHANGE ANNUAL REPORT 2019 CONTENTS 2 T...
topic_area                                             business
Name: 31096, dtype: object