In [1]:
import pandas as pd
import os
import collections
import csv
import logging
import numpy as np
import datetime as datetime
import types
import pickle

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from top2vec import Top2Vec

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [2]:
!which jupyter

/home/ubuntu/thesis_env2/bin/jupyter


In [3]:
df = pd.read_pickle('./data/df_processed_bigrams.pickle')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 365200 entries, 0 to 369046
Data columns (total 8 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   author             181507 non-null  object        
 1   date               365200 non-null  datetime64[ns]
 2   domain             365200 non-null  object        
 3   title              365115 non-null  object        
 4   url                365200 non-null  object        
 5   content            365200 non-null  object        
 6   topic_area         365200 non-null  object        
 7   content_processed  365200 non-null  object        
dtypes: datetime64[ns](1), object(7)
memory usage: 25.1+ MB


In [5]:
# Show the first and last article side by side as a quick sanity check.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4
# and removed in pandas 2.0.
pd.concat([df.head(1), df.tail(1)])

Unnamed: 0,author,date,domain,title,url,content,topic_area,content_processed
0,Thomas Hughes,2020-01-02,marketbeat,Three Industrial Giants You Should Own In 2020,https://www.marketbeat.com/originals/three-ind...,With the end of the year just around the corne...,business,end year corner past time think positioning fo...
369046,,2020-12-31,marketscreener,FTSE 100 wraps up worst year since 2008 financ...,https://www.marketscreener.com/quote/index/FTS...,"The FTSE 100 lost 1.5%, with consumer stocks, ...",business,ftse lost consumer stocks mainly unilever diag...


In [6]:
# Note to do - need to add time element

def log_newline(self, how_many_lines=1):
    """Emit blank lines through the logger's first handler.

    Written to be bound to a logger instance that carries two extra
    attributes: ``blank_formatter`` (used while the blank lines are written)
    and ``default_formatter`` (restored afterwards).

    Parameters
    ----------
    self : logging.Logger
        Logger whose first handler receives the blank lines.
    how_many_lines : int, default 1
        Number of blank records to emit.

    Notes
    -----
    If the logger has no handlers the call is a no-op; previously this case
    raised ``AttributeError`` because ``setFormatter`` was called on ``None``.
    """
    if not self.handlers:
        return

    file_handler = self.handlers[0]

    # Switch formatter, output the blank lines
    file_handler.setFormatter(self.blank_formatter)
    try:
        for _ in range(how_many_lines):
            self.info('')
    finally:
        # Switch back even if emitting a record failed, so later messages
        # are not written with the blank formatter.
        file_handler.setFormatter(self.default_formatter)

def logger_w2v():
    """Create (or fetch) the 'word2vec' logger that writes to ./data/word2vec.log.

    The returned logger is set to DEBUG level and is extended with:

    * ``default_formatter`` - the timestamped formatter used for normal records
    * ``blank_formatter``   - a formatter created with an empty format string
    * ``newline(how_many_lines=1)`` - ``log_newline`` bound to the logger

    Returns
    -------
    logging.Logger
        The configured 'word2vec' logger.

    Notes
    -----
    ``logging.getLogger`` returns the same object for the same name, so
    re-running the calling cell used to attach an additional ``FileHandler``
    each time and every message was then written once per handler. The file
    handler is now attached only if one for this log file is not already there.
    """
    log_file = os.path.join('./data', 'word2vec.log')
    print('log file location: ', log_file)

    log_format = '%(asctime)s - %(levelname)s - [%(module)s]\t%(message)s'
    formatter = logging.Formatter(fmt=log_format)

    logger = logging.getLogger('word2vec')
    logger.setLevel(logging.DEBUG)

    # FileHandler stores the absolute path of its target in baseFilename.
    target_path = os.path.abspath(log_file)
    existing = [
        handler for handler in logger.handlers
        if isinstance(handler, logging.FileHandler) and handler.baseFilename == target_path
    ]
    if existing:
        # Reuse the handler from an earlier call; keep it in step with the
        # formatter object that is exposed as default_formatter below.
        existing[0].setFormatter(formatter)
    else:
        fhandler = logging.FileHandler(log_file)
        fhandler.setFormatter(formatter)
        logger.addHandler(fhandler)

    logger.default_formatter = formatter
    logger.blank_formatter = logging.Formatter(fmt="")
    logger.newline = types.MethodType(log_newline, logger)

    return logger
    

In [7]:
def tokenise_dataset(df):
    """Split each pre-processed article into a list of tokens.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column named 'content_processed'.

    Returns
    -------
    pandas.Series
        One list of tokens per row, obtained by splitting on a single space
        character (consecutive spaces therefore yield empty-string tokens).
    """
    processed_text = df['content_processed']
    return processed_text.str.split(" ")

# Top2Vec

In [None]:
# Train a new Top2Vec model (find_topics = True) or load a previously saved
# one from disk (find_topics = False). Training is a multi-hour job, so the
# default is to load.
find_topics = False
min_count = 1000 # ignore words with total frequency less than this
speed = 'deep-learn' # can try 'deep-learn' for possible better embeddings but will take longer
# started deep-learn at 8pm, still going at 2pm the next day

if find_topics:
    # import lemmatised data
    # NOTE(review): pickle.load executes arbitrary code from the file - only
    # use this with a pickle produced by this project.
    with open('data/data_lemmatized.pickle', 'rb') as f:
        data_lemmatized = pickle.load(f)
    
    # Each article is joined back into one space-separated string; the two
    # printed lengths should therefore be equal.
    data_lemmatized_str = [' '.join(article) for article in data_lemmatized]
    print(len(data_lemmatized))
    print(len(data_lemmatized_str))
    
    # Find topics
    # ~ 12.5 hours to run on lemmatised data
    #documents = df['content_processed'][:50000].values
    documents = data_lemmatized_str
    model = Top2Vec(documents, workers=4, min_count=min_count, speed=speed)
    # Note: the trained model is saved under the "_deep" file name, which is
    # not the file the else-branch below loads.
    model.save('top2vec_vocab_limit_deep.model')
else:
    #model = Top2Vec.load('top2vec.model')
    model = Top2Vec.load('top2vec_vocab_limit.model')
    #model = Top2Vec.load('top2vec_vocab_limit_deep.model')

# Quick summary of the model: length of model.topic_words and the shape of the
# word-vector matrix (_get_word_vectors is a private Top2Vec method).
print(len(model.topic_words))
print(model._get_word_vectors().shape)

In [None]:
print(model.topic_words[0], '\n')
#print(model.topic_words[1], '\n')
#print(model.topic_words[2], '\n')

### Get topic sizes

Number of documents most similar to each topic. Topics are in decreasing order of size.  
topic_sizes: The number of documents most similar to each topic.  
topic_nums: The unique index of every topic will be returned.  

In [None]:
# Pair every topic id with the number of documents assigned to it.
topic_sizes, topic_ids = model.get_topic_sizes()
size_rows = zip(topic_ids, topic_sizes)
df_topic_sizes = pd.DataFrame(
    data=size_rows,
    columns=['topic_id', 'num_docs'],
)

In [None]:
print(df_topic_sizes['num_docs'].sum())
df_topic_sizes

In [None]:
df_topic_sizes['num_docs'].plot()

### Get Topics
topic_words: For each topic the top 50 words are returned, in order of semantic similarity to topic.  
word_scores: For each topic the cosine similarity scores of the top 50 words to the topic are returned.  
topic_nums: The unique index of every topic will be returned.  

In [None]:
# One row per topic: id, size, top words and the words' similarity scores.
topic_words, word_scores, topic_ids = model.get_topics(model.get_num_topics())
topic_sizes, topic_ids = model.get_topic_sizes()
df_topics = pd.DataFrame(
    data=zip(topic_ids, topic_sizes, topic_words, word_scores),
    columns=['topic_id', 'topic_sizes', 'topic_words', 'word_scores'],
)

# add doc id's
# The ids are gathered in a list and assigned as a column in one step. The
# earlier chained form `df_topics['doc_ids'].at[i] = ...` writes to an
# intermediate Series and does not update the frame under pandas
# copy-on-write. The topic's own id is passed to the search rather than its
# row position.
doc_ids_per_topic = []
for topic_id, topic_size in zip(df_topics['topic_id'], df_topics['topic_sizes']):
    _, _, document_ids = model.search_documents_by_topic(
        topic_num=int(topic_id), num_docs=int(topic_size)
    )
    # document_ids are treated as positions in the corpus and translated to
    # df's index labels - assumes the model was trained on the rows of df in
    # the same order (TODO confirm for the lemmatised corpus).
    doc_ids_per_topic.append(df.index[document_ids].values)
df_topics['doc_ids'] = pd.Series(doc_ids_per_topic, index=df_topics.index, dtype=object)
print(df_topics['topic_sizes'].sum())

#df_topics.to_pickle("df_topics_top2vec_vocab_limit.pickle")

In [None]:
df_topics

### Search for topics that contain keywords
topic_words: For each topic the top 50 words are returned, in order of semantic similarity to topic.  
word_scores: For each topic the cosine similarity scores of the top 50 words to the topic are returned.  
topic_scores: For each topic the cosine similarity to the search keywords will be returned.  
topic_nums: The unique index of every topic will be returned.  

In [None]:
# Find the five topics closest to the search keywords and tabulate them.
keywords = ["supply_chain"]
#keywords = ["digital_transformation"]
topic_words, word_scores, topic_scores, topic_ids = model.search_topics(
    keywords=keywords,
    num_topics=5,
)
df_topic_kw = pd.DataFrame(
    data=zip(topic_ids, topic_words, word_scores, topic_scores),
    columns=['topic_id', 'topic_words', 'word_scores', 'topic_scores'],
)

In [None]:
df_topic_kw

In [None]:
df_topic_kw['topic_words'][4]

In [None]:
df_topic_kw['topic_words'][1]

### Search articles by topic

After finding the relevant topic number, you can then search for documents by that topic.  
documents: The documents in a list, the most similar are first.  
doc_scores: Semantic similarity of document to topic. The cosine similarity of the document and topic vector.  
doc_ids: Unique ids of documents. If ids were not given, the index of document in the original corpus.  

In [None]:
df_topics[300:330]

In [None]:
# Get all document scores - note: doc index is iloc
# Collect (document position, score) pairs for every document of every topic.
# Note: the document ids returned by the model are positional (iloc), not
# df index labels.
doc_id_score = []
# Sizes are read alongside the ids instead of filtering df_topics once per
# topic, and the list is extended in place rather than rebuilt with
# `list + list` on every iteration.
for topic_num, num_docs in zip(df_topics['topic_id'], df_topics['topic_sizes']):
    _, document_scores, document_ids = model.search_documents_by_topic(topic_num=topic_num, num_docs=num_docs)
    doc_id_score.extend(zip(document_ids, document_scores))

print(len(doc_id_score))

In [None]:
# Tabulate the (position, score) pairs, ordered and indexed by document position.
df_doc_id_score = (
    pd.DataFrame(doc_id_score, columns=['doc_idx', 'doc_score'])
    .sort_values(by='doc_idx')
    .set_index('doc_idx')
)
df_doc_id_score

In [None]:
# Fetch every document assigned to one topic, most similar first.
topic_num = 1
num_docs = df_topics.loc[df_topics['topic_id'] == topic_num, 'topic_sizes'].values[0]
documents, document_scores, document_ids = model.search_documents_by_topic(topic_num=topic_num, num_docs=num_docs)

# Take an explicit copy before adding the score column: assigning a column on
# the result of df.iloc[...] triggers pandas' SettingWithCopyWarning.
result_df = df.iloc[document_ids].copy()
result_df["document_scores"] = document_scores

result_df[:50]

# for index,row in result_df.iterrows():
#     print(f"Document: {index}, Score: {row.document_scores}")
#     print(f"Title: {row.title}")
#     print("-----------")
#     #print(row.content)
#     #print("-----------")

### Search articles by Keywords

In [None]:
# Retrieve the two documents most similar to the keyword combination.
documents, document_scores, document_ids = model.search_documents_by_keywords(keywords=["supply_chain", "disrupt"], num_docs=2)
# Explicit copy so that adding the score column does not operate on a slice
# of df (avoids SettingWithCopyWarning).
result_df = df.iloc[document_ids].copy()
result_df["document_scores"] = document_scores

# for index,row in result_df.iterrows():
#     print(f"Document: {index}, Score: {row.document_scores}")
#     print(f"Title: {row.title}")
#     print("-----------")
#     print(row.content)
#     print("-----------")
#     print()

### Find Similar Words

In [None]:
# Get words in vocab
vocab_length = len(model._get_word_vectors())
print(vocab_length)

vocab = []
for n in range(vocab_length):
    vocab.append(model._index2word(n))

In [None]:
[x for x in vocab if 'digital' in x]

In [None]:
# Report the vocabulary size, then list the 20 words the model considers most
# similar to the keyword, one "word score" pair per line.
print(f'vocabulary length: {len(model._get_word_vectors())}')

words_model, word_scores = model.similar_words(keywords=["supply_chain"], num_words=20)
for word, score in zip(words_model, word_scores):
    print(f"{word} {score}")

In [None]:
#model._words2word_vectors(['supply'])
model._get_word_vectors() # word embeddings
model._index2word(1)

## Label Topics

In [None]:
df_topics_labelled = df_topics.copy()
df_topics_labelled['topic_label'] = ''

In [None]:
df_topics_labelled.head(1)

In [None]:
#df_topics_labelled.loc[1457]['topic_label']

In [None]:
# Manually assigned labels for topics 1772 down to 1700.
# Keys are row labels of df_topics_labelled; values are the list of label
# strings stored in that row's 'topic_label' cell. Keeping the labels in one
# mapping replaces 73 copy-pasted "idx / topic_label / .at" stanzas, so a label
# can be added or corrected in a single place.
topic_labels_1700_1772 = {
    1772: ['company_specific', 'Aurinia_Pharmaceuticals'],
    1771: ['company_specific', 'Dare_Bioscience'],
    1770: ['gift_cards'],
    1769: ['stock_market_rally'],
    1768: ['asian_shares'],
    1767: ['investor_letter_released'],
    1766: ['k_pop', 'BTS', 'IPO'],
    1765: ['new_product_launches'],
    1764: ['company_specific', 'Kodak'],
    1763: ['company_specific', 'Blackbaud'],
    1762: ['us_stocks_down'],
    1761: ['company_specific', 'Victory_Square_Technologies'],
    1760: ['computer_games'],
    1759: ['elon_musk', 'covid_19_positive'],
    1758: ['mexico'],
    1757: ['general_industry_updates'],
    1756: ['company_specific', 'Co-Diagnostics'],
    1755: ['company_specific', 'Costco'],
    1754: ['delhi_court'],
    1753: ['france', 'stimulus'],
    1752: ['company_specific', 'samsung', 'leader_in_court'],
    1751: ['us_stocks_up'],
    1750: ['company_specific', 'TOMI Environmental Solutions', 'disinfectant'],
    1749: ['company_specific', 'Baudax Bio'],
    1748: ['company_specific', 'Gran Colombia'],
    1747: ['company_specific', 'Aytu BioScience'],
    1746: ['us_stocks_up'],
    1745: ['celebrities', 'rich'],
    1744: ['china', 'swine_flu'],
    1743: ['company_specific', 'Liminal BioSciences'],
    1742: ['company_specific', 'Microsoft', 'closing_retail_stores'],
    1741: ['company_specific', 'Mesoblast Limited'],
    1740: ['sri_lanka', 'elections'],
    1739: ['bryan_cranston', 'covid_19_positive'],
    1738: ['general_industry_updates'],
    1737: ['south_korea', 'elections'],
    1736: ['company_specific', 'Halo'],
    1735: ['dwayne_johnson', 'football'],
    1734: ['medical_trials'],
    1733: ['general_stock_updates'],
    1732: ['investor_letter_released'],
    1731: ['company_specific', 'Qiagen'],
    1730: ['company_specific', 'Rollins'],
    1729: ['general_stock_updates'],
    1728: ['football', 'newcastle'],
    1727: ['general_industry_updates'],
    1726: ['dividend_notices'],
    1725: ['ignore', 'foreign_language'],
    1724: ['company_specific', 'Tonix_Pharmaceuticals'],
    1723: ['company_specific', 'Tesla', 'Curevac'],
    1722: ['company_specific', 'Mateon_Therapeutics'],
    1721: ['company_specific', 'Sernova'],
    1720: ['company_specific', 'JDE_Peet', 'coffee'],
    1719: ['general_stock_updates'],
    1718: ['santa_barbara', 'covid_19'],
    1717: ['ignore', 'foreign_language'],
    1716: ['general_stock_updates'],
    1715: ['gamesbeat_summit'],
    1714: ['general_stock_updates'],
    1713: ['company_specific', 'Akoustis'],
    1712: ['company_specific', 'Pandora'],
    1711: ['covid_19'],
    1710: ['company_specific', 'Synairgen', 'stocks_soar', 'covid_19', 'effective_drug'],
    1709: ['nascar'],
    1708: ['general_trust_updates'],
    1707: ['us_stocks_down'],
    1706: ['stocks_up', 'eu_recovery_fund'],
    1705: ['ethanol_companies'],
    1704: ['company_specific', 'Corning_Incorporated'],
    1703: ['company_specific', 'GTX', 'SuperCom'],
    1702: ['italy', 'genoa_new_bridge'],
    1701: ['india', 'drug_approval', 'Itolizumab', 'covid_19'],
    1700: ['company_specific', 'Amazon', 'new_product_launches', 'home_services'],
}

# Write each label list into its topic's row (same order and same .at
# assignment as the original stanzas).
for idx, topic_label in topic_labels_1700_1772.items():
    df_topics_labelled.at[idx, 'topic_label'] = topic_label

In [None]:
# Manually assigned labels for topics 1699 down to 1600.
# Keys are row labels of df_topics_labelled; values are the list of label
# strings stored in that row's 'topic_label' cell. Keeping the labels in one
# mapping replaces 100 copy-pasted "idx / topic_label / .at" stanzas, so a
# label can be added or corrected in a single place.
topic_labels_1600_1699 = {
    1699: ['south_korea', 'mayor_dead'],
    1698: ['canada', 'healthcare_companies', 'company_takeovers'],
    1697: ['general_industry_updates'],
    1696: ['pharmaceutical_companies', 'disinfectants_approved', 'covid_19'],
    1695: ['ignore', 'foreign_language'],
    1694: ['coffee_companies', 'financial_irregularities'],
    1693: ['general_stock_updates'],
    1692: ['us_shares_down'],
    1691: ['company_specific', 'Immunomedics'],
    1690: ['us_shares_up', 'vaccine_hopes'],
    1689: ['company_specific', 'Catalyst_Biosciences'],
    1688: ['company_specific', 'Sinovac_Biotech'],
    1687: ['company_specific', 'EasyJet', 'director_disagreements'],
    1686: ['company_specific', 'Eli_Lilly', 'covid_19', 'treatments'],
    1685: ['company_specific', 'twitter', 'half_year_update'],
    1684: ['general_stock_updates'],
    1683: ['general_industry_updates'],
    1682: ['us_shares_up', 'vaccine_hopes'],
    1681: ['covid_19', 'outbreak', 'maine'],
    1680: ['company_specific', 'twitter', 'ceo'],
    1679: ['covid_19', 'human_seismic_action_reduced'],
    1678: ['company_specific', 'AMC', 'bondholder_deal'],
    1677: ['company_specific', 'Amgen'],
    1676: ['company_specific', 'NCL_Corporation'],
    1675: ['china', 'Kazakhstan', 'pneumonia'],
    1674: ['company_specific', 'RedHill_Biopharma'],
    1673: ['china', 'technology_companies', 'american_depositary_shares_offering'],
    1672: ['company_specific', 'Valneva'],
    1671: ['general_stock_updates_positive'],
    1670: ['company_specific', 'Lufthansa', 'government_bailout'],
    1669: ['company_specific', 'Citigroup', 'Revlon', 'mistaken_payment'],
    1668: ['company_specific', 'Era_Group', 'Nokia'],
    1667: ['melania_trump', 'white_house_rose_garden'],
    1666: ['general_stock_updates', 'gold_up'],
    1665: ['company_specific', 'Tata_Consultancy_Services'],
    1664: ['company_specific', 'Warner_Music', 'ipo'],
    1663: ['company_specific', 'Centogene'],
    1662: ['company_specific', 'Pfizer', 'BioNTech', 'vaccine_documents_hacked'],
    1661: ['company_specific', 'WWE'],
    1660: ['company_specific', 'amd', 'nvidia', 'microprocessor'],
    1659: ['global_tech_show_ces', 'online_only'],
    1658: ['kanye_west', 'presidential_bid'],
    1657: ['credit_cards', 'digital_payments'],
    1656: ['company_specific', 'air_asia', 'trading_halt', 'auditor_concerns'],
    1655: ['soccer_womens'],
    1654: ['general_stock_updates'],
    1653: ['company_specific', 'SMT_Scharf'],
    1652: ['ukraine', 'central_bank'],
    1651: ['italy', 'prime_minister', 'covid_19_positive'],
    1650: ['australia', 'journalists_evacuated_from_china'],
    1649: ['japan', 'covid_19', 'economic_concerns'],
    1648: ['company_specific', 'Majesco'],
    1647: ['company_specific', 'LG_Chem', 'chemical_leak'],
    1646: ['sharon_stone', 'sister_fighting_for_life', 'covid_19'],
    1645: ['who', 'us_withdrawal'],
    1644: ['general_britain_france_german_relations'],
    1643: ['us_retail', 'employee_bonus', 'employee_wage_increase'],
    1642: ['pharamceutical_companies', 'drug_trial_submissions'],
    1641: ['company_specific', 'Marathon_Gold'],
    1640: ['royalty_companies', 'general_updates'],
    1639: ['company_specific', 'Churchill_Downs_Incorporated'],
    1638: ['uganda', 'bobi_wine', 'new_political_party'],
    1637: ['company_specific', 'Regeneron_Pharmaceuticals'],
    1636: ['company_specific', 'Alliance_Data'],
    1635: ['covid_19', 'early_concerns_growing'],
    1634: ['canada', 'nova_scotia', 'mass_shooting'],
    1633: ['company_specific', 'Hydro_One'],
    1632: ['canadian_pharmaceutical_companies', 'organ_failure_drugs', 'covid_19_treatment'],
    1631: ['ruth_baber_ginsburg', 'cancer'],
    1630: ['white_house_covid_exposure'],
    1629: ['white_house_covid_exposure'],
    1628: ['company_specific', 'Shell'],
    1627: ['sport', 'EFL'],
    1626: ['india', 'kasmir', 'coronavirus_restrictions_politically_motivated'],
    1625: ['company_specific', 'Tauriga_Sciences'],
    1624: ['company_specific', 'airbnb', 'house_parties_banned'],
    1623: ['company_specific', 'T_Mobile'],
    1622: ['company_specific', 'Happiness_Biotech'],
    1621: ['us_shares_down'],
    1620: ['Alexandria_Ocasio-Cortez'],
    1619: ['company_specific', 'Accenture'],
    1618: ['cpi_updates'],
    1617: ['company_specific', 'Ascendis_Pharma'],
    1616: ['covid_19', 'holiday_spread_warnings'],
    1615: ['company_specific', 'CHF_Solutions'],
    1614: ['us_cares_act_funding'],
    1613: ['company_specific', 'Gilead_Sciences', 'covid_19', 'remdesivir_reduced_deaths'],
    1612: ['english_royalty'],
    1611: ['company_specific', 'Dynavax'],
    1610: ['auto_companies', 'general_updates'],
    1609: ['ignore', 'foreign_language'],
    1608: ['covid_19', 'vaccine_development'],
    1607: ['company_specific', 'Alpha_Bank'],
    1606: ['company_specific', 'GM', 'legal_fight_fiat_chrysler'],
    1605: ['company_specific', 'Universal_Display_Corporation'],
    1604: ['general_stock_updates'],
    1603: ['ignore', 'foreign_language'],
    1602: ['hackers_targeting_vaccine_developers'],
    1601: ['australia', 'ex_prime_minister_appointed_trade_minister'],
    1600: ['music', 'oasis'],
}

# Write each label list into its topic's row (same order and same .at
# assignment as the original stanzas).
for idx, topic_label in topic_labels_1600_1699.items():
    df_topics_labelled.at[idx, 'topic_label'] = topic_label

In [None]:
idx = 100
topic_label = ['russia', 'belarus', 'putin']
df_topics_labelled.at[idx, 'topic_label'] = topic_label
# Hand-assigned labels for topics 99 down to 0, keyed by the row label of
# df_topics_labelled. Insertion order matches the order in which the labels
# were originally written, so the assignments happen in the same sequence.
labels_for_topics_99_to_0 = {
    99: ['sport'],
    98: ['scotland', 'snp', 'independence'],
    97: ['banks', 'result_announcements'],
    96: ['canabis_companies', 'result_announcements'],
    95: ['sport', 'golf', 'tennis'],
    94: ['result_announcements'],
    93: ['mckinsey_updates', 'digitalisation', 'ai', 'agile', 'transformation'],
    92: ['banks'],
    91: ['technavio_updates'],
    90: ['sport', 'ncaa'],
    89: ['clothing_retailers', 'store_closures'],
    88: ['company_covid_updates'],
    87: ['covid_tests'],
    86: ['recipes'],
    85: ['us_election'],
    84: ['meat_plants'],
    83: ['result_announcements'],
    82: ['wildlife', 'extinction', 'conservation'],
    81: ['uk_covid_updates'],
    80: ['schools', 'covid', 'remote_learning'],
    79: ['general_stock_updates'],
    78: ['art'],
    77: ['retailers', 'bankruptcy'],
    76: ['gambling_companies'],
    75: ['uk_football'],
    74: ['airlines', 'redundancies'],
    73: ['donations'],
    72: ['football'],
    71: ['internet'],
    70: ['renewable_energy'],
    69: ['country_covid_strategies'],
    68: ['retailers', 'online_sales'],
    67: ['fintech'],
    66: ['etf'],
    65: ['cricket'],
    64: ['celebrities', 'marriage', 'children'],
    63: ['indian_shares'],
    62: ['baseball'],
    61: ['us', 'stimulus_funding'],
    60: ['crunchbase_updates'],
    59: ['schools', 'covid'],
    58: ['football', 'covid'],
    57: ['football', 'man_utd'],
    56: ['fast_food'],
    55: ['us', 'small_business', 'emergency_covid_funding'],
    54: ['covid', 'deaths', 'personal_tragedy'],
    53: ['brazil', 'covid'],
    52: ['company_annual_meetings'],
    51: ['tv_shows'],
    50: ['movie_delays'],
    49: ['resource_companies', 'result_announcements'],
    48: ['covid_symptons'],
    47: ['us_fed_interest_rate_decisions'],
    46: ['pharmaceutical_companies', 'cancer_treatments'],
    45: ['employees', 'wellbeing', 'remote_working'],
    44: ['us', 'covid', 'high_case_numbers'],
    43: ['us', 'retirement_planning'],
    42: ['car_racing'],
    41: ['general_stock_updates'],
    40: ['general_stock_updates'],
    39: ['supermarkets'],
    38: ['gold_companies'],
    37: ['airlines', 'boeing'],
    36: ['nfl'],
    35: ['covid', 'personal_connections'],
    34: ['airlines'],
    33: ['us_election', 'mail_voting'],
    32: ['dgap_updates'],
    31: ['pharmaceutical_companies', 'financial_updates'],
    30: ['climate_change', 'carbon_emissions'],
    29: ['us', 'covid_stimulus'],
    28: ['luxury_fashion_retailers', 'covid_stimulus'],
    27: ['strictly_come_dancing'],
    26: ['eu', 'covid_recovery_fund'],
    25: ['uk_royalty'],
    24: ['result_announcements'],
    23: ['result_announcements'],
    22: ['football', 'transfers'],
    21: ['gold_updates'],
    20: ['technavio_updates'],
    19: ['general_industry_updates'],
    18: ['india', 'covid_cases'],
    17: ['brexit'],
    16: ['us', 'unemployment'],
    15: ['cloud'],
    14: ['music'],
    13: ['australia', 'covid'],
    12: ['manufacturing'],
    11: ['holidays'],
    10: ['general_stock_updates'],
    9: ['mental_health'],
    8: ['mental_health'],
    7: ['uk', 'lockdowns'],
    6: ['uk', 'tv_shows'],
    5: ['us_shares'],
    4: ['hedge_funds'],
    3: ['covid', 'vaccines'],
    2: ['result_announcements'],
    1: ['nytimes_updates'],
    0: ['oil_price_updates'],
}

# .at writes the whole list into a single cell of the 'topic_label' column.
# The loop variables deliberately reuse the names idx / topic_label so the
# kernel ends up with the same globals as the unrolled version.
for idx, topic_label in labels_for_topics_99_to_0.items():
    df_topics_labelled.at[idx, 'topic_label'] = topic_label

In [None]:

# Hand-assigned labels for a scattered set of topics (supply chain, shares,
# airlines, cruises), keyed by the row label of df_topics_labelled and listed
# in the order the labels are applied.
labels_for_selected_topics = {
    635: ['smartphone', 'semiconductor', 'supply_chain', 'automaker'],
    820: ['clothing', 'manufacturing', 'supply_chain', 'exploitation'],
    859: ['pharma', 'shortage', 'supply_chain', 'manufacturing'],
    914: ['clothing', 'manufacturing', 'supply_chain', 'exploitation'],
    1132: ['ppe', 'ventilator', 'supply_chain', 'shortage'],
    1314: ['semiconductor', 'tech_company', 'china', 'supply_chain'],
    1457: ['cybersecurity', 'space', 'aluminum', 'supply_chain'],
    1481: ['shipping', 'logistics', 'supply_chain'],
    256: ['employees', 'amazon', 'warehouse', 'unsafe'],
    368: ['australian_shares'],
    242: ['amd', 'nvidia', 'microprocessor'],
    434: ['us_shares'],
    317: ['airlines', 'covid_19', 'safety_measures'],
    458: ['airlines', 'cost_cutting', 'reduced_flights'],
    573: ['airlines', 'flights_suspended'],
    551: ['cruises', 'passengers', 'covid_19'],
    732: ['general_stock_updates'],
    812: ['us_shares'],
    460: ['us_shares'],
}

# .at stores each list as one cell value; idx / topic_label keep their
# original names so the kernel globals match the unrolled statements.
for idx, topic_label in labels_for_selected_topics.items():
    df_topics_labelled.at[idx, 'topic_label'] = topic_label

In [None]:
# Hand-assigned labels for another batch of topics, keyed by the row label of
# df_topics_labelled and listed in the order the labels are applied.
labels_for_topics_103_to_200 = {
    134: ['employees', 'company_statements', 'covid_response'],
    112: ['general_stock_updates'],
    103: ['cruises'],
    200: ['currency_movements'],
    199: ['cycling'],
    198: ['uk', 'covid_testing'],
    197: ['annual_general_meeting'],
    196: ['asia', 'covid'],
    195: ['hurricanes'],
    194: ['covid', 'hospitality', 'open_close_rules'],
}

# .at stores each list as one cell value; idx / topic_label keep their
# original names so the kernel globals match the unrolled statements.
for idx, topic_label in labels_for_topics_103_to_200.items():
    df_topics_labelled.at[idx, 'topic_label'] = topic_label

In [None]:
# Inspect the documents the model returns for one topic.
topic_num = 193

# Ask for as many documents as the topic's recorded size in df_topics.
num_docs = df_topics.loc[df_topics['topic_id'] == topic_num, 'topic_sizes'].values[0]
print(num_docs)
documents, document_scores, document_ids = model.search_documents_by_topic(
    topic_num=topic_num,
    num_docs=num_docs,
)

# document_ids are applied positionally to df; scores are attached in the
# order returned, then rows scoring below 0.5 are dropped.
result_df = df.iloc[document_ids].copy()
result_df["document_scores"] = document_scores
result_df = result_df.loc[result_df["document_scores"] >= 0.5]
print(len(result_df))

# Show the first 50 remaining rows, restricted to the columns worth reading.
preview_columns = ['date', 'domain', 'title', 'content', 'topic_area', 'url', 'document_scores']
result_df[preview_columns].head(50)

In [228]:
# Look at rows 400-419 (by position) of the filtered topic results.
result_df[['date', 'domain', 'title', 'content', 'topic_area', 'url', 'document_scores']].iloc[400:420]

Unnamed: 0,date,domain,title,content,topic_area,url,document_scores


In [233]:
# Read one document from the topic results, selected by its index label:
# print the title, then display the full article text as the cell output.
idx = 312439
print(result_df.loc[idx, 'title'])
result_df.loc[idx, 'content']

Angela Rayner rebukes Boris Johnson over 'collapsing' Covid test system


'Angela Rayner has tackled Boris Johnson over a “collapsing” Covid test regime and a wider crisis in the care sector, using a prime minister’s questions where she stood in for Keir Starmer to accuse Johnson of refusing to take responsibility for his failings. In her first question, the Labour deputy leader, who was a care worker before becoming an MP, challenged Johnson to cite the average hourly wage for care staff, a request the prime minister ignored. Rayner also used the example of Starmer, who was forced to self-isolate after one of his children needed testing for Covid-19, to condemn the government over current delays many people face in getting a test. Rayner said, to laughter from other Labour MPs, that she had received a message “from a man called Keir”. She went on: “Keir wasn’t able to go to work today, and his children couldn’t go to school, because his family had to wait for their coronavirus test results, despite the prime minister’s promise of results within 24 hours. “K

In [None]:
#df_topics_labelled[df_topics_labelled['topic_label'] != ""]

In [236]:
# Print the size and the word list recorded for one topic.
topic_id = 197
selected_topic_rows = df_topics_labelled[df_topics_labelled['topic_id'] == topic_id]
print(selected_topic_rows['topic_sizes'])
print(selected_topic_rows['topic_words'].values)

197    419
Name: topic_sizes, dtype: int64
[array(['agm', 'euronext', 'egm', 'isin', 'duly', 'sa_publishe',
        'euroclear', 'resolution', 'convene', 'decree', 'meeting', 'proxy',
        'registrar', 'board_director', 'shareholder', 'explanatory', 'fax',
        'remuneration', 'postal', 'intermediary', 'auditor', 'helsinki',
        'authorise', 'finnish', 'circular', 'agenda', 'halfyear', 'marche',
        'appoint', 'email_address', 'tel', 'submit', 'paris', 'bulletin',
        'authorisation', 'groupe', 'valid', 'ordinance', 'spa', 'voting',
        'electronically', 'notice', 'ordinary', 'general', 'proxy_card',
        'stockholm', 'loreal', 'enquiry', 'disclaimer', 'omx'],
       dtype='<U15')                                                       ]


In [213]:
# Persist the topic table, including the hand-assigned labels, as a pickle.
labelled_topics_path = "./data/df_topics_labelled_vocab_limit"
df_topics_labelled.to_pickle(labelled_topics_path)

## Apply topic labels to document df

In [214]:
# Build df_with_topics: a copy of the document frame extended with the topic id,
# the hand-assigned topic label, the topic score and a low/high score rating,
# then save it to disk.
df_temp = df_topics_labelled.copy()
df_with_topics = df.copy()

# Placeholder values: any row that no topic's doc_ids mention keeps ''.
df_with_topics['topic_id'] = ''
df_with_topics['topic_label'] = ''
df_with_topics['topic_score'] = ''
col_num_id = df_with_topics.columns.get_loc('topic_id')
col_num_label = df_with_topics.columns.get_loc('topic_label')
col_num_score = df_with_topics.columns.get_loc('topic_score')

# NOTE(review): doc_ids are used here as index *labels* (.at), while the topic
# inspection cell in this notebook applies model document ids positionally
# (df.iloc). df's index is not contiguous, so confirm that doc_ids really hold
# index labels; a label missing from the index would make .at append a new row.
for topic_id, doc_ids, topic_label in zip(df_temp['topic_id'], df_temp['doc_ids'], df_temp['topic_label']):
    for idx in doc_ids:
        # .at writes a single cell, so a list-valued label is stored as-is.
        df_with_topics.at[idx, 'topic_id'] = topic_id
        df_with_topics.at[idx, 'topic_label'] = topic_label

# Scores are attached by position (.values drops the index).
# NOTE(review): presumably df_doc_id_score is in the same row order as df - confirm.
df_with_topics['topic_score'] = df_doc_id_score['doc_score'].values

# Rate every score in one vectorized call instead of calling np.where per row:
# scores below 0.5 are "low", everything else is "high".
df_with_topics['topic_score_rating'] = np.where(df_with_topics['topic_score'] < 0.5, "low", "high")

df_with_topics.to_pickle("./data/df_with_topics")

In [215]:
# Show the first and last row together as a quick sanity check.
# pd.concat replaces DataFrame.append, which is deprecated and was removed in pandas 2.0.
pd.concat([df_with_topics.head(1), df_with_topics.tail(1)])

Unnamed: 0,author,date,domain,title,url,content,topic_area,content_processed,topic_id,topic_label,topic_score,topic_score_rating
0,Thomas Hughes,2020-01-02,marketbeat,Three Industrial Giants You Should Own In 2020,https://www.marketbeat.com/originals/three-ind...,With the end of the year just around the corne...,business,end year corner past time think positioning fo...,970,,0.437342,low
369046,,2020-12-31,marketscreener,FTSE 100 wraps up worst year since 2008 financ...,https://www.marketscreener.com/quote/index/FTS...,"The FTSE 100 lost 1.5%, with consumer stocks, ...",business,ftse lost consumer stocks mainly unilever diag...,10,[general_stock_updates],0.588543,high


In [216]:
#df_with_topics[df_with_topics['topic_id'] != '']

In [217]:
# Keep only the documents whose topic_label is no longer the '' placeholder.
has_topic_label = df_with_topics['topic_label'] != ''
df_with_topics.loc[has_topic_label]

Unnamed: 0,author,date,domain,title,url,content,topic_area,content_processed,topic_id,topic_label,topic_score,topic_score_rating
4,Alden Wicker,2020-01-06,instyle,Red Carpet Sustainability After Coronavirus Sh...,https://www.instyle.com/fashion/red-carpet-cor...,When the coronavirus pandemic is over and life...,consumer,coronavirus pandemic life returns normal celeb...,28,"[luxury_fashion_retailers, covid_stimulus]",0.450103,low
18,By Sui-Lee Wee and Donald G. McNeil Jr.,2020-01-09,nytimes,China Identifies New Virus Causing Pneumoniali...,https://www.nytimes.com/2020/01/08/health/chin...,HONG KONG — Chinese researchers say they have ...,business,hong_kong chinese researchers identified new v...,1,[nytimes_updates],0.553961,high
24,By Amy Qin and Javier C. Hernández,2020-01-11,nytimes,China Reports First Death From New Virus,https://www.nytimes.com/2020/01/10/world/asia/...,HONG KONG — Chinese state media on Saturday re...,business,hong_kong chinese state media saturday reporte...,1,[nytimes_updates],0.523607,high
27,Thomas Hughes,2020-01-13,marketbeat,"Lululemon Growth Strategy Gains Traction, Guid...",https://www.marketbeat.com/originals/lululemon...,Lululemon (LULU) just raised its 4th quarter g...,business,lululemon lulu raised 4th quarter guidance sha...,89,"[clothing_retailers, store_closures]",0.551399,high
29,Chris Markoch,2020-01-13,marketbeat,How to Approach Bank Stocks as Earnings Season...,https://www.marketbeat.com/originals/how-to-ap...,Some of the nation’s largest banks such as JP ...,business,nation largest banks jp_morgan chase nysejpm b...,92,[banks],0.449021,low
...,...,...,...,...,...,...,...,...,...,...,...,...
369040,By Rachel Fakhry,2020-12-31,greenbiz,"If we want hydrogen to live up to its promise,...",https://www.greenbiz.com/article/if-we-want-hy...,This year saw a slew of global hydrogen announ...,environment,year saw slew global hydrogen announcements ea...,30,"[climate_change, carbon_emissions]",0.537654,high
369041,Gina Clarke,2020-12-31,thefintechtimes,Behind the Idea: Kidbrooke,https://thefintechtimes.com/behind-the-idea-ki...,After nearly 20 years of an international care...,finance,nearly years international career insurance ca...,67,[fintech],0.642278,high
369042,Polly Harrison,2020-12-31,thefintechtimes,A Human Touch Will Be a Competitive Edge After...,https://thefintechtimes.com/53867-2/,Niels Pedersen is a Chartered Accountant and S...,finance,niels pedersen chartered accountant senior lec...,67,[fintech],0.588367,high
369044,Polly Harrison,2020-12-31,thefintechtimes,"US Payments: Smart Pension, Episode Six, PAAY ...",https://thefintechtimes.com/us-payments-smart-...,"This December, The Fintech Times is asking ind...",finance,december fintech_times asking industry leaders...,67,[fintech],0.617454,high


In [219]:
# Display the word list stored for topic 1699.
df_topics_labelled.loc[df_topics_labelled['topic_id'] == 1699, 'topic_words'].values

array([array(['seoul', 'north_korean', 'kim', 'jongun', 'korean', 'korea',
              'harassment', 'south_korean', 'south_korea', 'sexual', 'allegedly',
              'moon', 'prosecutor', 'victim', 'arrest', 'rape', 'church',
              'allegation', 'assault', 'misconduct', 'police', 'gay', 'lgbtq',
              'abuse', 'funeral', 'convict', 'civil_right', 'maxwell',
              'suspicion', 'raid', 'corruption', 'metropolitan', 'probe',
              'incident', 'unnamed', 'sex', 'man', 'discrimination',
              'accusation', 'suspect', 'dead', 'activist', 'elect',
              'antigovernment', 'successor', 'allege', 'scandal', 'equality',
              'lee', 'anonymous'], dtype='<U15')                                 ],
      dtype=object)

In [220]:
# Display every column of one document, selected by its index label.
document_label = 334181
df_with_topics.loc[document_label]

author                                                Charlotte Manning
date                                                2020-10-24 00:00:00
domain                                                          express
title                 Gethin Jones: Naga Munchetty 'puts money' on p...
url                   https://www.express.co.uk/showbiz/tv-radio/135...
content                We will use your email address only for sendi...
topic_area                                                      general
content_processed     use email_address sending_newsletters privacy_...
topic_id                                                              6
topic_label                                              [uk, tv_shows]
topic_score                                                    0.741322
topic_score_rating                                                 high
Name: 334181, dtype: object