# Topic Modelling with Defined Topics

### Imports

In [1]:
import pandas as pd
import os
import collections
import csv
import logging
import numpy as np
import datetime as datetime
import types
import pickle
from tqdm import tqdm
from scipy import spatial
import matplotlib.pyplot as plt
from top2vec import Top2Vec

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

pd.set_option('display.max_columns', 100)

## Calculate Average Document Embedding
- Uses the word embeddings learned while training the Top2Vec "deep-learn" model

In [2]:
# Load the trained Top2Vec model; only its word embeddings are used below.
model = Top2Vec.load('./data/topics/top2vec_vocab_limit_deep.model')

# Embedding matrix: one row per vocabulary word.
vectors = model._get_word_vectors()
print(vectors.shape)

# Vocabulary size equals the number of embedding rows.
vocab_length = len(vectors)
print(vocab_length)

# vocab[n] is the word stored at row n of `vectors`.
vocab = [model._index2word(n) for n in range(vocab_length)]

def document_vector(doc, vocab):
    """
    Calculate the average embedding of the in-vocabulary words of a document.

    Each vocabulary word contributes at most once, however often it occurs
    in ``doc``, because rows are picked with a membership mask over ``vocab``.

    :param doc: whitespace-separated document text
    :param vocab: sequence of vocabulary words; position ``n`` is used as the
        row index into the module-level ``vectors`` array
    :return: mean of the selected rows of ``vectors``; a NaN-filled vector
        when no word of ``doc`` is in ``vocab``
    """
    # np.isin already ignores out-of-vocabulary tokens, so the former
    # per-token `word in vocab` scan (O(tokens x vocab)) is not needed.
    doc_idx = np.flatnonzero(np.isin(vocab, doc.split()))
    if doc_idx.size == 0:
        # Same value np.mean yields for an empty selection, without the
        # "Mean of empty slice" RuntimeWarning.
        return np.full(vectors.shape[1:], np.nan, dtype=vectors.dtype)
    return np.mean(vectors[doc_idx], axis=0)

# Set to True to rebuild the document embeddings instead of loading the cache.
calculate_doc_embeddings = False

if not calculate_doc_embeddings:
    # Default path: reuse the previously saved embeddings.
    df = pd.read_pickle('df_doc_embeddings.pickle')
    print(len(df))
else:
    # takes ~ 1.5 hours
    df = pd.read_pickle('./data/bigrams/df_processed_bigrams_top2vec_trg.pickle')

    # NOTE: pickle.load executes arbitrary code; only use with trusted files.
    with open('data/topics/data_lemmatized.pickle', 'rb') as f:
        data_lemmatized = pickle.load(f)

    # Each article is a list of lemmas; join it back into one string.
    data_lemmatized_str = list(map(' '.join, data_lemmatized))
    print(len(data_lemmatized))
    print(len(data_lemmatized_str))

    df['content_lemma'] = data_lemmatized_str
    df['doc_embedding'] = df['content_lemma'].apply(lambda text: document_vector(text, vocab))
    df.to_pickle('df_doc_embeddings.pickle')

(9453, 300)
9453
365200


In [3]:
# First and last row of the frame. DataFrame.append was removed in pandas 2.0,
# so the preview is built with pd.concat instead.
pd.concat([df.head(1), df.tail(1)])

Unnamed: 0,author,date,domain,title,url,content,topic_area,content_processed,content_lemma,doc_embedding,blockchain,digitization,machine_learne,cloud,iot,store_closure,delivery,redundancy,costcutte,flight,supply_chain,shutdown,outsourcing,workfromhome,diversification
0,Thomas Hughes,2020-01-02,marketbeat,Three Industrial Giants You Should Own In 2020,https://www.marketbeat.com/originals/three-ind...,With the end of the year just around the corne...,business,end year corner past time think positioning fo...,end year corner past time think positioning fo...,"[-0.10415985, 0.034574475, -0.018250903, 0.040...",0.64344,0.492892,0.579234,0.571812,0.521615,0.453935,0.221586,0.588197,0.209348,0.51657,0.274569,0.142759,0.503211,0.464359,0.674917
369046,,2020-12-31,marketscreener,FTSE 100 wraps up worst year since 2008 financ...,https://www.marketscreener.com/quote/index/FTS...,"The FTSE 100 lost 1.5%, with consumer stocks, ...",business,ftse lost consumer stocks mainly unilever diag...,ftse lose consumer stock mainly unilever briti...,"[-0.10471698, 0.026819536, -0.028612636, 0.039...",0.655437,0.516977,0.626987,0.581966,0.547284,0.446046,0.251786,0.564917,0.211311,0.494346,0.280798,0.140724,0.516566,0.492329,0.69103


## Define Topics and Calculate Average Topic Embeddings

In [4]:
def topic_vector(keyword, vocab, n_words):
    """
    Build a topic embedding from a keyword and its nearest vocabulary words.

    Asks the module-level ``model`` for the ``n_words`` words most similar to
    ``keyword``, prints the resulting word set, and averages the rows of the
    module-level ``vectors`` array that belong to those words.

    :param keyword: seed word passed to ``model.similar_words``
    :param vocab: sequence of vocabulary words, indexed like ``vectors``
    :param n_words: number of similar words to request
    :return: mean embedding of the keyword and its similar words
    """
    neighbours, _scores = model.similar_words(keywords=[keyword], num_words=n_words)
    # Keyword first, followed by its neighbours.
    topic_words = np.concatenate(([keyword], np.ravel(neighbours)))
    print(keyword, topic_words, '\n')
    in_topic = np.isin(vocab, topic_words)
    topic_rows = vectors[np.flatnonzero(in_topic)]
    return np.mean(topic_rows, axis=0)

def topic_vector_defined_words(keyword, vocab, vectors, words):
    """
    Build a topic embedding from an explicit list of words.

    Prints the topic label with its word list, then averages the rows of
    ``vectors`` whose vocabulary entry appears in ``words``. Words that are
    not in ``vocab`` do not contribute.

    :param keyword: topic label, used only in the printed output
    :param vocab: sequence of vocabulary words, indexed like ``vectors``
    :param vectors: embedding matrix with one row per vocabulary word
    :param words: words that define the topic
    :return: mean embedding of the words found in ``vocab``
    """
    print(keyword, words, '\n')
    membership = np.isin(vocab, words)
    selected_rows = vectors[np.flatnonzero(membership)]
    return np.mean(selected_rows, axis=0)

In [6]:
# Each topic embedding is the mean vector of a seed keyword plus its n most
# similar vocabulary words (topic_vector prints the words it used).

# Technology
blockchain = topic_vector("blockchain", vocab, 5)
digitization = topic_vector("digitization", vocab, 5)
machine_learne = topic_vector("machine_learne", vocab, 5)
cloud = topic_vector("cloud", vocab, 5)
#product_launched = topic_vector("product_launche", vocab, 10)
iot = topic_vector("iot", vocab, 5)

# Retail
store_closure = topic_vector("store_closure", vocab, 5)
delivery = topic_vector("delivery", vocab, 10)

# Airlines
redundancy = topic_vector("redundancy", vocab, 5)
costcutte = topic_vector("costcutte", vocab, 5)
flight = topic_vector("flight", vocab, 5)
# Element-wise sum of the three airline topic vectors.
airlines_costs = redundancy + costcutte + flight

# Other
supply_chain = topic_vector("supply_chain", vocab, 10)
shutdown = topic_vector("shutdown", vocab, 5)
outsourcing = topic_vector("outsourcing", vocab, 2)
workfromhome = topic_vector("workfromhome", vocab, 4)
diversification = topic_vector("diversification", vocab, 3)

# Healthcare / Pharmaceuticals / Security
# Hand-picked word lists. They get their own names so that `vaccines`,
# `drug_discovery` and `tele_health` only ever refer to embedding vectors.
# Words missing from `vocab` are ignored by topic_vector_defined_words.
vaccines_words = ['vaccine', 'inoculate', 'vaccinate', 'dose', 'vaccination', 'vaccinated', 'shot', 'immunization', 'jab']
drug_discovery_words = ['discover', 'biology', 'therapeutic', 'advancement', 'molecule', 'development']
tele_health_words = ['telehealth', 'telemedicine', 'teladoc', 'clinician', 'healthcare', 'behavioral', 'practitioner', 'physician', 'triage',
                     'clinic', 'medicare']
vaccines = topic_vector_defined_words('vaccines', vocab, vectors, vaccines_words)
drug_discovery = topic_vector_defined_words('drug_discovery', vocab, vectors, drug_discovery_words)
tele_health = topic_vector_defined_words('tele_health', vocab, vectors, tele_health_words)
cybersecurity = topic_vector("cybersecurity", vocab, 20)

blockchain ['blockchain' 'crypto' 'cryptocurrency' 'token' 'bitcoin' 'wallet'] 

digitization ['digitization' 'digitalization' 'digital' 'automation' 'agility'
 'efficiency'] 

machine_learne ['machine_learne' 'ai' 'algorithm' 'interface' 'dataset' 'augment'] 

cloud ['cloud' 'cloudbase' 'azure' 'cloud_compute' 'onpremise' 'enterprise'] 

iot ['iot' 'connectivity' 'solution' 'intelligent' 'smart' 'cloudbase'] 

store_closure ['store_closure' 'store' 'retailer' 'brickandmortar' 'retail'
 'foot_traffic'] 

delivery ['delivery' 'ordering' 'order' 'deliver' 'grocery' 'logistic' 'courier'
 'pickup' 'service' 'fulfill' 'expand'] 

redundancy ['redundancy' 'redundant' 'job_retention' 'furlough' 'furlough_scheme'
 'scheme'] 

costcutte ['costcutte' 'cutting' 'cut' 'drastic' 'curtail' 'layoff'] 

flight ['flight' 'fly' 'passenger' 'airline' 'airport' 'flying'] 

supply_chain ['supply_chain' 'supplychain' 'disruption' 'supplier' 'manufacturer'
 'disrupt' 'manufacturing' 'shortage' 'supply' 'glob

In [9]:
# List the 20 vocabulary words most similar to "telehealth" with their scores.
words_model, word_scores = model.similar_words(
    keywords=["telehealth"],
    num_words=20,
)
for neighbour, similarity in zip(words_model, word_scores):
    print(f"{neighbour} {similarity}")

telemedicine 0.87737070112534
teladoc 0.6849135402845238
clinician 0.5997958902959599
healthcare 0.5892989851988948
behavioral 0.5884770636534274
practitioner 0.5843353457398499
physician 0.5744802984248178
triage 0.5706131409238088
clinic 0.5465864319097209
medicare 0.5463951644336746
outpatient 0.5329209670421382
provider 0.5323178826332496
care 0.5317680338581093
prescription 0.5097243723162554
referral 0.4887712792659804
remote 0.48785862897003124
consultation 0.4850182010621896
payer 0.48497656092976105
medication 0.4823031674451979
doctor 0.4805943963483572


In [10]:
# Flags: set to True to recompute (and re-save) the distance columns.
calculate_doc_topic_distance = False
calculate_doc_topic_distance_addition = False

# Topic name -> topic embedding. Dict literals keep each name next to its
# vector (zipping two parallel lists silently truncates on a mismatch) and
# are defined whether or not the flags are set.
base_topics = {
    'blockchain': blockchain,
    'digitization': digitization,
    'machine_learne': machine_learne,
    'cloud': cloud,
    'iot': iot,
    'store_closure': store_closure,
    'delivery': delivery,
    'redundancy': redundancy,
    'costcutte': costcutte,
    'flight': flight,
    'supply_chain': supply_chain,
    'shutdown': shutdown,
    'outsourcing': outsourcing,
    'workfromhome': workfromhome,
    'diversification': diversification,
}
additional_topics = {
    'vaccines': vaccines,
    'drug_discovery': drug_discovery,
    'tele_health': tele_health,
    'cybersecurity': cybersecurity,
}


def _add_topic_distances(frame, topic_vectors):
    """
    Add one column per topic holding the cosine distance between each row's
    ``doc_embedding`` and that topic's vector.

    :param frame: DataFrame with a ``doc_embedding`` column (modified in place)
    :param topic_vectors: mapping of column name -> topic embedding
    :return: the same DataFrame, for convenience
    """
    for key, value in topic_vectors.items():
        frame[key] = frame['doc_embedding'].apply(spatial.distance.cosine, args=(value,))
    return frame


if calculate_doc_topic_distance:
    topics = base_topics
    df = _add_topic_distances(df, topics)
    # Overwrites the embedding cache with the distance columns added.
    df.to_pickle('df_doc_embeddings.pickle')

#------additional terms--------#
if calculate_doc_topic_distance_addition:
    topics = additional_topics
    df = pd.read_pickle('df_doc_embeddings.pickle')
    df = _add_topic_distances(df, topics)
    df.to_pickle('df_doc_embeddings_additional.pickle')
else:
    df = pd.read_pickle('df_doc_embeddings_additional.pickle')

In [11]:
# First and last row of the frame. DataFrame.append was removed in pandas 2.0,
# so the preview is built with pd.concat instead.
pd.concat([df.head(1), df.tail(1)])

Unnamed: 0,author,date,domain,title,url,content,topic_area,content_processed,content_lemma,doc_embedding,blockchain,digitization,machine_learne,cloud,iot,store_closure,delivery,redundancy,costcutte,flight,supply_chain,shutdown,outsourcing,workfromhome,diversification,vaccines,drug_discovery,tele_health,cybersecurity
0,Thomas Hughes,2020-01-02,marketbeat,Three Industrial Giants You Should Own In 2020,https://www.marketbeat.com/originals/three-ind...,With the end of the year just around the corne...,business,end year corner past time think positioning fo...,end year corner past time think positioning fo...,"[-0.10415985, 0.034574475, -0.018250903, 0.040...",0.64344,0.492892,0.579234,0.571812,0.521615,0.453935,0.221586,0.588197,0.209348,0.51657,0.274569,0.142759,0.503211,0.464359,0.674917,0.53174,0.461211,0.508833,0.430277
369046,,2020-12-31,marketscreener,FTSE 100 wraps up worst year since 2008 financ...,https://www.marketscreener.com/quote/index/FTS...,"The FTSE 100 lost 1.5%, with consumer stocks, ...",business,ftse lost consumer stocks mainly unilever diag...,ftse lose consumer stock mainly unilever briti...,"[-0.10471698, 0.026819536, -0.028612636, 0.039...",0.655437,0.516977,0.626987,0.581966,0.547284,0.446046,0.251786,0.564917,0.211311,0.494346,0.280798,0.140724,0.516566,0.492329,0.69103,0.516917,0.507778,0.541268,0.456159


In [25]:
# 20 rows with the smallest `cybersecurity` distance.
df[['date', 'domain', 'title', 'content', 'topic_area', 'cybersecurity']].sort_values('cybersecurity').head(20)

Unnamed: 0,date,domain,title,content,topic_area,cybersecurity
220209,2020-07-21,finance.yahoo,Two Chinese nationals charged by US with hacki...,The US has charged two alleged Chinese spies w...,business,0.255113
191132,2020-07-08,marketscreener,MobileIron : Offers Complete Mobile Phishing P...,MobileIron Threat Defense includes multi-vecto...,business,0.270976
209874,2020-07-16,marketscreener,Check Point Research and Zoom Collaborate to F...,"SAN CARLOS, Calif., July 16, 2020 (GLOBE NEW...",business,0.273719
220015,2020-07-21,finance.yahoo,DOJ charges alleged Chinese hackers for steali...,The Justice Department has announced an 11-cou...,business,0.27644
211637,2020-07-16,finance.yahoo,Check Point Research and Zoom Collaborate to F...,Check Point and Zoom identified an issue in Zo...,business,0.276546
34946,2020-03-24,computerweekly,Fake antivirus site promises coronavirus prote...,A fraudulent website that claims to offer a di...,automotive,0.280523
286703,2020-08-25,marketscreener,38 Japan firms' authentication data stolen ami...,"Sumitomo Forestry Co., Hitachi Chemical Co. an...",business,0.281357
341830,2020-11-10,morningstar,Check Point Software’s Cyber-security Predicti...,"SAN CARLOS, Calif., Nov. 10, 2020 (GLOBE NEW...",business,0.282639
257633,2020-08-04,marketscreener,Avoiding Dating Disasters: Check Point Researc...,"Check Point Research, the Threat Intelligence ...",business,0.282678
158620,2020-06-17,marketscreener,Check Point Software´s new rugged gateway secu...,"SAN CARLOS, Calif., June 17, 2020 (GLOBE NEW...",business,0.283105


In [27]:
# 20 rows with the smallest `tele_health` distance.
df[['date', 'domain', 'title', 'content', 'topic_area', 'tele_health']].sort_values('tele_health').head(20)

Unnamed: 0,date,domain,title,content,topic_area,tele_health
144748,2020-06-05,sciencedaily,Hospitalized COVID-19 patients with diabetes r...,Hospitalized patients with COVID-19 and diabet...,science,0.293665
47126,2020-03-31,wired,"In Crowded Hospitals, Who Will Get Life-Saving...",As health care workers prepare for surges of C...,tech,0.29441
360684,2020-12-14,marketscreener,CVS Health : Aetna helps members access COVID-...,"Aetna, a CVS Health company, announced that it...",business,0.301162
236275,2020-07-27,marketscreener,Sberbank of Russia : 10 regions of Russia join...,"SberHealth, a Sberbank ecosystem member, has i...",business,0.305512
26161,2020-03-18,finsmes,PursueCare Raises Series A Funding,"PursueCare, a Middletown, Connecticut-based te...",finance,0.308293
126751,2020-05-22,finance.yahoo,"Yahoo Finance Breakouts: Zachariah Reitano, CE...",Yahoo Finance's Emily McCormick interviews Zac...,business,0.309317
14801,2020-03-09,wired,Worried About Covid-19? Hospitals Have a Reque...,More US health care facilities may soon need t...,tech,0.312226
183122,2020-07-01,finance.yahoo,InnovaCare Health Shows Value of Highly Coordi...,– MMM Healthcare (MMM) has offered coordinated...,business,0.314801
54528,2020-04-03,med-technews,AI-solutions player to help monitor COVID-19 p...,Radiology solutions provider Qure.ai will be w...,healthcare,0.315793
255495,2020-08-03,finance.yahoo,NextGen Virtual Visits™ Keeps Virginia Cardiov...,Largest Private Cardiology Practice in Virgini...,business,0.318546


In [195]:
# Processed text of the document with index label 90631.
df.loc[90631, 'content_processed']

'malaysia month imposed restrictions movement midmay include shutting businesses essential services including palm industry allowed operate reduced staffing world secondlargest producer vegetable oil company said tuesday disruptions plantations mill operations affect crude palm_oil cpo supplies restrictions logistic services impact movement commodity inventory levels impact outbreak determined fgv affected mainly effects restriction movement order imposed malaysian authorities curtailed workforce strength chief_executive officer haris fadzilah hassan said downstream business expecting reduction processing volume especially export bulk product segments added fgv said demand supported restocking china world secondlargest_economy recovers twomonthlong lockdown contain pandemic malaysia faces competition indonesia lower price commodity company estimated cpo prices trade ringgit ringgit tonne year compared palm closing price ringgit tonne malaysia benchmark cpo contract plunged turn year ri

In [189]:
# Rows whose lemmatized text contains the substring "blockchain".
df.loc[df['content_lemma'].str.contains("blockchain")]

Unnamed: 0,author,date,domain,title,url,content,topic_area,content_processed,content_lemma,doc_embedding,blockchain
16,Cathy Siegner and Roberto Torres,2020-01-08,ciodive,IBM-powered blockchain app tracks coffee bean ...,https://www.ciodive.com/news/ibm-powered-block...,A complex business environment featuring multi...,tech,complex business environment featuring multipl...,complex business environment feature multiple ...,"[-0.09007535, 0.04495359, -0.005620926, 0.0519...",0.657664
314,https://www.businessoffashion.com/articles/aut...,2020-01-23,businessoffashion,Why Disney Merch Is Suddenly Big in China,https://www.businessoffashion.com/articles/pro...,"China Decoded Gucci, Adidas and Uniqlo are all...",consumer,china decoded gucci adidas uniqlo playing safe...,china decode gucci adida uniqlo play safe chin...,"[-0.09893813, 0.045885094, 0.0008244006, 0.055...",0.674965
2263,Arjun Kharpal@ArjunKharpal,2020-02-03,cnbc,Bitcoin (BTC) price logs best January performa...,https://www.cnbc.com/2020/02/03/bitcoin-btc-pr...,Bitcoin logged its best performance for the mo...,finance,bitcoin logged best performance month january ...,bitcoin log good performance month january dri...,"[-0.10567156, 0.035960473, -0.022340328, 0.056...",0.640958
3004,Charles Sellen,2020-02-06,theconversation,China: rise of a new philanthropic power,https://www.theconversation.com/china-rise-of-...,In response to the ongoing coronavirus emergen...,business,response ongoing coronavirus emergency january...,response ongoing coronavirus emergency january...,"[-0.09164851, 0.033936396, -0.014935591, 0.060...",0.680239
4425,,2020-02-12,cbinsights,This Week In Insurance Tech: Gojek Rolls Out O...,https://www.cbinsights.com/research/this-week-...,Insurance comparison marketplace The Zebra rai...,tech,insurance comparison marketplace zebra raises ...,insurance comparison marketplace zebra raise m...,"[-0.09843036, 0.04152214, -0.007953279, 0.0684...",0.679800
...,...,...,...,...,...,...,...,...,...,...,...
367529,,2020-12-28,marketscreener,Serba Dinamik : Why it's important to keep lea...,https://www.marketscreener.com/quote/stock/SER...,Cybercriminals are getting savvier by the minu...,business,cybercriminals getting savvier minute criminal...,cybercriminal get savvier minute criminal take...,"[-0.09739566, 0.04154338, -0.013244631, 0.0681...",0.672034
367987,,2020-12-29,marketscreener,Jack Henry & Associates : 5 Ways Payment Data ...,https://www.marketscreener.com/quote/stock/JAC...,The use of artificial intelligence (AI) and ma...,business,use artificial_intelligence ai machine_learnin...,use artificial_intelligence ai machine_learne ...,"[-0.089052565, 0.043748036, -0.008853784, 0.06...",0.661585
368054,,2020-12-29,marketscreener,Dollar pares losses as Senate's McConnell vagu...,https://www.marketscreener.com/news/latest/Dol...,U.S. Senate Majority Leader Mitch McConnell on...,business,yous senate_majority leader_mitch mcconnell tu...,yous senate_majority leader_mitch mcconnell tu...,"[-0.09574312, 0.025486026, -0.027803062, 0.035...",0.655304
368499,,2020-12-30,marketscreener,Victory Square Provides Corporate Update on Re...,https://www.marketscreener.com/quote/stock/VIC...,"VANCOUVER, British Columbia, Dec. 30, 2020 (...",business,vancouver_british columbia december globe_news...,vancouver_british columbia december globe_news...,"[-0.07872814, 0.031281438, -0.009432663, 0.056...",0.664835


## Per-word comparison method
- Mean cosine distance between each in-vocabulary word of a document and the topic embedding

In [32]:
# Per-word topic distance - takes ~2 hours per topic
# Run with the document_word_distance.py file

def document_word_comparison(doc, vocab, topic):
    """
    Mean cosine distance between a document's in-vocabulary words and a topic.

    Each vocabulary word found in ``doc`` is compared once with ``topic``
    (repeated occurrences do not add weight), and the distances are averaged.

    :param doc: whitespace-separated document text
    :param vocab: sequence of vocabulary words; position ``n`` is used as the
        row index into the module-level ``vectors`` array
    :param topic: topic embedding to compare against
    :return: mean cosine distance, or NaN when no word of ``doc`` is in ``vocab``
    """
    # np.isin already ignores out-of-vocabulary tokens, so the former
    # per-token `word in vocab` scan (O(tokens x vocab)) is not needed.
    doc_idx = np.flatnonzero(np.isin(vocab, doc.split()))
    if doc_idx.size == 0:
        # Same value np.mean([]) yields, without the RuntimeWarning.
        return np.nan
    cos_dist_doc = [spatial.distance.cosine(vec, topic) for vec in vectors[doc_idx]]
    return np.mean(cos_dist_doc)

# Topic name -> topic embedding, bound explicitly for each pass. Previously
# both passes looped over the global `topics`, which only exists if a
# flag-guarded branch of an earlier cell ran, and which held whichever topic
# set was assigned last.
base_topics = {
    'blockchain': blockchain,
    'digitization': digitization,
    'machine_learne': machine_learne,
    'cloud': cloud,
    'iot': iot,
    'store_closure': store_closure,
    'delivery': delivery,
    'redundancy': redundancy,
    'costcutte': costcutte,
    'flight': flight,
    'supply_chain': supply_chain,
    'shutdown': shutdown,
    'outsourcing': outsourcing,
    'workfromhome': workfromhome,
    'diversification': diversification,
}
additional_topics = {
    'vaccines': vaccines,
    'drug_discovery': drug_discovery,
    'tele_health': tele_health,
    'cybersecurity': cybersecurity,
}


def _add_word_distances(frame, topic_vectors):
    """
    Add a ``<topic>_word`` column per topic with the per-word mean cosine
    distance computed by ``document_word_comparison`` on ``content_lemma``.

    :param frame: DataFrame with a ``content_lemma`` column (modified in place)
    :param topic_vectors: mapping of topic name -> topic embedding
    :return: the same DataFrame, for convenience
    """
    tqdm.pandas()  # enables Series.progress_apply
    for key, value in topic_vectors.items():
        print(key + '_word')
        frame[key + '_word'] = frame['content_lemma'].progress_apply(document_word_comparison, args=(vocab, value,))
    return frame


# Pass 1: the 15 base topics, starting from the document-embedding frame.
calculate_doc_distance = False

if calculate_doc_distance:
    df = pd.read_pickle('df_doc_embeddings.pickle')
    df = _add_word_distances(df, base_topics)
    df.to_pickle('df_doc_embeddings_word.pickle')

#------additional terms--------#
# Pass 2: the 4 additional topics, added on top of the pass-1 result.
calculate_doc_distance_additional = False

if calculate_doc_distance_additional:
    df = pd.read_pickle('df_doc_embeddings_word.pickle')
    df = _add_word_distances(df, additional_topics)
    df.to_pickle('df_doc_embeddings_word_additional.pickle')
else:
    df = pd.read_pickle('df_doc_embeddings_word_additional.pickle')


In [33]:
# First and last row of the frame. DataFrame.append was removed in pandas 2.0,
# so the preview is built with pd.concat instead.
pd.concat([df.head(1), df.tail(1)])

Unnamed: 0,author,date,domain,title,url,content,topic_area,content_processed,content_lemma,doc_embedding,blockchain,digitization,machine_learne,cloud,iot,store_closure,delivery,redundancy,costcutte,flight,supply_chain,shutdown,outsourcing,workfromhome,diversification,blockchain_word,digitization_word,machine_learne_word,cloud_word,iot_word,store_closure_word,delivery_word,redundancy_word,costcutte_word,flight_word,supply_chain_word,shutdown_word,outsourcing_word,workfromhome_word,diversification_word,vaccines_word,drug_discovery_word,tele_health_word,cybersecurity_word
0,Thomas Hughes,2020-01-02,marketbeat,Three Industrial Giants You Should Own In 2020,https://www.marketbeat.com/originals/three-ind...,With the end of the year just around the corne...,business,end year corner past time think positioning fo...,end year corner past time think positioning fo...,"[-0.10415985, 0.034574475, -0.018250903, 0.040...",0.64344,0.492892,0.579234,0.571812,0.521615,0.453935,0.221586,0.588197,0.209348,0.51657,0.274569,0.142759,0.503211,0.464359,0.674917,0.765884,0.667034,0.723726,0.718853,0.685893,0.641455,0.488895,0.729611,0.48086,0.682581,0.523684,0.437137,0.67381,0.648299,0.786551,0.692542,0.646232,0.677501,0.625921
369046,,2020-12-31,marketscreener,FTSE 100 wraps up worst year since 2008 financ...,https://www.marketscreener.com/quote/index/FTS...,"The FTSE 100 lost 1.5%, with consumer stocks, ...",business,ftse lost consumer stocks mainly unilever diag...,ftse lose consumer stock mainly unilever briti...,"[-0.10471698, 0.026819536, -0.028612636, 0.039...",0.655437,0.516977,0.626987,0.581966,0.547284,0.446046,0.251786,0.564917,0.211311,0.494346,0.280798,0.140724,0.516566,0.492329,0.69103,0.778773,0.689875,0.760507,0.731602,0.709334,0.644334,0.519609,0.720655,0.493622,0.675345,0.538237,0.448302,0.689611,0.67405,0.801626,0.689836,0.683969,0.705471,0.650827


In [34]:
# 20 rows with the smallest per-word `cybersecurity_word` distance.
df[['date', 'domain', 'title', 'content', 'topic_area', 'cybersecurity_word']].sort_values('cybersecurity_word').head(20)

Unnamed: 0,date,domain,title,content,topic_area,cybersecurity_word
14054,2020-03-08,smart2zero,'Right First Time' PCB design - Producing in E...,At the Embedded World 2020 eeNews discussed wi...,tech,0.510094
287497,2020-08-26,iotworldtoday,Data Privacy Regulation: Moving Beyond Compliance,Laws such as the EU’s General Data Protection ...,tech,0.516668
92697,2020-04-30,bloomberg,"Waffle House Signals U.S. Reopening, But It Wo...",Luring customers out of their homes and into a...,general,0.516786
74473,2020-04-18,reuters,U.S. says concerned by threat of cyber attack ...,The United States is concerned by the threat o...,business,0.520525
80551,2020-04-23,scmp,"Coronavirus apps must keep Big Brother at bay,...",Europe’s data privacy watchdogs warned that vi...,general,0.520528
47128,2020-03-31,wired,Covid-19 Is Our 9/11. Who Will Be Our Rudy Giu...,The scale of the Covid-19 crisis is dwarfing t...,tech,0.522167
71053,2020-04-16,finance.yahoo,Quibi CEO Meg Whitman joins 'Influencers with ...,Quibi CEO Meg Whitman joins Yahoo Finance to d...,business,0.527371
24042,2020-03-17,computerweekly,Covid-19: NCSC issues secure remote working gu...,The National Cyber Security Centre (NCSC) has ...,automotive,0.527465
289897,2020-09-01,chatbotslife,What is the Lean Security Method?,The lean security method is a step by step gui...,ai,0.528178
341830,2020-11-10,morningstar,Check Point Software’s Cyber-security Predicti...,"SAN CARLOS, Calif., Nov. 10, 2020 (GLOBE NEW...",business,0.528269


In [31]:
# 20 rows with the smallest per-word `cloud_word` distance.
df[['date', 'domain', 'title', 'content', 'topic_area', 'cloud_word']].sort_values('cloud_word').head(20)

Unnamed: 0,date,domain,title,content,topic_area,cloud_word
269132,2020-08-13,finance.yahoo,COVID-19 Accelerates the Energy Transition: Ad...,COVID-19 Accelerates the Energy Transition: Ad...,business,0.586728
153796,2020-06-14,finance.yahoo,"Where to Invest $5,000 Right Now",With businesses reeling from COVID-19 and many...,business,0.588088
368050,2020-12-29,finsmes,Honeywell Acquires Sine Group,"Honeywell (NYSE: HON) acquired Sine Group, an ...",finance,0.588369
139136,2020-06-02,cnbc,"Pandemic is 'zeitgeist moment' for cloud, Micr...",(This story is for CNBC Pro subscribers only.)...,finance,0.591355
219850,2020-07-21,finance.yahoo,Software Growth Stocks To Buy: Earnings Report...,Software growth stocks feasted on cloud comput...,business,0.594346
223886,2020-07-22,finance.yahoo,Microsoft Revenue Surges Though Cloud Growth S...,Its quarterly sales rose 13% on sustained dema...,business,0.595296
94740,2020-05-02,scmp,Why Covid-19 will be an extinction event for m...,One of the most striking features of the globa...,general,0.596233
112742,2020-05-14,finance.yahoo,Zoom Video to Hire Hundreds of Engineers Amid ...,Zoom Video is dramatically expanding its softw...,business,0.600273
68515,2020-04-14,scmp,China’s online education drive to boost demand...,"PCs, tablets, 5G-powered live streams, and clo...",general,0.603606
240692,2020-07-28,marketscreener,3CLogic : Announces New Cloud Call Center Solu...,New 2.0 release embeds contact center solution...,business,0.604189


### Comparison of methods

In [35]:
# Index labels of the 100 closest documents for "blockchain" under each method:
# document-average distance (`blockchain`) vs per-word distance (`blockchain_word`).
index_doc = df[['blockchain', 'blockchain_word']].sort_values("blockchain").head(100).index
index_word = df[['blockchain', 'blockchain_word']].sort_values("blockchain_word").head(100).index
# Documents ranked in the top 100 by one method but not by the other.
print('in doc avg not word avg\n', set(index_doc) - set(index_word))
print('in word avg not doc avg\n', set(index_word) - set(index_doc))

in doc avg not word avg
 {197766, 147847, 302480, 269465, 171804, 222115, 187558, 367143, 290475, 366379, 121263, 94384, 367153, 196912, 176179, 267189, 242104, 1977, 315713, 194116, 364235, 211278, 193487, 9424, 191696, 105809, 168659, 368980, 77397, 207958, 354770, 172889, 367202, 345315, 189029, 221798, 167528, 359272, 126577, 260852, 335996}
in word avg not doc avg
 {350340, 85127, 208777, 71053, 17678, 29461, 269466, 114974, 187039, 238116, 184868, 97318, 144676, 137128, 226728, 26154, 195754, 277427, 64438, 19897, 152635, 160960, 362432, 132422, 162889, 168907, 345549, 177103, 363345, 135891, 294740, 130003, 134742, 159832, 14816, 68195, 320099, 261865, 200174, 143983, 54651}


In [36]:
# Index labels of the 100 closest documents for "supply_chain" under each method:
# document-average distance (`supply_chain`) vs per-word distance (`supply_chain_word`).
index_doc = df[['supply_chain', 'supply_chain_word']].sort_values("supply_chain").head(100).index
index_word = df[['supply_chain', 'supply_chain_word']].sort_values("supply_chain_word").head(100).index
# Documents ranked in the top 100 by one method but not by the other.
print('in doc avg not word avg\n', set(index_doc) - set(index_word))
print('in word avg not doc avg\n', set(index_word) - set(index_doc))

in doc avg not word avg
 {162304, 154630, 90631, 308744, 9228, 99864, 20014, 15927, 32831, 3654, 356942, 32852, 345185, 363618, 1128, 343147, 67181, 116848, 344178, 5750, 5751, 363131, 18053, 310922, 141965, 309903, 306839, 33433, 23716, 5801, 17577, 190637, 7855, 2745, 227007, 327879, 48329, 160464, 313044, 6875, 356064, 14581, 252673, 2817, 319237, 39703, 8992, 8481, 48418, 147232, 352551, 162088, 260393, 6462, 12106, 11596, 15701, 107869, 187745, 19823, 146809, 44410, 301435, 13178, 45449, 70026, 357258, 32164, 70566, 32177, 13238, 160194, 32195, 154050, 113094, 46536, 64975, 115164, 19420, 18406, 84457, 355306, 20468, 37366, 252922}
in word avg not doc avg
 {178690, 27143, 157207, 184864, 184866, 205346, 96819, 152635, 119373, 173133, 173648, 180823, 136281, 12897, 76901, 52841, 7794, 17522, 8829, 175235, 169121, 131241, 195754, 127658, 93869, 93871, 42159, 103094, 257210, 156368, 10456, 5344, 327904, 195312, 94960, 9978, 124667, 7422, 12033, 154881, 145669, 28427, 175894, 167193, 

In [41]:
# Raw text of the document with index label 27143.
df.loc[27143, 'content']

' - Vodafone, the world’s second largest mobile operator, said the coronavirus crisis was causing data traffic on its networks to surge, with demand already rising 50% in some markets. The British company, which announced a plan on Wednesday to maintain network service and provide capacity for critical government functions, said it expected data use to continue to increase. (Reporting by Paul Sandle)'