In [3]:
import pandas as pd
import os
import collections
import csv
import logging
import numpy as np
import datetime as datetime
import types
import pickle

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from top2vec import Top2Vec

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [4]:
!which jupyter

/home/ubuntu/thesis_env2/bin/jupyter


In [5]:
df = pd.read_pickle('./data/df_processed_bigrams.pickle')

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 365200 entries, 0 to 369046
Data columns (total 8 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   author             181507 non-null  object        
 1   date               365200 non-null  datetime64[ns]
 2   domain             365200 non-null  object        
 3   title              365115 non-null  object        
 4   url                365200 non-null  object        
 5   content            365200 non-null  object        
 6   topic_area         365200 non-null  object        
 7   content_processed  365200 non-null  object        
dtypes: datetime64[ns](1), object(7)
memory usage: 25.1+ MB


In [8]:
df.head(1).append(df.tail(1))

Unnamed: 0,author,date,domain,title,url,content,topic_area,content_processed
0,Thomas Hughes,2020-01-02,marketbeat,Three Industrial Giants You Should Own In 2020,https://www.marketbeat.com/originals/three-ind...,With the end of the year just around the corne...,business,end year corner past time think positioning fo...
369046,,2020-12-31,marketscreener,FTSE 100 wraps up worst year since 2008 financ...,https://www.marketscreener.com/quote/index/FTS...,"The FTSE 100 lost 1.5%, with consumer stocks, ...",business,ftse lost consumer stocks mainly unilever diag...


In [9]:
# Note to do - need to add time element

def log_newline(self, how_many_lines=1):
    file_handler = None
    if self.handlers:
        file_handler = self.handlers[0]

    # Switch formatter, output a blank line
    file_handler.setFormatter(self.blank_formatter)
    for i in range(how_many_lines):
        self.info('')

    # Switch back
    file_handler.setFormatter(self.default_formatter)

def logger_w2v():
    
    log_file = os.path.join('./data', 'word2vec.log')
    print('log file location: ', log_file)
    
    log_format= '%(asctime)s - %(levelname)s - [%(module)s]\t%(message)s'
    formatter = logging.Formatter(fmt=(log_format))
    
    fhandler = logging.FileHandler(log_file)
    fhandler.setFormatter(formatter)
    
    logger = logging.getLogger('word2vec')
    logger.setLevel(logging.DEBUG)
    logger.addHandler(fhandler)
    logger.default_formatter = formatter
    logger.blank_formatter = logging.Formatter(fmt="")
    logger.newline = types.MethodType(log_newline, logger)
    
    return logger
    

In [10]:
def tokenise_dataset(df):

    tokens = df['content_processed'].str.split(" ")

    return tokens

# Top2Vec

In [None]:
find_topics = True
min_count = 1000 # ignore words with total frequency less than this
speed = 'deep-learn' # can try 'deep-learn' for possible better embeddings but will take longer
# started deep-lear at 8pm, still going at 2pm the next day

if find_topics:
    # import lemmatised data
    with open('data/data_lemmatized.pickle', 'rb') as f:
        data_lemmatized = pickle.load(f)
    
    data_lemmatized_str = [' '.join(article) for article in data_lemmatized]
    print(len(data_lemmatized))
    print(len(data_lemmatized_str))
    
    # Find topics
    # ~ 12.5 hours to run on lemmatised data
    #documents = df['content_processed'][:50000].values
    documents = data_lemmatized_str
    model = Top2Vec(documents, workers=4, min_count=min_count, speed=speed)
    model.save('top2vec_vocab_limit_deep.model')
else:
    #model = Top2Vec.load('top2vec.model')
    model = Top2Vec.load('top2vec_vocab_limit.model')

print(len(model.topic_words))
print(model._get_word_vectors().shape)

365200
365200


2021-08-28 20:11:52,828 - top2vec - INFO - Pre-processing documents for training
2021-08-28 20:17:22,185 - top2vec - INFO - Creating joint document/word embedding


In [4]:
model.topic_words

array([['barrels_day', 'bpd', 'crude', ..., 'gallon', 'gregorio',
        'slash'],
       ['touching_face', 'hands_clean', 'sick', ..., 'workout', 'gov',
        'africanamerican'],
       ['nongaap', 'gaap', 'ebitdare', ..., 'audio_webcast',
        'study_identifie', 'cloudbase'],
       ...,
       ['gift', 'card', 'debit', ..., 'crossborder', 'curbside_pickup',
        'biometric'],
       ['dare', 'commercialize', 'nda', ..., 'gel', 'milestone', 'drug'],
       ['nda', 'tolerability', 'openlabel', ..., 'brent', 'toxicity',
        'treasury_yield']], dtype='<U15')

In [5]:
model.topic_words[0]

array(['barrels_day', 'bpd', 'crude', 'opec', 'glut', 'oil', 'barrel',
       'refiner', 'brent_crude', 'wti', 'eia', 'brent', 'crude_future',
       'gasoline', 'shale', 'crude_oil', 'refinery', 'output', 'libya',
       'exxon', 'aramco', 'refining', 'permian', 'producer', 'chevron',
       'petroleum', 'saudi', 'reuters_poll', 'importer', 'oil_ga',
       'upstream', 'rig', 'hydrocarbon', 'rout', 'taper', 'natural_ga',
       'gulf', 'petrochemical', 'oilfield', 'refine', 'oil_gas',
       'curtailment', 'iraq', 'royal_dutch', 'diesel', 'chesapeake',
       'lowest_level', 'gallon', 'gregorio', 'slash'], dtype='<U15')

In [6]:
model.topic_words[1]

array(['touching_face', 'hands_clean', 'sick', 'rate_dippe',
       'illness_cause', 'afterward', 'whitmer', 'sicken', 'air_setting',
       'yorker', 'held_outdoor', 'overwhelm', 'quarantined',
       'surfaces_seat', 'sinuses_common', 'seats_contact', 'sweat',
       'caring_sick', 'wipes_clean', 'cdc', 'disinfect_hard',
       'experts_warn', 'subway', 'birx', 'couch', 'hernandez', 'sidewalk',
       'breathe', 'screen_seat', 'hiring_rebounde', 'neighbor',
       'cellphone', 'countless', 'swath', 'caseload', 'girlfriend',
       'breathing', 'bryant', 'sitting_window', 'gov_andrew',
       'school_district', 'plane_window', 'evidence_widely', 'shout',
       'coauthor', 'epicenter', 'flu', 'workout', 'gov',
       'africanamerican'], dtype='<U15')

In [7]:
model.topic_words[2]

array(['nongaap', 'gaap', 'ebitdare', 'divestiture', 'teekay',
       'longlived_asset', 'item_', 'isg', 'chegg', 'variability',
       'ability_attract', 'sec_filing', 'passcode', 'nareit',
       'live_webcast', 'dialing', 'shortterm_phase', 'gross_margin',
       'affo', 'gotomarket', 'diluted', 'replay', 'technavio',
       'free_sample', 'extinguishment', 'section_entitle', 'reach_revise',
       'dell', 'a_securitie', 'actual_result', 'safe_harbor',
       'trends_driver', 'trailing_cagr', 'act_amended', 'offers_uptodate',
       'webcast', 'gartner', 'onpremise', 'dialin', 'netback',
       'usa_canada', 'highperformance', 'remain_unscathe',
       'periodic_report', 'comparability', 'gain_instant', 'reform_act',
       'audio_webcast', 'study_identifie', 'cloudbase'], dtype='<U15')

### Get topic sizes

Number of documents most similar to each topic. Topics are in decreasing order of size.  
topic_sizes: The number of documents most similar to each topic.  
topic_nums: The unique index of every topic will be returned.  

In [8]:
topic_sizes, topic_ids = model.get_topic_sizes()
df_topic_sizes = pd.DataFrame(data=zip(topic_ids, topic_sizes), columns=['topic_id', 'num_docs'])

In [9]:
df_topic_sizes

Unnamed: 0,topic_id,num_docs
0,0,3882
1,1,2972
2,2,2739
3,3,2512
4,4,2155
...,...,...
1768,1768,19
1769,1769,19
1770,1770,18
1771,1771,18


### Get Topics
topic_words: For each topic the top 50 words are returned, in order of semantic similarity to topic.  
word_scores: For each topic the cosine similarity scores of the top 50 words to the topic are returned.  
topic_nums: The unique index of every topic will be returned.  

In [125]:
topic_words, word_scores, topic_ids = model.get_topics(model.get_num_topics())
topic_sizes, topic_ids = model.get_topic_sizes()
df_topics = pd.DataFrame(data=zip(topic_ids, topic_sizes, topic_words, word_scores), columns=['topic_id', 'topic_sizes', 'topic_words', 'word_scores'])

# add doc id's
df_topics['doc_ids'] = ''
for topic_idx, topic_size in enumerate(df_topics['topic_sizes']):
    documents, document_scores, document_ids = model.search_documents_by_topic(topic_num=topic_idx, num_docs=topic_size)
    df_idx = df.iloc[document_ids].index.values
    df_topics['doc_ids'].at[topic_idx] = df_idx
    print(df_topics['topic_sizes'].sum())

In [126]:
df_topics

Unnamed: 0,topic_id,topic_sizes,topic_words,word_scores,doc_ids
0,0,3882,"[barrels_day, bpd, crude, opec, glut, oil, bar...","[0.7554733, 0.7397255, 0.7277012, 0.7076352, 0...","[48334, 48284, 294425, 159978, 20322, 292171, ..."
1,1,2972,"[touching_face, hands_clean, sick, rate_dippe,...","[0.31247112, 0.20629022, 0.20435627, 0.2030590...","[676, 30063, 20903, 18668, 21959, 21573, 3624,..."
2,2,2739,"[nongaap, gaap, ebitdare, divestiture, teekay,...","[0.26054233, 0.23778984, 0.22579505, 0.1927944...","[334982, 239075, 91056, 85241, 100487, 8321, 1..."
3,3,2512,"[vaccine, pfizer_biontech, pfizerbiontech, pfi...","[0.80440575, 0.7686322, 0.7623384, 0.7583906, ...","[236636, 225984, 225378, 225321, 347974, 22533..."
4,4,2155,"[hedge_fund, insider_monkey, hedgie, similarly...","[0.7718193, 0.57161814, 0.5356776, 0.51193756,...","[133035, 171698, 133512, 169086, 133335, 17170..."
...,...,...,...,...,...
1768,1768,19,"[safehaven, crude_future, greenback, japanese_...","[0.55273753, 0.49876994, 0.49868113, 0.4946041...","[183052, 183053, 182081, 181526, 181789, 18255..."
1769,1769,19,"[strategist, choppy, treasury_yield, selloff, ...","[0.44933143, 0.42233846, 0.41821185, 0.3859547...","[250960, 250028, 252600, 250770, 253406, 24993..."
1770,1770,18,"[gift, card, debit, wallet, credit_card, press...","[0.5080503, 0.49814865, 0.40992847, 0.384071, ...","[314138, 313298, 314772, 313117, 314512, 31364..."
1771,1771,18,"[dare, commercialize, nda, bioscience, investi...","[0.50853056, 0.4186489, 0.3713347, 0.3697553, ...","[158636, 159688, 154610, 155910, 316975, 31669..."


### Search for topics than contain keywords
topic_words: For each topic the top 50 words are returned, in order of semantic similarity to topic.  
word_scores: For each topic the cosine similarity scores of the top 50 words to the topic are returned.  
topic_scores: For each topic the cosine similarity to the search keywords will be returned.  
topic_nums: The unique index of every topic will be returned.  

In [29]:
keywords = ["supply_chain"]
#keywords = ["digital_transformation"]
topic_words, word_scores, topic_scores, topic_ids = model.search_topics(keywords=keywords, num_topics=5)
df_topic_kw = pd.DataFrame(data=zip(topic_ids, topic_words, word_scores, topic_scores), columns=['topic_id', 'topic_words', 'word_scores', 'topic_scores'])

In [30]:
df_topic_kw

Unnamed: 0,topic_id,topic_words,word_scores,topic_scores
0,859,"[generic, pharmaceutical, drug, novartis, phar...","[0.5529777, 0.41644132, 0.4071582, 0.37119797,...",0.332119
1,914,"[garment, bangladesh, boohoo, clothing, clothe...","[0.7322744, 0.50282276, 0.46981946, 0.46685526...",0.287537
2,93,"[tools_checklist, reinvent, transformation, ag...","[0.5508231, 0.49446157, 0.48091435, 0.477627, ...",0.252878
3,825,"[wto, multilateral, directorgeneral, bilateral...","[0.78679776, 0.5122057, 0.4868957, 0.46711993,...",0.246379
4,1132,"[respirator, fema, surgical_mask, protective_g...","[0.5727843, 0.45496583, 0.4493539, 0.43521297,...",0.242761


In [31]:
df_topic_kw['topic_words'][1]

array(['garment', 'bangladesh', 'boohoo', 'clothing', 'clothe', 'apparel',
       'textile', 'fashion', 'factory', 'cambodia', 'migrant_worker',
       'cotton', 'myanmar', 'bof', 'footwear', 'adida', 'designer',
       'gucci', 'levi', 'leather', 'burberry', 'exporter', 'supply_chain',
       'nepal', 'shoe', 'tshirt', 'retailer', 'minimum_wage', 'malaysian',
       'malaysia', 'gown', 'ngo', 'nike', 'exploitation', 'jc_penney',
       'casual', 'wage', 'jacket', 'vietnam', 'dress', 'fabric',
       'store_closure', 'informal', 'ethiopia', 'livelihood', 'worker',
       'precarious', 'remittance', 'leicester', 'shirt'], dtype='<U15')

### Search articles by topic

After finding the relevant topic number can then search by this  
documents: The documents in a list, the most similar are first.  
doc_scores: Semantic similarity of document to topic. The cosine similarity of the document and topic vector.  
doc_ids: Unique ids of documents. If ids were not given, the index of document in the original corpus.  

In [204]:
topic_num=256
documents, document_scores, document_ids = model.search_documents_by_topic(topic_num=topic_num, num_docs=50)
    
result_df = df.iloc[document_ids]
result_df["document_scores"] = document_scores

for index,row in result_df.iterrows():
    print(f"Document: {index}, Score: {row.document_scores}")
    print(f"Title: {row.title}")
    print("-----------")
    #print(row.content)
    #print("-----------")

Document: 50025, Score: 0.7760816812515259
Title: NY lawmakers, unions ask Bezos to close Amazon buildings hit by COVID-19
-----------
Document: 83523, Score: 0.775547444820404
Title: Amazon extends hourly wage increase for warehouse workers through May 16th
-----------
Document: 98475, Score: 0.7708190679550171
Title: An Amazon warehouse worker in New York has died of COVID-19
-----------
Document: 50028, Score: 0.7697114944458008
Title: Amazon workers protest at Michigan warehouse for COVID-19 protections
-----------
Document: 50733, Score: 0.7680749893188477
Title: Amazon warehouse workers protest near Detroit, days after NYC walkout
-----------
Document: 98472, Score: 0.7659231424331665
Title: Breaking: an Amazon warehouse worker in New York has died of COVID-19
-----------
Document: 69866, Score: 0.7608959078788757
Title: French ruling pushes Amazon to close its warehouses over COVID-19 health concerns
-----------
Document: 50034, Score: 0.7573422789573669
Title: NY lawmakers, uni

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result_df["document_scores"] = document_scores


### Search articles by Keywords

In [22]:
documents, document_scores, document_ids = model.search_documents_by_keywords(keywords=["supply_chain", "disrupt"], num_docs=2)
result_df = df.iloc[document_ids]
result_df["document_scores"] = document_scores

# for index,row in result_df.iterrows():
#     print(f"Document: {index}, Score: {row.document_scores}")
#     print(f"Title: {row.title}")
#     print("-----------")
#     print(row.content)
#     print("-----------")
#     print()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result_df["document_scores"] = document_scores


### Find Similar Words

In [18]:
# Get words in vocab
vocab_length = len(model._get_word_vectors())
print(vocab_length)

vocab = []
for n in range(vocab_length):
    vocab.append(model._index2word(n))

9453


In [19]:
[x for x in vocab if 'digital' in x]

['digital', 'digitally', 'digitalization']

In [21]:
print(f'vocabulary length: {len(model._get_word_vectors())}')

words_model, word_scores = model.similar_words(keywords=["supply_chain"], num_words=20)
for word, score in zip(words_model, word_scores):
    print(f"{word} {score}")

vocabulary length: 9453
supplychain 0.731733918028258
supplier 0.6976942387696019
disruption 0.6841810663906069
constraint 0.6488750537649525
manufacturer 0.6476106609892319
disrupt 0.6190746614570983
industry 0.6176722338897455
manufacturing 0.6078471112898186
manufacture 0.5985575309354442
shortage 0.5891632332152457
business 0.583309487448822
bottleneck 0.5825114652605525
scale 0.5813788777660902
globally 0.5668616200009875
accelerate 0.5586804131397656
competitiveness 0.5584103009157673
logistic 0.5576864389126387
global 0.5561871522716164
complexity 0.5546192245553856
supply 0.5535578806701948


In [9]:
#model._words2word_vectors(['supply'])
model._get_word_vectors() # word embeddings
model._index2word(1)

'company'

## Label Topics

In [142]:
df_topics_labelled = df_topics.copy()
df_topics_labelled['topic_label'] = ''

In [150]:
df_topics_labelled.head(1)

Unnamed: 0,topic_id,topic_sizes,topic_words,word_scores,doc_ids,topic_label
0,0,3882,"[barrels_day, bpd, crude, opec, glut, oil, bar...","[0.7554733, 0.7397255, 0.7277012, 0.7076352, 0...","[48334, 48284, 294425, 159978, 20322, 292171, ...",


In [144]:
#df_topics_labelled.loc[1457]['topic_label']

In [145]:
idx = 93 
topic_label = ['digitalisation', 'ai', 'agile', 'transformation']
df_topics_labelled.at[idx, 'topic_label'] = topic_label
idx = 635 
topic_label = ['smartphone', 'semiconductor', 'supply_chain', 'automaker']
df_topics_labelled.at[idx, 'topic_label'] = topic_label
idx = 820 
topic_label = ['clothing', 'manufacturing', 'supply_chain', 'exploitation']
df_topics_labelled.at[idx, 'topic_label'] = topic_label
idx = 859 
topic_label = ['pharma', 'shortage', 'supply_chain', 'manufacturing']
df_topics_labelled.at[idx, 'topic_label'] = topic_label
idx = 914 
topic_label = ['clothing', 'manufacturing', 'supply_chain', 'exploitation']
df_topics_labelled.at[idx, 'topic_label'] = topic_label
idx = 1132 
topic_label = ['ppe', 'ventilator', 'supply_chain', 'shortage']
df_topics_labelled.at[idx, 'topic_label'] = topic_label
idx = 1314
topic_label = ['semiconductor', 'tech_company', 'china', 'supply_chain']
df_topics_labelled.at[idx, 'topic_label'] = topic_label
idx = 1457
topic_label = ['cybersecurity', 'space', 'aluminum', 'supply_chain']
df_topics_labelled.at[idx, 'topic_label'] = topic_label
idx = 1481
topic_label = ['shipping', 'logistics', 'supply_chain']
df_topics_labelled.at[idx, 'topic_label'] = topic_label

idx = 45
topic_label = ['employees', 'wellbeing', 'remote_working']
df_topics_labelled.at[idx, 'topic_label'] = topic_label
idx = 134
topic_label = ['employees', 'company_statements', 'covid_response']
df_topics_labelled.at[idx, 'topic_label'] = topic_label
idx = 256
topic_label = ['employees', 'amazon', 'warehouse', 'unsafe']
df_topics_labelled.at[idx, 'topic_label'] = topic_label

In [195]:
selection = ['employee']
df_selection = df_topics_labelled[pd.DataFrame(df_topics_labelled.topic_words.tolist()).isin(selection).any(1)]['topic_words']
len(df_selection)

20

In [197]:
for idx, row in zip(df_selection.index, df_selection[:5]):
    print(idx, 'num_docs: ', df_topics['topic_sizes'][idx])
    print(row, '\n')

45 num_docs:  865
['productive' 'coworker' 'psychologist' 'stressful' 'workplace'
 'organizational' 'productivity' 'workspace' 'remote' 'mindset' 'empathy'
 'remotely' 'emotion' 'anxiety' 'employee' 'emotionally' 'creativity'
 'feeling' 'skill' 'motivation' 'meditation' 'wellbeing' 'hr' 'wellbee'
 'parenting' 'loneliness' 'agile' 'psychology' 'oneonone' 'mental'
 'communicate' 'commute' 'psychological' 'emotional' 'workforce'
 'therapist' 'skype' 'physically' 'facetoface' 'mentally' 'distraction'
 'feedback' 'task' 'workfromhome' 'slack' 'learning' 'agility' 'lonely'
 'adapt' 'interact'] 

134 num_docs:  542
['continuity' 'utmost' 'wellbee' 'uninterrupted' 'proactive' 'stakeholder'
 'communitie' 'sa_publishe' 'public_unedite' 'unaltered' 'wellbeing'
 'dedication' 'proactively' 'resilience' 'ensure' 'utc' 'agility' 'adapt'
 'sanitary' 'adhere' 'safeguard' 'onsite' 'continuously' 'safety_protocol'
 'tirelessly' 'hygiene' 'disinfection' 'optimise' 'agile' 'foremost'
 'protocol' 'rapidly_c

In [148]:
df_topics_labelled.to_pickle("./data/df_topics_labelled_vocab_limit")

In [149]:
for idx, label in zip(df_topics_labelled[df_topics_labelled['topic_label'] != ""]['topic_id'],df_topics_labelled[df_topics_labelled['topic_label'] != ""]['topic_label']):
    print(idx, label)

93 ['digitalisation', 'ai', 'agile', 'transformation']
635 ['smartphone', 'semiconductor', 'supply_chain', 'automaker']
820 ['clothing', 'manufacturing', 'supply_chain', 'exploitation']
859 ['pharma', 'shortage', 'supply_chain', 'manufacturing']
914 ['clothing', 'manufacturing', 'supply_chain', 'exploitation']
1132 ['ppe', 'ventilator', 'supply_chain', 'shortage']
1314 ['semiconductor', 'tech_company', 'china', 'supply_chain']
1457 ['cybersecurity', 'space', 'aluminum', 'supply_chain']
1481 ['shipping', 'logistics', 'supply_chain']


## Apply topic labels to document df

In [206]:
df_temp = df_topics_labelled[df_topics_labelled['topic_label'] != ""]

ids_check = []
for ids in df_temp['doc_ids']:
    ids_check = ids_check + list(ids)
    
print(len(ids_check))
print(len(set(ids_check)))

df_temp.head(1)

3205
3205


Unnamed: 0,topic_id,topic_sizes,topic_words,word_scores,doc_ids,topic_label
45,45,865,"[productive, coworker, psychologist, stressful...","[0.44290036, 0.42762664, 0.40557778, 0.4026943...","[54832, 178549, 54660, 186216, 352544, 177518,...","[employees, wellbeing, remote_working]"


In [207]:
df_with_topics = df.copy()
df_with_topics['topic_id'] = ''
df_with_topics['topic_label'] = ''
col_num_id = df_with_topics.columns.get_loc('topic_id')
col_num_label = df_with_topics.columns.get_loc('topic_label')

for topic_id, doc_ids, topic_label in zip(df_temp['topic_id'], df_temp['doc_ids'], df_temp['topic_label']):
    #print(topic_id, doc_ids, topic_label)
    for idx in doc_ids:
        #print(idx, topic_label)
        df_with_topics.at[idx, 'topic_id'] = topic_id
        df_with_topics.at[idx, 'topic_label'] = topic_label

In [208]:
df_with_topics.head(1)

Unnamed: 0,author,date,domain,title,url,content,topic_area,content_processed,topic_id,topic_label
0,Thomas Hughes,2020-01-02,marketbeat,Three Industrial Giants You Should Own In 2020,https://www.marketbeat.com/originals/three-ind...,With the end of the year just around the corne...,business,end year corner past time think positioning fo...,,


In [209]:
df_with_topics[df_with_topics['topic_id'] != '']

Unnamed: 0,author,date,domain,title,url,content,topic_area,content_processed,topic_id,topic_label
372,Sam Byford,2020-01-23,theverge,Huawei developer conference postponed due to W...,https://www.theverge.com/2020/1/23/21078258/hu...,Huawei has announced the postponement of a maj...,tech,huawei announced postponement major developers...,635,"[smartphone, semiconductor, supply_chain, auto..."
544,,2020-01-24,marketscreener,Today's Logistics Report: Cutting Rail Jobs; W...,https://www.marketscreener.com/UNION-PACIFIC-1...,"By Paul Page Sign up: With one click, get this...",business,paul page sign click newsletter delivered inbo...,1481,"[shipping, logistics, supply_chain]"
879,Stan Schroeder,2020-01-28,mashable,Coronavirus might put a wrench in Apple's iPho...,https://mashable.com/article/iphone-coronavirus/,Apple may not be able to produce as many iPhon...,tech,apple able produce iphones planned coronavirus...,635,"[smartphone, semiconductor, supply_chain, auto..."
960,Bloomberg,2020-01-28,scmp,Apple supply chain braces for disruption from ...,https://www.scmp.com/tech/big-tech/article/304...,Apple’s China-centric manufacturing base is at...,general,apple chinacentric manufacturing base risk dis...,635,"[smartphone, semiconductor, supply_chain, auto..."
1005,Nick Statt,2020-01-28,theverge,iPhone maker Foxconn says coronavirus outbreak...,https://www.theverge.com/2020/1/28/21112288/co...,"Taiwanese electronics giant Foxconn, which man...",tech,taiwanese electronics giant foxconn manufactur...,635,"[smartphone, semiconductor, supply_chain, auto..."
...,...,...,...,...,...,...,...,...,...,...
367797,,2020-12-29,marketscreener,Exclusive: Hedge fund Third Point urges Intel ...,https://www.marketscreener.com/quote/index/NAS...,"Were it to gain traction, Third Point's push f...",business,gain traction point push changes lead major sh...,1314,"[semiconductor, tech_company, china, supply_ch..."
367943,nature,2020-12-29,nature,Coronavirus diaries: an unexpected career expe...,http://www.nature.com/articles/d41586-020-03627-0,"Credit: Adapted from Getty In March, every sin...",science,credit adapted getty march single aspect scien...,45,"[employees, wellbeing, remote_working]"
368074,Lila MacLellan,2020-12-29,qz,How to help working parents and not pit them a...,https://www.qz.com/work/1946450/how-to-help-wo...,Hit “play” on UNESCO’s interactive world map t...,tech,hit play unesco interactive world map tracing ...,45,"[employees, wellbeing, remote_working]"
368077,Sarah Todd,2020-12-29,qz,Covid-19 changed the way we talk about emotion...,https://www.qz.com/work/1950430/covid-19-chang...,One of my favorite things about work used to b...,tech,favorite things work gave chance compartmental...,45,"[employees, wellbeing, remote_working]"


In [210]:
df_with_topics.to_pickle("./data/df_with_topics")