In [1]:
# Print the path of the `jupyter` executable found on PATH (shell escape).
!which jupyter

/home/ubuntu/thesis_env2/bin/jupyter


In [2]:
import pandas as pd
import numpy as np
import datetime
import spacy
import en_core_web_sm
#from spacy import displacy

In [3]:
# Load the article data set (one row per article).
df = pd.read_csv('./data/covid19_articles_20201231.csv')

# Date to datetime
df['date'] = pd.to_datetime(df['date'])

# Drop rows whose article text duplicates an earlier row (the first occurrence is kept).
num_articles_pre_drop = len(df)
df = df.drop_duplicates(subset='content')
print('no. articles dropped: ', num_articles_pre_drop - len(df))
print('no. of articles: ', len(df))

# Drop articles where text longer than 1,000,000 characters.
# NOTE(review): 1,000,000 equals spaCy's default `nlp.max_length` — presumably the reason
# for this cut-off; confirm.
MAX_CONTENT_CHARS = 1000000
# `.str.len()` yields NaN for missing content, so such rows compare False and are dropped
# here instead of raising a TypeError inside `len()`.
df = df[df['content'].str.len() <= MAX_CONTENT_CHARS].copy()
print('no. of articles after dropping long articles: ', len(df))

# `DataFrame.info()` prints its summary itself and returns None; wrapping it in print()
# only appended a stray "None" line to the output.
df.info()

no. articles dropped:  127
no. of articles:  368920
no. of articles after dropping long articles:  368918
<class 'pandas.core.frame.DataFrame'>
Int64Index: 368918 entries, 0 to 369046
Data columns (total 7 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   author      181789 non-null  object        
 1   date        368918 non-null  datetime64[ns]
 2   domain      368918 non-null  object        
 3   title       368833 non-null  object        
 4   url         368918 non-null  object        
 5   content     368918 non-null  object        
 6   topic_area  368918 non-null  object        
dtypes: datetime64[ns](1), object(6)
memory usage: 22.5+ MB
None


In [4]:
# Inspect the index: rows keep their original labels after the drops above, so the labels
# are no longer contiguous.
df.index

Int64Index([     0,      1,      2,      3,      4,      5,      6,      7,
                 8,      9,
            ...
            369037, 369038, 369039, 369040, 369041, 369042, 369043, 369044,
            369045, 369046],
           dtype='int64', length=368918)

In [5]:
# Get index numbers of articles to label
# Seed NumPy's global RNG so the draw below is repeatable for an unchanged `df.index`.
np.random.seed(777)
# Draw 1,000 distinct index labels (replace=False) from the filtered frame's index.
article_indices = np.random.choice(df.index, size=1000, replace=False)
article_indices

array([272801, 200452, 360615, 348240, 150912,   5919,  30166, 136359,
       208183,  23000, 330542,  58596, 286411, 261343, 113511,  42006,
       102479, 241927, 341037, 291682, 212144, 302846, 110625, 201788,
       170802, 159684, 333166, 220772, 237146, 129492, 313847, 156620,
        36890, 340034, 163144, 360941,  55550, 347657,  47979, 182837,
       134691, 160045, 286915, 278396, 233365, 286431,  83397, 166869,
       246232, 308226,  84557, 203388, 170380, 213355, 158274,  76833,
       156025, 264723, 117743,  94017,  47533,  19442, 356015, 167369,
       157731, 221842, 126826, 194484, 140473, 190886, 286245, 165741,
       243097, 225028, 116166, 202311,  50746, 323505,  24801, 306356,
       278877, 232675,  35740,  62652, 148690, 266123, 204738, 115954,
        23313, 190061, 141887, 187155, 171153, 169507, 354893, 160755,
       187739,  89199, 326662, 129575,  41302, 264705, 229588, 249778,
       245717, 174209, 152067, 205808,   7392, 190235, 107862, 276782,
      

In [6]:
# Keep just the sampled rows (in sampled order) as an independent copy to hold the labels
df_lab = df.loc[article_indices].copy()

In [7]:
# Display the sampled articles selected for labelling.
df_lab

Unnamed: 0,author,date,domain,title,url,content,topic_area
272801,,2020-08-15,finance.yahoo,Julen Lopetegui: We must be at our best to bea...,https://finance.yahoo.com/news/julen-lopetegui...,Head coach Julen Lopetegui knows Sevilla will ...,business
200452,,2020-07-11,finance.yahoo,Gallagher Premiership teams will not be punish...,https://uk.finance.yahoo.com/news/gallagher-pr...,Gallagher Premiership clubs will have flexibil...,business
360615,,2020-12-14,marketscreener,"Don't stockpile food, minister tells British s...",https://www.marketscreener.com/quote/currency/...,"Britain and the EU agreed on Sunday to ""go the...",business
348240,Joe Hoppe,2020-11-23,marketwatch,"Wynnstay Properties profit halves, lifts dividend",https://www.marketwatch.com/story/wynnstay-pro...,Wynnstay Properties PLC said Monday that its f...,business
150912,,2020-06-11,finance.yahoo,U.K. Scientists Defy Johnson to Speak Out on V...,https://ca.finance.yahoo.com/news/johnson-unde...,(Bloomberg) -- U.K. Prime Minister Boris Johns...,business
...,...,...,...,...,...,...,...
169245,By Reuters,2020-06-24,nytimes,"With No Tourists to Watch Migration, Kenyan Op...",https://www.nytimes.com/reuters/2020/06/24/bus...,NAIROBI — For Kenyan-based safari operator Saf...,business
306948,,2020-09-12,marketscreener,"Pfizer, BioNTech propose expanding COVID-19 va...",https://www.marketscreener.com/news/latest/Pfi...,Pfizer Inc and BioNTech SE\non Saturday propos...,business
111840,,2020-05-13,marketscreener,Ameresco : Announces Commercial Operations of ...,https://www.marketscreener.com/AMERESCO-INC-64...,Beale Hill Wind Farm is the first renewable ge...,business
72139,,2020-04-16,marketscreener,"L'Oréal: News release: ""First Quarter 2020 Sales""",https://www.marketscreener.com/quote/stock/L-O...,"Clichy, 16 April 2020 at 6:00 p.m. First Quart...",business


In [8]:
# Add the two label columns, initialised to empty strings.
# Each column is only created when absent, so re-running this cell part-way through a
# labelling session no longer wipes labels that have already been entered.
# NOTE(review): 'org_names_listed' appears to hold the formal company name and the '_uo'
# variant the form used in the article — confirm.
for label_col in ('org_names_listed', 'org_names_listed_uo'):
    if label_col not in df_lab.columns:
        df_lab[label_col] = ''
df_lab.head(1)

Unnamed: 0,author,date,domain,title,url,content,topic_area,org_names_listed,org_names_listed_uo
272801,,2020-08-15,finance.yahoo,Julen Lopetegui: We must be at our best to bea...,https://finance.yahoo.com/news/julen-lopetegui...,Head coach Julen Lopetegui knows Sevilla will ...,business,,


In [9]:
# Load the spaCy `en_core_web_sm` pipeline; it is applied to article text further down
# to highlight entities.
ner_processor = en_core_web_sm.load()

In [None]:
# Two reference lists of organisation names, bound to names so they can be reused.
# NOTE(review): the role of these lists is not shown in this notebook — confirm their
# purpose (and give them more descriptive names) before relying on them.
org_names_group_a = [
    'AFP', 'Associated Press', 'BBC', 'Bloomberg', 'CNN', 'Daily Express', 'Ducker Frontier',
    'Financial Times', 'Guardian', 'Marketwatch', 'NBC', 'Prudent Speculator', 'Reuters', 'RNZ',
    'Sky News', 'SpreadEx', 'The Daily Mail', 'The Sun', 'The Canadian Press', 'LBC',
]  # Reuters also listed

org_names_group_b = [
    'Aberdeen Standard Investments', 'Australian Bureau of Statistics',
    'Commonwealth Financial Network', 'Federal Reserve', 'National Health Service',
    # A comma was missing after 'New York Stock Exchange', so Python silently joined it
    # with 'NHS' into the single entry 'New York Stock ExchangeNHS'.
    'National Rail', 'New York Stock Exchange', 'NHS', 'Oxford University',
    'Scientific Advisory Group for Emergencies', 'Securities and Exchange Commission',
    'The Bank of England', 'The British Retail Consortium', 'The Office for Budget Responsibility',
    'The Organisation for Economic Co-operation and Development', 'The Reserve Bank of Australia',
    'The Vaccine Group', 'Toronto Stock Exchange', 'University of Plymouth',
    'Eastspring Investments', 'NATO', 'FTSE', 'SEC', 'CDC',
]
# Keep the second list as the cell's final expression so it is still what gets displayed.
org_names_group_b

In [221]:
# Position (within `article_indices`) of the sampled article to review
num = 99
article_index = article_indices[num]
print(article_index)
# Print whatever is currently stored in the label column for this article
print(df_lab.at[article_index, 'org_names_listed'])

# Run the NER pipeline over the article text and render only ORG entities
article = df_lab.at[article_index, 'content']
spacy.displacy.render(ner_processor(article), style='ent', options={'ents': ['ORG']})

129575



In [222]:
# Labels for the article currently held in `article_index` (129575 when this cell was last run)
listed_orgs = ['Kennedy-Wilson Holdings, Inc.']
listed_orgs_uo = ['Kennedy Wilson']

# Store each list as a single cell value; `.at` targets exactly one row/column cell
for label_col, label_values in (
    ('org_names_listed', listed_orgs),
    ('org_names_listed_uo', listed_orgs_uo),
):
    df_lab.at[article_index, label_col] = label_values

In [223]:
# Show the rows for sampled positions 90-99 to check the labels stored so far.
df_lab.loc[article_indices[90:100]]

Unnamed: 0,author,date,domain,title,url,content,topic_area,org_names_listed,org_names_listed_uo
141887,,2020-06-03,finance.yahoo,Edited Transcript of HQY earnings conference c...,https://www.finance.yahoo.com/news/edited-tran...,Q1 2021 HealthEquity Inc Earnings Call Draper ...,business,[],[]
187155,Chloe Taylor@ChloeTaylor141,2020-07-06,cnbc,Immunity to coronavirus is 'fragile' and 'shor...,https://www.cnbc.com/2020/07/06/immunity-to-co...,"It is not a ""safe bet"" to rely on immunity to ...",finance,[],[]
171153,by John Hackston,2020-06-25,hbr,How Different Personality Types Cope with an A...,https://www.hbr.org/2020/06/how-different-pers...,Technology has enabled many people to work fro...,business,[],[]
169507,,2020-06-24,marketscreener,Red White & Bloom Brands : & Bloom Investee Ph...,https://www.marketscreener.com/RED-WHITE-BLOOM...,"TORONTO, June 24, 2020 (GLOBE NEWSWIRE) -- R...",business,[],[]
354893,Sam Haysom,2020-12-03,mashable,Every moment of John Oliver's diabolical long-...,https://mashable.com/article/john-oliver-adam-...,John Oliver loves a running joke. Anyone who's...,tech,[AT&T Inc.],[AT&T]
160755,,2020-06-18,finance.yahoo,National Grid expects £400m Covid hit as elect...,https://uk.finance.yahoo.com/news/national-gri...,The company that helps keep Britain’s lights o...,business,[National Grid plc],[National Grid]
187739,nature,2020-07-06,nature,Gearing up for a ‘monumental’ mission to Mars,http://www.nature.com/articles/d41586-020-01933-1,They call me a mechatronics engineer — a hybri...,science,[],[]
89199,"Humeyra Pamuk, David Brunnstrom",2020-04-29,reuters,Pompeo pushes China to provide access to Wuhan...,https://www.reuters.com/article/us-health-coro...,U.S. Secretary of State Mike Pompeo on Wednesd...,business,[],[]
326662,Rebekah Evans,2020-10-03,express,Furlough fraud: Warning as claims for scheme e...,https://www.express.co.uk/finance/personalfina...,We will use your email address only for sendi...,general,[],[]
129575,,2020-05-26,marketscreener,Kennedy Wilson : Announces Virtual 2020 Annual...,https://www.marketscreener.com/KENNEDY-WILSON-...,Global real estate investment company Kennedy ...,business,"[Kennedy-Wilson Holdings, Inc.]",[Kennedy Wilson]


In [14]:
#df_lab.loc[article_index]

In [224]:
# Save the labelled frame under a file name stamped with the current time (to the minute)
date_label = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M")
file_name = f'df_lab_{date_label}.csv'
print(file_name)
df_lab.to_csv(f'./data/{file_name}')

df_lab_2021_07_11_21_20.csv


In [233]:
# Read a saved label file back in (first CSV column restored as the index) to verify it
file_latest = 'df_lab_2021_07_11_21_20.csv'
df_check = pd.read_csv(f'./data/{file_latest}', index_col=0)
df_check

Unnamed: 0,author,date,domain,title,url,content,topic_area,org_names_listed,org_names_listed_uo
272801,,2020-08-15,finance.yahoo,Julen Lopetegui: We must be at our best to bea...,https://finance.yahoo.com/news/julen-lopetegui...,Head coach Julen Lopetegui knows Sevilla will ...,business,[],[]
200452,,2020-07-11,finance.yahoo,Gallagher Premiership teams will not be punish...,https://uk.finance.yahoo.com/news/gallagher-pr...,Gallagher Premiership clubs will have flexibil...,business,[],[]
360615,,2020-12-14,marketscreener,"Don't stockpile food, minister tells British s...",https://www.marketscreener.com/quote/currency/...,"Britain and the EU agreed on Sunday to ""go the...",business,['Tesco PLC'],['Tesco']
348240,Joe Hoppe,2020-11-23,marketwatch,"Wynnstay Properties profit halves, lifts dividend",https://www.marketwatch.com/story/wynnstay-pro...,Wynnstay Properties PLC said Monday that its f...,business,['Wynnstay Properties Plc'],['Wynnstay Properties PLC']
150912,,2020-06-11,finance.yahoo,U.K. Scientists Defy Johnson to Speak Out on V...,https://ca.finance.yahoo.com/news/johnson-unde...,(Bloomberg) -- U.K. Prime Minister Boris Johns...,business,[],[]
...,...,...,...,...,...,...,...,...,...
169245,By Reuters,2020-06-24,nytimes,"With No Tourists to Watch Migration, Kenyan Op...",https://www.nytimes.com/reuters/2020/06/24/bus...,NAIROBI — For Kenyan-based safari operator Saf...,business,,
306948,,2020-09-12,marketscreener,"Pfizer, BioNTech propose expanding COVID-19 va...",https://www.marketscreener.com/news/latest/Pfi...,Pfizer Inc and BioNTech SE\non Saturday propos...,business,,
111840,,2020-05-13,marketscreener,Ameresco : Announces Commercial Operations of ...,https://www.marketscreener.com/AMERESCO-INC-64...,Beale Hill Wind Farm is the first renewable ge...,business,,
72139,,2020-04-16,marketscreener,"L'Oréal: News release: ""First Quarter 2020 Sales""",https://www.marketscreener.com/quote/stock/L-O...,"Clichy, 16 April 2020 at 6:00 p.m. First Quart...",business,,


In [234]:
# Show the reloaded rows for the first 100 sampled articles.
df_check.loc[article_indices[:100]]

Unnamed: 0,author,date,domain,title,url,content,topic_area,org_names_listed,org_names_listed_uo
272801,,2020-08-15,finance.yahoo,Julen Lopetegui: We must be at our best to bea...,https://finance.yahoo.com/news/julen-lopetegui...,Head coach Julen Lopetegui knows Sevilla will ...,business,[],[]
200452,,2020-07-11,finance.yahoo,Gallagher Premiership teams will not be punish...,https://uk.finance.yahoo.com/news/gallagher-pr...,Gallagher Premiership clubs will have flexibil...,business,[],[]
360615,,2020-12-14,marketscreener,"Don't stockpile food, minister tells British s...",https://www.marketscreener.com/quote/currency/...,"Britain and the EU agreed on Sunday to ""go the...",business,['Tesco PLC'],['Tesco']
348240,Joe Hoppe,2020-11-23,marketwatch,"Wynnstay Properties profit halves, lifts dividend",https://www.marketwatch.com/story/wynnstay-pro...,Wynnstay Properties PLC said Monday that its f...,business,['Wynnstay Properties Plc'],['Wynnstay Properties PLC']
150912,,2020-06-11,finance.yahoo,U.K. Scientists Defy Johnson to Speak Out on V...,https://ca.finance.yahoo.com/news/johnson-unde...,(Bloomberg) -- U.K. Prime Minister Boris Johns...,business,[],[]
...,...,...,...,...,...,...,...,...,...
160755,,2020-06-18,finance.yahoo,National Grid expects £400m Covid hit as elect...,https://uk.finance.yahoo.com/news/national-gri...,The company that helps keep Britain’s lights o...,business,['National Grid plc'],['National Grid']
187739,nature,2020-07-06,nature,Gearing up for a ‘monumental’ mission to Mars,http://www.nature.com/articles/d41586-020-01933-1,They call me a mechatronics engineer — a hybri...,science,[],[]
89199,"Humeyra Pamuk, David Brunnstrom",2020-04-29,reuters,Pompeo pushes China to provide access to Wuhan...,https://www.reuters.com/article/us-health-coro...,U.S. Secretary of State Mike Pompeo on Wednesd...,business,[],[]
326662,Rebekah Evans,2020-10-03,express,Furlough fraud: Warning as claims for scheme e...,https://www.express.co.uk/finance/personalfina...,We will use your email address only for sendi...,general,[],[]
