# Process Named Entity Recognition on Article Text

### Imports

In [1]:
!which jupyter
import numpy as np
import pandas as pd
from pprint import pprint
from tqdm import tqdm
import re

# NER Imports
import spacy
import en_core_web_sm
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

# Set up ner processor from SpaCy
ner_processor = en_core_web_sm.load()

/home/ubuntu/thesis_env2/bin/jupyter


In [2]:
df = pd.read_csv('./data/covid19_articles_20201231.csv')

In [3]:
# date to datetime
df['date'] = pd.to_datetime(df['date'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 369047 entries, 0 to 369046
Data columns (total 7 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   author      181799 non-null  object        
 1   date        369047 non-null  datetime64[ns]
 2   domain      369047 non-null  object        
 3   title       368962 non-null  object        
 4   url         369047 non-null  object        
 5   content     369047 non-null  object        
 6   topic_area  369047 non-null  object        
dtypes: datetime64[ns](1), object(6)
memory usage: 19.7+ MB


In [4]:
df.head(3)

Unnamed: 0,author,date,domain,title,url,content,topic_area
0,Thomas Hughes,2020-01-02,marketbeat,Three Industrial Giants You Should Own In 2020,https://www.marketbeat.com/originals/three-ind...,With the end of the year just around the corne...,business
1,Thomas Hughes,2020-01-03,marketbeat,Labor Stocks Are Going To Break Out In 2020,https://www.marketbeat.com/originals/labor-sto...,The labor markets were one of the most closely...,business
2,Steve Anderson,2020-01-03,marketbeat,"Tesla (TSLA) Breaks Shipment Record, Beats Est...",https://www.marketbeat.com/originals/teal-brea...,"It could be forgiven, that some might think th...",business


### NER Pre Processing & Tagging Functions

In [5]:
# Remove text from articles

def clean_text(df):
    search_text = "follow us on Facebook, Twitter, and Instagram" #2423
    print(len(df[df['content'].str.contains(search_text)]))
    df['content'] = df['content'].str.replace(search_text, "")
    search_text = "Follow Yahoo Finance on Twitter, Facebook, Instagram, Flipboard, LinkedIn, and reddit"
    print(len(df[df['content'].str.contains(search_text)]))
    df['content'] = df['content'].str.replace(search_text, "")
    search_text = "find us on Facebook and follow us on Twitter"
    print(len(df[df['content'].str.contains(search_text)]))
    df['content'] = df['content'].str.replace(search_text, "")
    search_text = "on Twitter, Facebook, LinkedIn, Instagram and YouTube"
    print(len(df[df['content'].str.contains(search_text)]))
    df['content'] = df['content'].str.replace(search_text, "")
    search_text = "Twitter - Facebook - Instagram"
    print(len(df[df['content'].str.contains(search_text)]))
    df['content'] = df['content'].str.replace(search_text, "")
    search_text = "earnings conference call | transcript"
    print(len(df[df['title'].str.contains(search_text, na=False)]))
    df['content'] = df['title'].str.replace(search_text, "")
    return df

# Drop long articles - cannot be processed by Spacy - articles where text longer than 1,000,000 characters

def drop_long_articles(df):
    num_articles_pre_drop = len(df)
    print('no. of articles: ',len(df))
    df = df[df['content'].apply(lambda x: len(x) <= 1000000)]
    num_articles_post_drop = len(df)
    print(f'no. of articles after dropping long articles: {len(df)}, no. articles dropped: {num_articles_pre_drop - num_articles_post_drop}') 
    return df

# NER Tagging

def ner_tagging(df, row_index_start):
    df_ner = df.copy()
    df_ner['org_names'] = ''
    col_index = df_ner.columns.get_loc('org_names')

    for row in tqdm(range(len(df_ner.iloc[row_index_start:]))):
        row = row + row_index_start
        pos_tagged = ner_processor(df_ner.iloc[row]['content'])
        org_names = set([X.text for X in pos_tagged.ents if X.label_ == 'ORG'])
        df_ner.iat[row, col_index] = sorted(org_names)

        # Save every 10,000 articles processed
        if row % 10000 == 0:
            df_ner.to_pickle('./data/df_ner.pickle')
            df_ner.to_csv('./data/df_ner.csv')

    # Save df after tagging
    df_ner.to_pickle('./data/df_ner.pickle')
    df_ner.to_csv('./data/df_ner.csv')
    
    return df_ner

### Tag Full Data Set

In [6]:
# NER Tagging - takes about 15 hours for full dataset

tag_full_data = False
part_processing = False

if tag_full_data:
    
    if part_processing:
        df_ner = pd.read_pickle('./data/df_ner_210717.pickle') # Load part processed file
        row_label_start = df_ner[df_ner['org_names'] == ""].index[0]
        row_index_start = df_ner.index.get_loc(row_label_start) # Start point for continued processing
        df_ner = ner_tagging(df_ner, row_index_start) # For part processed file
        
    else:
        row_index_start = 0
        df = clean_text(df)
        df = drop_long_articles(df)
        df_ner = ner_tagging(df, row_index_start)
        
    df_ner.to_pickle('./data/df_ner.pickle')
    df_ner.to_csv('./data/df_ner.csv')

### Tag Labelled File

In [8]:
tag_labelled_data = True

if tag_labelled_data:

    df_lab = pd.read_csv('./data/df_lab_latest.csv', index_col=0, converters={'org_names_listed': eval, 'org_names_listed_uo': eval, 'listing_country': eval})
    df_lab = clean_text(df_lab)
    df_lab = drop_long_articles(df_lab)
    df_lab_ner = ner_tagging(df_lab, 0)
    
    df_lab_ner.to_pickle('./data/df_lab_ner.pickle')
    df_lab_ner.to_csv('./data/df_lab_ner.csv')

  0%|          | 0/1000 [00:00<?, ?it/s]

4
0
0
1
1
4
no. of articles:  1000
no. of articles after dropping long articles: 1000, no. articles dropped: 0


100%|██████████| 1000/1000 [02:41<00:00,  6.18it/s]


In [9]:
df_lab_ner.head(6)

Unnamed: 0,author,date,domain,title,url,content,topic_area,org_names_listed,org_names_listed_uo,listed_count,listed_uo_count,listing_country,listing_country_count,org_names
272801,,2020-08-15,finance.yahoo,Julen Lopetegui: We must be at our best to bea...,https://finance.yahoo.com/news/julen-lopetegui...,Head coach Julen Lopetegui knows Sevilla will ...,business,[],[],0,0,[],0,"[Inter Milan, Jose Mourinho’s, Lopetegui, Manc..."
200452,,2020-07-11,finance.yahoo,Gallagher Premiership teams will not be punish...,https://uk.finance.yahoo.com/news/gallagher-pr...,Gallagher Premiership clubs will have flexibil...,business,[],[],0,0,[],0,"[Childs, Premiership Rugby, the British & Iris..."
360615,,2020-12-14,marketscreener,"Don't stockpile food, minister tells British s...",https://www.marketscreener.com/quote/currency/...,"Britain and the EU agreed on Sunday to ""go the...",business,[Tesco PLC],[Tesco],1,1,[United Kingdom],1,"[BBC, BRC, EU, Sharma, Sky News, Tesco, The Br..."
348240,Joe Hoppe,2020-11-23,marketwatch,"Wynnstay Properties profit halves, lifts dividend",https://www.marketwatch.com/story/wynnstay-pro...,Wynnstay Properties PLC said Monday that its f...,business,[Wynnstay Properties Plc],[Wynnstay Properties PLC],1,1,[United Kingdom],1,[Wynnstay Properties PLC]
150912,,2020-06-11,finance.yahoo,U.K. Scientists Defy Johnson to Speak Out on V...,https://ca.finance.yahoo.com/news/johnson-unde...,(Bloomberg) -- U.K. Prime Minister Boris Johns...,business,[],[],0,0,[],0,"[Conservative Party, Dominic Cummings, Ferguso..."
5919,Barbara Kollmeyer,2020-02-18,marketwatch,Apple Warning Rattles European Tech Stocks,https://www.marketwatch.com/articles/apple-war...,AFP via Getty Images European stocks came unde...,business,"[Apple Inc., Dialog Semiconductor Plc, ASM Int...","[Apple, Dialog Semiconductor, ASM Internationa...",6,6,"[United States, United Kingdom, Netherlands, U...",6,"[AFP, ASM International, Apple, HSBC, Intesa S..."


### Clean Tagged Names Functions
- Remove leading and trailing non-letter characters e.g. ' % 2,4 ARCU Aircraft Leasing Limited' -> 'ARCU Aircraft Leasing Limited' Note: this would also drop 3M but companies with number as first letter are very rare
- Remove names that are only 1 or 2 letters
- Remove names found in org_stopwords list

In [6]:
# Load company name stopwords

def load_stopwords(file_path, sheet_name):    
    org_stopwords = pd.read_excel(filepath_stopwords, sheet_name = sheet_name, header = None)
    org_stopwords = list(set(sorted(org_stopwords[0])))
    new_stopwords = ['company']
    org_stopwords = org_stopwords + new_stopwords
    org_stopwords = sorted([word.lower() for word in org_stopwords])
    print('number of phrases in org stopwords list: ', len(org_stopwords))
    return org_stopwords

# Clean Tagged Names

def clean_tagged_names(df, org_stopwords):

    df_ner_clean = df.copy()
    col_index = df_ner_clean.columns.get_loc('org_names')

    for idx, org_names in tqdm(enumerate(df_ner_clean['org_names'])):

        # remove leading and trailing non-letter characters
        org_names = set([re.sub('^[^a-zA-Z]*|[^a-zA-Z]*$','',name) for name in org_names])

        # remove names that are less than 4 letters
        org_names = set([name.lower() for name in org_names if len(name) > 3])

        # exclude names in org_stopwords list - difference between 2 sets
        org_names = org_names - set(org_stopwords) 
        df_ner_clean.iat[idx, col_index] = sorted(org_names)
        
    return df_ner_clean

# Remove found names that are a substring of a longer name also found

def remove_substring_names(df):
    
    df = df.copy()
    col_index = df.columns.get_loc('org_names')

    for idx, org_names in tqdm(enumerate(df['org_names'])):
        #print(org_names)
        #print(type(org_names))
        org_names_new = []
        for name in org_names:
            #print('name is', name, name in org_names)
            org_names_check = set(org_names) - set([name])
            #print(org_names_check)
            if not any(name in string for string in org_names_check):
                org_names_new.append(name)
        #print('org_names_new', org_names_new)
        df.iat[idx, col_index] = sorted(org_names_new)
    
    return df

In [7]:
df_ner_test = pd.read_pickle('./data/df_ner_clean.pickle')
df_ner_test.head(3)

Unnamed: 0,author,date,domain,title,url,content,topic_area,org_names_listed,org_names_listed_uo,listed_count,listed_uo_count,listing_country,listing_country_count,org_names
272801,,2020-08-15,finance.yahoo,Julen Lopetegui: We must be at our best to bea...,https://finance.yahoo.com/news/julen-lopetegui...,Head coach Julen Lopetegui knows Sevilla will ...,business,[],[],0,0,[],0,"[inter milan, jose mourinho’s, lopetegui, roma..."
200452,,2020-07-11,finance.yahoo,Gallagher Premiership teams will not be punish...,https://uk.finance.yahoo.com/news/gallagher-pr...,Gallagher Premiership clubs will have flexibil...,business,[],[],0,0,[],0,"[childs, premiership rugby, the british & iris..."
360615,,2020-12-14,marketscreener,"Don't stockpile food, minister tells British s...",https://www.marketscreener.com/quote/currency/...,"Britain and the EU agreed on Sunday to ""go the...",business,[Tesco PLC],[Tesco],1,1,[United Kingdom],1,"[sharma, tesco]"


In [8]:
df_ner_test[:3]

Unnamed: 0,author,date,domain,title,url,content,topic_area,org_names_listed,org_names_listed_uo,listed_count,listed_uo_count,listing_country,listing_country_count,org_names
272801,,2020-08-15,finance.yahoo,Julen Lopetegui: We must be at our best to bea...,https://finance.yahoo.com/news/julen-lopetegui...,Head coach Julen Lopetegui knows Sevilla will ...,business,[],[],0,0,[],0,"[inter milan, jose mourinho’s, lopetegui, roma..."
200452,,2020-07-11,finance.yahoo,Gallagher Premiership teams will not be punish...,https://uk.finance.yahoo.com/news/gallagher-pr...,Gallagher Premiership clubs will have flexibil...,business,[],[],0,0,[],0,"[childs, premiership rugby, the british & iris..."
360615,,2020-12-14,marketscreener,"Don't stockpile food, minister tells British s...",https://www.marketscreener.com/quote/currency/...,"Britain and the EU agreed on Sunday to ""go the...",business,[Tesco PLC],[Tesco],1,1,[United Kingdom],1,"[sharma, tesco]"


In [9]:
remove_substring_names(df_ner_test[8:10])

2it [00:00, 2306.46it/s]


Unnamed: 0,author,date,domain,title,url,content,topic_area,org_names_listed,org_names_listed_uo,listed_count,listed_uo_count,listing_country,listing_country_count,org_names
208183,,2020-07-15,finance.yahoo,Authentic morning line favorite in Haskell at ...,https://uk.finance.yahoo.com/news/authentic-mo...,Belmont Stakes runner-up Dr Post and Santa Ani...,business,[],[],0,0,[],0,"[churchill downs, dr post, fame to famous, leb..."
23000,Minnie Wright,2020-03-16,express,Lewis Capaldi Aberdeen: Fans branded 'p****s' ...,https://www.express.co.uk/entertainment/music/...,Lewis Capaldi’s tour arrived in Aberdeen last...,general,[],[],0,0,[],0,"[capaldi’s, scottish professional football lea..."


In [22]:
# Load stopwords
filepath_stopwords = './data/company_stopwords.xlsx'
sheet_name = 'all'
org_stopwords = load_stopwords(filepath_stopwords, sheet_name)

# Process labelled data set 
process_labelled_data = True

if process_labelled_data:
    df_lab_ner = pd.read_pickle('./data/df_lab_ner.pickle')
    df_lab_ner_clean = clean_tagged_names(df_lab_ner, org_stopwords)
    df_lab_ner_clean = remove_substring_names(df_lab_ner_clean)
    df_lab_ner_clean.to_pickle('./data/df_lab_ner_clean.pickle')
    
# Process full data set 
process_full_data = False

if process_full_data:
    df_ner = pd.read_pickle('./data/df_ner.pickle')
    df_ner_clean = clean_tagged_names(df_ner, org_stopwords)
    df_ner_clean = remove_substring_names(df_ner, org_stopwords)
    df_ner_clean.to_pickle('./data/df_ner_clean.pickle')

1000it [00:00, 13137.29it/s]
1000it [00:00, 13619.11it/s]

number of phrases in org stopwords list:  141





In [11]:
df_lab_ner_clean.head(3)

Unnamed: 0,author,date,domain,title,url,content,topic_area,org_names_listed,org_names_listed_uo,listed_count,listed_uo_count,listing_country,listing_country_count,org_names
272801,,2020-08-15,finance.yahoo,Julen Lopetegui: We must be at our best to bea...,https://finance.yahoo.com/news/julen-lopetegui...,Head coach Julen Lopetegui knows Sevilla will ...,business,[],[],0,0,[],0,"[inter milan, jose mourinho’s, lopetegui, roma..."
200452,,2020-07-11,finance.yahoo,Gallagher Premiership teams will not be punish...,https://uk.finance.yahoo.com/news/gallagher-pr...,Gallagher Premiership clubs will have flexibil...,business,[],[],0,0,[],0,"[childs, premiership rugby, the british & iris..."
360615,,2020-12-14,marketscreener,"Don't stockpile food, minister tells British s...",https://www.marketscreener.com/quote/currency/...,"Britain and the EU agreed on Sunday to ""go the...",business,[Tesco PLC],[Tesco],1,1,[United Kingdom],1,"[sharma, tesco]"


In [12]:
if process_full_data:
    df_ner_clean.head(3)

### General queries/checking on tagged names

In [13]:
import itertools 

# List of all org names
ner_org_names = sorted(set(list(itertools.chain(*[names for names in df_ner_clean['org_names']]))))
df_ner_org_names = pd.DataFrame(ner_org_names)
df_ner_org_names.to_pickle('./data/df_ner_org_names.pickle')
len(ner_org_names)

NameError: name 'df_ner_clean' is not defined

In [10]:
#df_ner[df_ner['org_names'].apply(lambda x: 'guaranty agreement' in x)]
#df_ner_clean[df_ner_clean['org_names'].apply(lambda x: 'a\ncredit and guaranty agreement' in x)]

In [11]:
#ner_org_names[::-1]
ner_org_names[:5]

['a (excellent',
 'a biologics license application (bla',
 'a chartered accountant',
 'a comissão de auditoria manuel ramos de sousa sebastião estela de magalhães',
 'a department of health and social care (dhsc']

### Import listed companies

Import listed companies from pre compiled list and reduce to countries of interest

In [14]:
def load_listed_companies(file_path, sheet_name, countries):

    companies_list = pd.read_excel(file_path, sheet_name = sheet_name)

    # Drop companies names that are 2 or 3 characters long
    companies_list = companies_list.drop(labels = companies_list[companies_list['name'].str.len() < 4].index)

    # Countries of interest
    companies_list = companies_list[companies_list['country'].isin(countries)]
    
    return companies_list

In [15]:
# Import listed companies
filepath_companies = './data/company_names_listed.xlsx'
sheet_name = 'company_names'
countries_included = ['United States', 'Canada', 'Australia', 'United Kingdom']

companies_list = load_listed_companies(filepath_companies, sheet_name, countries_included)

print(companies_list['country'].value_counts())
print(len(companies_list))

United States     8848
Canada            3405
Australia         1873
United Kingdom    1424
Name: country, dtype: int64
15550


In [16]:
companies_list.head(3)

Unnamed: 0,id,name,common_names,ticker_symbol,country,industry,subindustry
0,35931,"The Ultimate Software Group, Inc.",,ULTI,United States,Information Technology,Software
1,35908,"U.S. Personnel, Inc.",,USPE,United States,Industrials,Professional Services
2,203152,e.Digital Corporation,,EDIG,United States,Information Technology,"Technology Hardware, Storage and Peripherals"


### Most common words in listed companies names

Use in matching algorithm

In [17]:
from collections import Counter
results = Counter()
companies_list['name'].str.lower().str.split().apply(results.update)
companies_list_common_words = sorted([k for k, v in results.items() if v > 99])

In [18]:
#print(results)

In [19]:
print(companies_list_common_words)

['&', 'american', 'and', 'bancorp,', 'bank', 'capital', 'company', 'corp.', 'corporation', 'energy', 'energy,', 'exploration', 'financial', 'first', 'global', 'gold', 'group', 'group,', 'holdings', 'holdings,', 'inc.', 'incorporated', 'international', 'international,', 'investment', 'limited', 'ltd', 'ltd.', 'metals', 'minerals', 'mining', 'of', 'oil', 'pharmaceuticals,', 'plc', 'resources', 'services', 'systems,', 'technologies', 'technologies,', 'technology', 'the', 'therapeutics,', 'trust', 'ventures']


### Drop countries not interested in from labelled file

Labelled data file has listed companies from many countries, drop those countries not being considered

In [20]:
# Drop countries not interested in from labelled file

def drop_countries_labelled_file(df):
    
    df_lab_ner_clean = df.copy()
    
    col_index_lc = df_lab_ner_clean.columns.get_loc('listing_country')
    col_index_onl = df_lab_ner_clean.columns.get_loc('org_names_listed')
    col_index_onluo = df_lab_ner_clean.columns.get_loc('org_names_listed_uo')
    col_index_lc2 = df_lab_ner_clean.columns.get_loc('listed_count')

    for idx in range(len(df_lab_ner_clean)):
        org_names_listed = []
        org_names_listed_uo = []
        countries_to_add = []

        for idx_c, country in enumerate(df_lab_ner_clean.iloc[idx, col_index_lc]):
            if country in countries_included:
                listed_name = df_lab_ner_clean.iat[idx, col_index_onl][idx_c]
                listed_name_uo = df_lab_ner_clean.iat[idx, col_index_onluo][idx_c]
                org_names_listed.append(listed_name)
                org_names_listed_uo.append(listed_name_uo)
                countries_to_add.append(country)

        df_lab_ner_clean.iat[idx, col_index_onl] = org_names_listed
        df_lab_ner_clean.iat[idx, col_index_onluo] = org_names_listed_uo
        df_lab_ner_clean.iat[idx, col_index_lc] = countries_to_add
        df_lab_ner_clean.iat[idx, col_index_lc2] = len(org_names_listed)
    
    df_lab_ner_clean = df_lab_ner_clean.drop(['listed_uo_count', 'listing_country_count'], axis=1)
    
    return df_lab_ner_clean

In [23]:
# Process country drop
process_country_drop = True

if process_country_drop:
    df_lab_ner_clean = pd.read_pickle('./data/df_lab_ner_clean.pickle')
    df_lab_ner_clean = drop_countries_labelled_file(df_lab_ner_clean)

In [27]:
df_lab_ner_clean.iloc[-1]['org_names']

['astrazeneca azn',
 'bntx',
 'fauci',
 'johns hopkins university',
 'johnson & johnson jnj',
 'merck & co. merk',
 'the national institute of allergy and infectious diseases']

### Checking similarity of NER word against company names in company list

Rule 1 
- When the NER is only 1 word  
- The comparison is reduced to the set of common words  
- When the similarity ratio is higher than 90% it is accepted

Rule 2
- When the NER is more than 1 word
- When part of the NER exists in the stop words list e.g. 'Capital One Financial Corp', financial and corp are words in the stop words list
- When the similarity ratio is higher than 95% it is accepted

Rule 3
- Remaining words
- Similarity ratio on best matching substring being 90% and higher

In [28]:
def match_companies(df, file_name='df_ner_temp'):
    
    df = df.copy()
    df['filtered_names'] = ''
    df['filtered_names_match'] = ''
    col_index_fn = df.columns.get_loc('filtered_names')
    col_index_fnm = df.columns.get_loc('filtered_names_match')
    #print_on = True
    print_on = False

    for idx, names in tqdm(enumerate(df['org_names'])):
        if print_on:
            print('\n', idx, names, 'labels: ', df.iat[idx,col_index_onl])
        filtered_names = []
        filtered_names_match = []
        for name in names:

            # Rule 1 - NER name only 1 word -> set of common words  -> similarity ratio on set
            if len(name.split(' ')) == 1:
                processor = lambda x: set(x.lower().split()) - set(companies_list_common_words)
                processor_2 = lambda x: x.lower()
                best_match = process.extractOne(query=name, choices=companies_list['name'], processor=processor, scorer=fuzz.token_sort_ratio, score_cutoff=98)
                #best_match = process.extractOne(query=name, choices=companies_list['name'], processor=processor_2, scorer=fuzz.ratio, score_cutoff=75)
                if best_match:
                    if print_on:
                        print('rule1: ', name, '-', best_match, best_match[1])
                    filtered_names.append(name)
                    filtered_names_match.append(best_match[0])            

            # Rule 2 - NER name longer than 1 word plus part of name in stopwords list -> similarity ratio on set
            elif set(name.split(' ')).intersection(org_stopwords):
                processor_2 = lambda x: x.lower()
                best_match = process.extractOne(query=name, choices=companies_list['name'], processor=processor_2, scorer=fuzz.token_set_ratio, score_cutoff=95)
                if best_match:
                    if print_on:
                        print('rule2: ', name, '-', best_match[0], best_match[1])
                    filtered_names.append(name)
                    filtered_names_match.append(best_match[0])

            else:
                best_match = process.extractOne(query=name, choices=companies_list['name'], scorer=fuzz.partial_ratio, score_cutoff=96)
                if best_match:
                    if print_on:
                        print(f'best match: {best_match}')
                        print('rule3: ', name, '-', best_match[0], best_match[1])
                    filtered_names.append(name)
                    filtered_names_match.append(best_match[0])

        df.iat[idx,col_index_fn] = list(set(filtered_names))
        df.iat[idx,col_index_fnm] = list(set(filtered_names_match))
        
        if idx % 5000 == 0:
            df.to_pickle('./data/' + file_name + '.pickle')
    
    return df

In [29]:
def check_matches(df):
    
    df['count_matches'] = ''
    df['count_lab_not_found'] = ''
    df['count_found_not_lab'] = ''
    col_index_onm = df.columns.get_loc('org_names_listed')
    col_index_fnm = df.columns.get_loc('filtered_names_match')
    col_index_cm = df.columns.get_loc('count_matches')
    col_index_clnf = df.columns.get_loc('count_lab_not_found')
    col_index_cfnl = df.columns.get_loc('count_found_not_lab')

    for idx in range(len(df)):
        # True Positives
        count_matches = len(list(set(df.iloc[idx,col_index_onm]).intersection(df.iloc[idx,col_index_fnm])))
        # False Negatives
        count_lab_not_found = len(list(set(df.iloc[idx,col_index_onm]).difference(df.iloc[idx,col_index_fnm]))) # labelled but not found in NER
        # False Positives
        count_found_not_lab = len(list(set(df.iloc[idx,col_index_fnm]).difference(df.iloc[idx,col_index_onm]))) # found in NER but not labelled

        df.iat[idx, col_index_cm] = count_matches
        df.iat[idx, col_index_clnf] = count_lab_not_found
        df.iat[idx, col_index_cfnl] = count_found_not_lab
    
    return df

In [30]:
# Calculate performance

def calculate_performance(df):

    true_pos = sum(df['count_matches'])
    false_neg = sum(df['count_lab_not_found'])
    false_pos = sum(df['count_found_not_lab'])

    precision = true_pos / (true_pos + false_pos) # low number means lots of false positives
    recall = true_pos / (true_pos + false_neg) # low number means lots of false negatives
    f1_score = 2 * (precision * recall) / (precision + recall)
    
    return true_pos, false_neg, false_pos, precision, recall, f1_score

In [31]:
df_lab_ner_clean

Unnamed: 0,author,date,domain,title,url,content,topic_area,org_names_listed,org_names_listed_uo,listed_count,listing_country,org_names
272801,,2020-08-15,finance.yahoo,Julen Lopetegui: We must be at our best to bea...,https://finance.yahoo.com/news/julen-lopetegui...,Head coach Julen Lopetegui knows Sevilla will ...,business,[],[],0,[],"[inter milan, jose mourinho’s, lopetegui, roma..."
200452,,2020-07-11,finance.yahoo,Gallagher Premiership teams will not be punish...,https://uk.finance.yahoo.com/news/gallagher-pr...,Gallagher Premiership clubs will have flexibil...,business,[],[],0,[],"[childs, premiership rugby, the british & iris..."
360615,,2020-12-14,marketscreener,"Don't stockpile food, minister tells British s...",https://www.marketscreener.com/quote/currency/...,"Britain and the EU agreed on Sunday to ""go the...",business,[Tesco PLC],[Tesco],1,[United Kingdom],"[sharma, tesco]"
348240,Joe Hoppe,2020-11-23,marketwatch,"Wynnstay Properties profit halves, lifts dividend",https://www.marketwatch.com/story/wynnstay-pro...,Wynnstay Properties PLC said Monday that its f...,business,[Wynnstay Properties Plc],[Wynnstay Properties PLC],1,[United Kingdom],[wynnstay properties plc]
150912,,2020-06-11,finance.yahoo,U.K. Scientists Defy Johnson to Speak Out on V...,https://ca.finance.yahoo.com/news/johnson-unde...,(Bloomberg) -- U.K. Prime Minister Boris Johns...,business,[],[],0,[],"[conservative party, dominic cummings, ferguso..."
...,...,...,...,...,...,...,...,...,...,...,...,...
169245,By Reuters,2020-06-24,nytimes,"With No Tourists to Watch Migration, Kenyan Op...",https://www.nytimes.com/reuters/2020/06/24/bus...,NAIROBI — For Kenyan-based safari operator Saf...,business,[],[],0,[],"[chaumont, maasai mara, nairobi tented camp, s..."
306948,,2020-09-12,marketscreener,"Pfizer, BioNTech propose expanding COVID-19 va...",https://www.marketscreener.com/news/latest/Pfi...,Pfizer Inc and BioNTech SE\non Saturday propos...,business,[Pfizer Inc.],[Pfizer Inc],1,[United States],[the u.s food and drug\nadministration]
111840,,2020-05-13,marketscreener,Ameresco : Announces Commercial Operations of ...,https://www.marketscreener.com/AMERESCO-INC-64...,Beale Hill Wind Farm is the first renewable ge...,business,"[Ameresco, Inc.]","[Ameresco, Inc.]",1,[United States],"[about ameresco, ameresco executive, ameresco ..."
72139,,2020-04-16,marketscreener,"L'Oréal: News release: ""First Quarter 2020 Sales""",https://www.marketscreener.com/quote/stock/L-O...,"Clichy, 16 April 2020 at 6:00 p.m. First Quart...",business,[],[],0,[],"[appendix, autorit, banco alimentare, columbia..."


### Process company name matching

In [32]:
labelled_data = True

if labelled_data:
    df_lab_ner_matched = match_companies(df_lab_ner_clean)
    df_lab_ner_matched = check_matches(df_lab_ner_matched)
    true_pos, false_neg, false_pos, precision, recall, f1_score = calculate_performance(df_lab_ner_matched)
    print(true_pos, false_neg, false_pos)
    print(f'precision = {round(precision,2)}')
    print(f'recall = {round(recall,2)}')
    print(f'f1 score = {round(f1_score,2)}')
    df_lab_ner_matched.to_pickle('./data/df_lab_ner_matched.pickle')
    

company']
company']
1000it [30:54,  1.85s/it]

520 308 261
precision = 0.67
recall = 0.63
f1 score = 0.65





In [None]:
full_data = False

if full_data:
    df_ner_matched = match_companies(df_ner_clean, file_name='df_ner_matched')
    

In [39]:
df_lab_ner_matched[0:50]#.head(20)

Unnamed: 0,author,date,domain,title,url,content,topic_area,org_names_listed,org_names_listed_uo,listed_count,listing_country,org_names,filtered_names,filtered_names_match,count_matches,count_lab_not_found,count_found_not_lab
272801,,2020-08-15,finance.yahoo,Julen Lopetegui: We must be at our best to bea...,https://finance.yahoo.com/news/julen-lopetegui...,Head coach Julen Lopetegui knows Sevilla will ...,business,[],[],0,[],"[inter milan, jose mourinho’s, lopetegui, roma...",[],[],0,0,0
200452,,2020-07-11,finance.yahoo,Gallagher Premiership teams will not be punish...,https://uk.finance.yahoo.com/news/gallagher-pr...,Gallagher Premiership clubs will have flexibil...,business,[],[],0,[],"[Childs, Premiership Rugby, the British & Iris...",[],[],0,0,0
360615,,2020-12-14,marketscreener,"Don't stockpile food, minister tells British s...",https://www.marketscreener.com/quote/currency/...,"Britain and the EU agreed on Sunday to ""go the...",business,[Tesco PLC],[Tesco],1,[United Kingdom],"[BBC, BRC, EU, Sharma, Sky News, Tesco, The Br...",[],[],0,1,0
348240,Joe Hoppe,2020-11-23,marketwatch,"Wynnstay Properties profit halves, lifts dividend",https://www.marketwatch.com/story/wynnstay-pro...,Wynnstay Properties PLC said Monday that its f...,business,[Wynnstay Properties Plc],[Wynnstay Properties PLC],1,[United Kingdom],[Wynnstay Properties PLC],[Wynnstay Properties PLC],[Wynnstay Properties Plc],1,0,0
150912,,2020-06-11,finance.yahoo,U.K. Scientists Defy Johnson to Speak Out on V...,https://ca.finance.yahoo.com/news/johnson-unde...,(Bloomberg) -- U.K. Prime Minister Boris Johns...,business,[],[],0,[],"[Conservative Party, Dominic Cummings, Ferguso...",[],[],0,0,0
5919,Barbara Kollmeyer,2020-02-18,marketwatch,Apple Warning Rattles European Tech Stocks,https://www.marketwatch.com/articles/apple-war...,AFP via Getty Images European stocks came unde...,business,"[Apple Inc., Dialog Semiconductor Plc, HSBC Ho...","[Apple, Dialog Semiconductor, HSBC]",3,"[United States, United Kingdom, United Kingdom]","[AFP, ASM International, Apple, HSBC, Intesa S...",[],[],0,3,0
30166,Georgina Laud,2020-03-20,express,Coronavirus and paracetamol: Can you take para...,https://www.express.co.uk/life-style/health/12...,"Coronavirus cases are growing daily, and in t...",general,[],[],0,[],"[Daily Express, NEXT WEEK, NHS, National Rail,...",[National Rail],[Canadian National Railway Company],0,0,1
136359,,2020-05-30,finance.yahoo,Coronavirus: UK national debt nears £2tn for f...,https://uk.finance.yahoo.com/news/coronavirus-...,The cost of the coronavirus pandemic is mounti...,business,[],[],0,[],"[Budget, OBR, OECD, The Organisation for Econo...",[],[],0,0,0
208183,,2020-07-15,finance.yahoo,Authentic morning line favorite in Haskell at ...,https://uk.finance.yahoo.com/news/authentic-mo...,Belmont Stakes runner-up Dr Post and Santa Ani...,business,[],[],0,[],"[Churchill Downs, Crown, Dr Post, Fame to Famo...",[Churchill Downs],[Churchill Downs Incorporated],0,0,1
23000,Minnie Wright,2020-03-16,express,Lewis Capaldi Aberdeen: Fans branded 'p****s' ...,https://www.express.co.uk/entertainment/music/...,Lewis Capaldi’s tour arrived in Aberdeen last...,general,[],[],0,[],"[Capaldi, Capaldi’s, Daily Express, P&J, Scott...",[],[],0,0,0


In [102]:
print(df_ner_clean.iloc[6000][['content']].values[0])

Half of Samsung's smartphones are now made in Vietnam, where the coronavirus that has crippled the China operations of Apple and many other firms has so far had only a limited impact on its production. Apple said on Monday it would not meet its revenue guidance for the March quarter due to the coronavirus impact on both production and sales in China, where most iPhones are made. Chinese smartphone maker Xiaomi Corp last week also flagged a hit to its March quarter sales. Huawei, another major Samsung rival, has not announced any production problems, but analysts expect it will also be hit hard due to its heavy reliance on Chinese manufacturing and parts. Many Chinese and foreign firms have begun to re-open China factories that were idled for weeks, but shortages of workers and other problems have in many cases kept output to a minimum. Samsung has also largely ceded the China market to its rivals in recent years, meaning it won't suffer from the store closures and drop in demand that i

In [100]:
print(df_ner_clean.iloc[6000][['filtered_names_match']].values[0])

{'Samsung Electronics CO L', 'apple computer, inc.', 'sealand capital galaxy limited', 'apple hospitality reit inc', 'Samsung Electronics Co L', 'apple inc', 'apple inc.', 'samsung electronics co l'}
