# Process Named Entity Recognition on Article Text & Article Titles

### Imports

In [18]:
!which jupyter
import numpy as np
import pandas as pd
from pprint import pprint
from tqdm import tqdm
import re

# NER Imports
import spacy
import en_core_web_sm
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

# Set up ner processor from SpaCy
ner_processor = en_core_web_sm.load()

/home/ubuntu/thesis_env2/bin/jupyter


In [2]:
# Load the article dataset from CSV into the notebook-level frame `df`
df = pd.read_csv('./data/covid19_articles_20201231.csv')

In [3]:
# date to datetime
# Convert the 'date' column with pd.to_datetime, then print the column summary
df['date'] = pd.to_datetime(df['date'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 369047 entries, 0 to 369046
Data columns (total 7 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   author      181799 non-null  object        
 1   date        369047 non-null  datetime64[ns]
 2   domain      369047 non-null  object        
 3   title       368962 non-null  object        
 4   url         369047 non-null  object        
 5   content     369047 non-null  object        
 6   topic_area  369047 non-null  object        
dtypes: datetime64[ns](1), object(6)
memory usage: 19.7+ MB


In [4]:
df.head(3)

Unnamed: 0,author,date,domain,title,url,content,topic_area
0,Thomas Hughes,2020-01-02,marketbeat,Three Industrial Giants You Should Own In 2020,https://www.marketbeat.com/originals/three-ind...,With the end of the year just around the corne...,business
1,Thomas Hughes,2020-01-03,marketbeat,Labor Stocks Are Going To Break Out In 2020,https://www.marketbeat.com/originals/labor-sto...,The labor markets were one of the most closely...,business
2,Steve Anderson,2020-01-03,marketbeat,"Tesla (TSLA) Breaks Shipment Record, Beats Est...",https://www.marketbeat.com/originals/teal-brea...,"It could be forgiven, that some might think th...",business


## NER Pre-Processing & Tagging Functions

In [5]:
# Drop long articles - cannot be processed by Spacy - articles where text longer than 1,000,000 characters

def drop_long_articles(df, max_chars=1000000):
    """Keep only the articles whose ``content`` is short enough to be tagged.

    Parameters
    ----------
    df : pandas.DataFrame
        Articles; must contain a ``content`` column.
    max_chars : int, default 1000000
        Longest ``content`` (in characters) that is kept.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` (original index labels preserved) whose content is
        a string of at most ``max_chars`` characters.  Rows whose content is
        not a string (e.g. NaN) are dropped as well.
    """
    num_articles_pre_drop = len(df)
    print('no. of articles: ', num_articles_pre_drop)

    # Non-string content (e.g. NaN) has no len(); treat it as "cannot be
    # processed" and drop it instead of raising TypeError half-way through.
    keep = df['content'].map(
        lambda text: isinstance(text, str) and len(text) <= max_chars
    ).astype(bool)
    df = df[keep]

    num_articles_post_drop = len(df)
    print(f'no. of articles after dropping long articles: {num_articles_post_drop}, no. articles dropped: {num_articles_pre_drop - num_articles_post_drop}')
    return df

# NER Tagging

def ner_tagging(df, row_index_start):
    """Tag organisation names in article bodies.

    Works on a copy of ``df``.  For every row position from
    ``row_index_start`` onwards, the ``content`` text is run through the
    module-level ``ner_processor`` and the sorted, de-duplicated texts of all
    entities labelled ``ORG`` are stored in the existing ``org_names`` column.

    The frame is pickled to ``./data/ner/df_ner_210912.pickle`` whenever the
    row position is a multiple of 10,000 and once more when the loop ends.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``content`` and ``org_names`` columns.
    row_index_start : int
        Positional row index at which tagging starts.

    Returns
    -------
    pandas.DataFrame
        The tagged copy.
    """
    tagged = df.copy()
    org_col = tagged.columns.get_loc('org_names')
    content_col = tagged.columns.get_loc('content')
    checkpoint_path = './data/ner/df_ner_210912.pickle'
    rows_to_tag = len(tagged.iloc[row_index_start:])

    for offset in tqdm(range(rows_to_tag)):
        position = row_index_start + offset
        doc = ner_processor(tagged.iat[position, content_col])
        orgs = {ent.text for ent in doc.ents if ent.label_ == 'ORG'}
        tagged.iat[position, org_col] = sorted(orgs)

        # Checkpoint every 10,000 articles so an interrupted run can resume
        if position % 10000 == 0:
            tagged.to_pickle(checkpoint_path)

    # Final save once every remaining row has been tagged
    tagged.to_pickle(checkpoint_path)

    return tagged

def ner_tagging_titles(df, row_index_start):
    """Tag organisation names in article titles.

    For every row position from ``row_index_start`` onwards, the ``title`` is
    run through the module-level ``ner_processor`` and the sorted,
    de-duplicated texts of all entities labelled ``ORG`` are stored in the
    existing ``org_names_titles`` column.  Rows with a missing title get an
    empty list, so every processed cell holds the same type.

    Note that ``df`` is modified in place (no copy is taken); pass a copy if
    the original must be preserved.

    The frame is pickled to ``./data/ner/df_ner_titles.pickle`` whenever the
    row position is a multiple of 50,000 and once more when the loop ends.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``title`` and ``org_names_titles`` columns.
    row_index_start : int
        Positional row index at which tagging starts.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now tagged.
    """
    org_col = df.columns.get_loc('org_names_titles')
    title_col = df.columns.get_loc('title')
    checkpoint_path = './data/ner/df_ner_titles.pickle'

    for offset in tqdm(range(len(df.iloc[row_index_start:]))):
        row = row_index_start + offset
        title = df.iat[row, title_col]

        if pd.isna(title):
            # No title to tag: store an empty list rather than '' so the
            # column holds lists throughout.
            df.iat[row, org_col] = []
        else:
            doc = ner_processor(title)
            df.iat[row, org_col] = sorted({ent.text for ent in doc.ents if ent.label_ == 'ORG'})

        # Checkpoint every 50,000 titles processed
        if row % 50000 == 0:
            df.to_pickle(checkpoint_path)

    # Save df after tagging
    df.to_pickle(checkpoint_path)

    return df

### Tag Full Data Set

In [6]:
# NER Tagging - takes about 15 hours for full dataset
# tag_full_data=True runs the tagging; False loads the previously saved pickle.
# part_processing=True continues a partly tagged file instead of starting over.

tag_full_data = False
part_processing = False

if tag_full_data:
    
    if part_processing:
        # NOTE(review): this resumes from df_ner_210717.pickle, while ner_tagging
        # writes its checkpoints to df_ner_210912.pickle - confirm which file is intended
        df_ner = pd.read_pickle('./data/ner/df_ner_210717.pickle') # Load part processed file
        # First row whose org_names still equals the '' placeholder
        row_label_start = df_ner[df_ner['org_names'] == ""].index[0]
        row_index_start = df_ner.index.get_loc(row_label_start) # Start point for continued processing
        df_ner = ner_tagging(df_ner, row_index_start) # For part processed file
        
    else:
        row_index_start = 0
        # NOTE(review): clean_text is not defined in the visible part of this
        # notebook - confirm it exists before enabling this branch
        df = clean_text(df)
        df = drop_long_articles(df)
        df_ner = df.copy()
        # '' marks a row as not yet tagged
        df_ner['org_names'] = ''
        df_ner = ner_tagging(df_ner, row_index_start)
        
    df_ner.to_pickle('./data/ner/df_ner_210912.pickle')
else:
    df_ner = pd.read_pickle('./data/ner/df_ner_210912.pickle')

In [43]:
# NER Tagging - titles - 30 minutes
# tag_full_data=True runs the title tagging; False loads the saved pickle.
# Either way df_ner ends up holding the frame with both 'org_names' and
# 'org_names_titles', which is what the following cells display.

tag_full_data = False

if tag_full_data:
    row_index_start = 0
    df_ner_titles = df_ner.copy()
    # '' marks a row as not yet tagged
    df_ner_titles['org_names_titles'] = ''
    df_ner_titles = ner_tagging_titles(df_ner_titles, row_index_start)
    df_ner_titles.to_pickle('./data/ner/df_ner_tagged_articles_titles.pickle')
    # Previously only the cached branch rebound df_ner, so after a fresh
    # tagging run df_ner still lacked the 'org_names_titles' column.
    df_ner = df_ner_titles
else:
    df_ner = pd.read_pickle('./data/ner/df_ner_tagged_articles_titles.pickle')

In [44]:
df_ner

Unnamed: 0,author,date,domain,title,url,content,topic_area,org_names,org_names_titles
0,Thomas Hughes,2020-01-02,marketbeat,Three Industrial Giants You Should Own In 2020,https://www.marketbeat.com/originals/three-ind...,With the end of the year just around the corne...,business,"[BA, Boeing, CAT, CMI, Caterpillar, Cummins In...",[]
1,Thomas Hughes,2020-01-03,marketbeat,Labor Stocks Are Going To Break Out In 2020,https://www.marketbeat.com/originals/labor-sto...,The labor markets were one of the most closely...,business,"[ADP, AKA, CTAS, Cintas, Cloud-Based Services ...",[]
2,Steve Anderson,2020-01-03,marketbeat,"Tesla (TSLA) Breaks Shipment Record, Beats Est...",https://www.marketbeat.com/originals/teal-brea...,"It could be forgiven, that some might think th...",business,"[Credit Suisse, FactSet, Ford, NYSE, Tesla, We...",[]
3,Roberto Torres,2020-01-03,ciodive,"On the road to AI adoption, execs grapple with...",https://www.ciodive.com/news/ai-adoption-execs...,CIOs kicked off 2019 with AI as an item to wat...,tech,"[AI, CIO, Gartner, Gartner's, H&M, IBM, IBM Data]",[AI]
4,Alden Wicker,2020-01-06,instyle,Red Carpet Sustainability After Coronavirus Sh...,https://www.instyle.com/fashion/red-carpet-cor...,When the coronavirus pandemic is over and life...,consumer,"[11:30pm PST, BAFTA, CDC, Chanel, Instagram, L...",[Red Carpet Sustainability]
...,...,...,...,...,...,...,...,...,...
369042,Polly Harrison,2020-12-31,thefintechtimes,A Human Touch Will Be a Competitive Edge After...,https://thefintechtimes.com/53867-2/,Niels Pedersen is a Chartered Accountant and S...,finance,"[ATM, BBC Radio Scotland, C. Hoare & Co., Lloy...",[]
369043,,2020-12-31,marketscreener,Datametrex AI : Announces Deploying NexaSecuri...,https://www.marketscreener.com/quote/stock/DAT...,Datametrex AI Limited (TSXV: DM) (FSE: D4G) (O...,business,"[Artificial Intelligence, Artificial Intellige...",[Datametrex AI]
369044,Polly Harrison,2020-12-31,thefintechtimes,"US Payments: Smart Pension, Episode Six, PAAY ...",https://thefintechtimes.com/us-payments-smart-...,"This December, The Fintech Times is asking ind...",finance,"[EMV, Jodan Ledford, PAAY, PEP, PR & Editorial...","[PAAY, Smart Pension]"
369045,,2020-12-31,marketscreener,"WESTWATER RESOURCES, INC. : Entry into a Mater...",https://www.marketscreener.com/quote/stock/WES...,Item 1.01Entry into a Material Definitive Agre...,business,"[Energy\nCorp., Glimpses, Neutron Energy, Inc....","[Financial Statements and, INC, WESTWATER RESO..."


## Clean Tagged Names Functions
- Remove leading and trailing non-letter characters e.g. ' % 2,4 ARCU Aircraft Leasing Limited' -> 'ARCU Aircraft Leasing Limited' Note: this would also drop 3M but companies with number as first letter are very rare
- Remove names that are 3 characters or fewer
- Remove names found in org_stopwords list

In [19]:
# Load company name stopwords

def load_stopwords(file_path, sheet_name):
    """Load the organisation-name stopword phrases from an Excel sheet.

    The phrases are read from the first column of the sheet (no header row),
    lower-cased, extended with ``'company'``, de-duplicated and sorted.

    Parameters
    ----------
    file_path : str
        Path of the Excel workbook holding the stopwords.
    sheet_name : str
        Name of the sheet to read.

    Returns
    -------
    list of str
        Sorted, lower-case, unique stopword phrases.
    """
    # Read from the path that was passed in; the previous version ignored
    # ``file_path`` and silently used the notebook global ``filepath_stopwords``.
    stopword_sheet = pd.read_excel(file_path, sheet_name = sheet_name, header = None)

    # Blank cells come back as NaN and have no .lower(); skip them.
    phrases = {str(phrase).lower() for phrase in stopword_sheet[0].dropna()}
    phrases.add('company')

    org_stopwords = sorted(phrases)
    print('number of phrases in org stopwords list: ', len(org_stopwords))
    return org_stopwords

# Clean Tagged Names

def clean_tagged_names(df, org_stopwords, col_name):
    """Normalise the tagged organisation names stored in ``col_name``.

    Works on a copy of ``df``.  For each row the names are:

    1. stripped of leading and trailing characters outside ``a-z`` / ``A-Z``,
    2. dropped if 3 characters or fewer remain,
    3. lower-cased,
    4. dropped if they appear in ``org_stopwords``,

    and the surviving unique names are written back as a sorted list.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the tagged names.
    org_stopwords : iterable of str
        Lower-case names to exclude.
    col_name : str
        Column whose cells are iterables of name strings.

    Returns
    -------
    pandas.DataFrame
        Cleaned copy of ``df``.
    """
    df_ner_clean = df.copy()
    col_index = df_ner_clean.columns.get_loc(col_name)

    # Loop invariants: build the pattern and the stopword set once, not per row
    edge_non_letters = re.compile('^[^a-zA-Z]*|[^a-zA-Z]*$')
    stopword_set = set(org_stopwords)

    for idx, org_names in tqdm(enumerate(df_ner_clean[col_name]), total=len(df_ner_clean)):

        # remove leading and trailing non-letter characters
        stripped = {edge_non_letters.sub('', name) for name in org_names}

        # remove names that are less than 4 characters, lower-case the rest
        kept = {name.lower() for name in stripped if len(name) > 3}

        # exclude names in org_stopwords list - difference between 2 sets
        df_ner_clean.iat[idx, col_index] = sorted(kept - stopword_set)

    return df_ner_clean

# Remove found names that are a substring of a longer name also found

def remove_substring_names(df, col_name):
    """Drop names that are contained in another name of the same row.

    Works on a copy of ``df``.  Within each cell of ``col_name`` a name is
    kept only if it is not a substring of any other distinct name in that
    cell; the kept names are written back as a sorted list.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the name lists.
    col_name : str
        Column whose cells are iterables of name strings.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the reduced name lists.
    """
    result = df.copy()
    target_col = result.columns.get_loc(col_name)

    for row_pos, names in tqdm(enumerate(result[col_name])):
        distinct = set(names)
        survivors = [
            candidate
            for candidate in names
            # compare against every other distinct name in the row
            if all(candidate not in other for other in distinct - {candidate})
        ]
        result.iat[row_pos, target_col] = sorted(survivors)

    return result

In [45]:
# Clean Articles
# Cleans the names in the 'org_names' column, first for the labelled data set
# and then for the full data set. Each stage has its own flag: True recomputes
# and saves the result, False loads the previously saved pickle.
# NOTE(review): org_stopwords is only bound when one of the flags is True, yet
# match_companies reads it as a notebook global - confirm it is loaded before matching.

# Load stopwords
filepath_stopwords = './data/ner/company_stopwords.xlsx'
sheet_name = 'all'

# Process labelled data set 
process_labelled_data = False

if process_labelled_data:
    org_stopwords = load_stopwords(filepath_stopwords, sheet_name)
    df_lab_ner = pd.read_pickle('./data/ner/df_lab_ner.pickle')
    df_lab_ner_clean = clean_tagged_names(df_lab_ner, org_stopwords, 'org_names')
    # Substring removal is currently switched off
    #df_lab_ner_clean = remove_substring_names(df_lab_ner_clean, 'org_names')
    df_lab_ner_clean.to_pickle('./data/ner/df_lab_ner_clean.pickle')
else:
    df_lab_ner_clean = pd.read_pickle('./data/ner/df_lab_ner_clean.pickle')
    
# Process full data set 
process_full_data = False

if process_full_data:
    org_stopwords = load_stopwords(filepath_stopwords, sheet_name)
    df_ner = pd.read_pickle('./data/ner/df_ner_tagged_articles_titles.pickle')
    df_ner_clean = clean_tagged_names(df_ner, org_stopwords, 'org_names')
    # Substring removal is currently switched off
    #df_ner_clean = remove_substring_names(df_ner_clean, 'org_names')
    df_ner_clean.to_pickle('./data/ner/df_ner_clean_articles_211120.pickle')
else:
    df_ner_clean = pd.read_pickle('./data/ner/df_ner_clean_articles_211120.pickle')
    print(len(df_ner_clean))

number of phrases in org stopwords list:  141


369045it [00:20, 17658.27it/s]


In [46]:
# Clean Titles
# Cleans the names in the 'org_names_titles' column, starting from the frame
# whose 'org_names' column was cleaned and saved by the articles stage.
# True recomputes and saves the result, False loads the saved pickle.

# Load stopwords
filepath_stopwords = './data/ner/company_stopwords.xlsx'
sheet_name = 'all'

process_full_data = False

if process_full_data:
    org_stopwords = load_stopwords(filepath_stopwords, sheet_name)
    df_ner_titles = pd.read_pickle('./data/ner/df_ner_clean_articles_211120.pickle')    
    # Rebinds df_ner_clean: it now has both name columns cleaned
    df_ner_clean = clean_tagged_names(df_ner_titles, org_stopwords, 'org_names_titles')
    # Substring removal is currently switched off
    #df_ner_clean = remove_substring_names(df_ner_clean, 'org_names_titles')
    df_ner_clean.to_pickle('./data/ner/df_ner_clean_articles_titles_211120.pickle')
else:
    df_ner_clean = pd.read_pickle('./data/ner/df_ner_clean_articles_titles_211120.pickle')
    print(len(df_ner_clean))

number of phrases in org stopwords list:  141


369045it [00:12, 28735.15it/s]


In [47]:
df_ner_clean

Unnamed: 0,author,date,domain,title,url,content,topic_area,org_names,org_names_titles
0,Thomas Hughes,2020-01-02,marketbeat,Three Industrial Giants You Should Own In 2020,https://www.marketbeat.com/originals/three-ind...,With the end of the year just around the corne...,business,"[boeing, caterpillar, cummins inc, deere & com...",[]
1,Thomas Hughes,2020-01-03,marketbeat,Labor Stocks Are Going To Break Out In 2020,https://www.marketbeat.com/originals/labor-sto...,The labor markets were one of the most closely...,business,"[cintas, cloud-based services for employers ar...",[]
2,Steve Anderson,2020-01-03,marketbeat,"Tesla (TSLA) Breaks Shipment Record, Beats Est...",https://www.marketbeat.com/originals/teal-brea...,"It could be forgiven, that some might think th...",business,"[credit suisse, factset, ford, tesla, wedbush ...",[]
3,Roberto Torres,2020-01-03,ciodive,"On the road to AI adoption, execs grapple with...",https://www.ciodive.com/news/ai-adoption-execs...,CIOs kicked off 2019 with AI as an item to wat...,tech,"[gartner, gartner's, ibm data]",[]
4,Alden Wicker,2020-01-06,instyle,Red Carpet Sustainability After Coronavirus Sh...,https://www.instyle.com/fashion/red-carpet-cor...,When the coronavirus pandemic is over and life...,consumer,"[bafta, chanel, instagram, léa seydoux’s, new ...",[red carpet sustainability]
...,...,...,...,...,...,...,...,...,...
369042,Polly Harrison,2020-12-31,thefintechtimes,A Human Touch Will Be a Competitive Edge After...,https://thefintechtimes.com/53867-2/,Niels Pedersen is a Chartered Accountant and S...,finance,"[a chartered accountant and senior lecturer, b...",[]
369043,,2020-12-31,marketscreener,Datametrex AI : Announces Deploying NexaSecuri...,https://www.marketscreener.com/quote/stock/DAT...,Datametrex AI Limited (TSXV: DM) (FSE: D4G) (O...,business,"[artificial intelligence, artificial intellige...",[datametrex ai]
369044,Polly Harrison,2020-12-31,thefintechtimes,"US Payments: Smart Pension, Episode Six, PAAY ...",https://thefintechtimes.com/us-payments-smart-...,"This December, The Fintech Times is asking ind...",finance,"[jodan ledford, paay, paya, pr & editorials, p...","[paay, smart pension]"
369045,,2020-12-31,marketscreener,"WESTWATER RESOURCES, INC. : Entry into a Mater...",https://www.marketscreener.com/quote/stock/WES...,Item 1.01Entry into a Material Definitive Agre...,business,"[energy\ncorp, glimpses, neutron energy, inc.'...","[financial statements and, westwater resources]"


### General queries/checking on tagged names

In [19]:
import itertools 

# Sorted list of every distinct org name found across all articles,
# saved as a one-column frame; the cell displays how many there are
ner_org_names = sorted(set(itertools.chain.from_iterable(df_ner_clean['org_names'])))
df_ner_org_names = pd.DataFrame(ner_org_names)
df_ner_org_names.to_pickle('./data/df_ner_org_names.pickle')
len(ner_org_names)

957660

In [20]:
#df_ner[df_ner['org_names'].apply(lambda x: 'guaranty agreement' in x)]
#df_ner_clean[df_ner_clean['org_names'].apply(lambda x: 'a\ncredit and guaranty agreement' in x)]

In [21]:
#ner_org_names[::-1]
ner_org_names[:5]

['" & RSA Public/Private-Key',
 '" - Department of Economic Affairs',
 '" - Ministry of Finance',
 '" Arakan Army',
 '" BUSINESS REVIEW Office Software and']

### Import listed companies

Import listed companies from pre compiled list and reduce to countries of interest

In [20]:
def load_listed_companies(file_path, sheet_name, countries, min_name_length=4):
    """Load the listed-company table and reduce it to usable rows.

    Parameters
    ----------
    file_path : str
        Path of the Excel workbook holding the company list.
    sheet_name : str
        Name of the sheet to read; it must have ``name`` and ``country`` columns.
    countries : list of str
        Countries of interest; rows from other countries are dropped.
    min_name_length : int, default 4
        Shortest company name (in characters) that is kept.

    Returns
    -------
    pandas.DataFrame
        Rows (original index labels preserved) whose name is at least
        ``min_name_length`` characters long and whose country is in
        ``countries``.  Rows with a missing name are dropped as well.
    """
    companies_list = pd.read_excel(file_path, sheet_name = sheet_name)

    # Drop company names shorter than min_name_length. A missing name has a
    # length of NaN, and NaN >= n is False, so those rows are dropped too
    # rather than reaching code that calls string methods on each name.
    long_enough = companies_list['name'].str.len() >= min_name_length

    # Countries of interest
    in_scope = companies_list['country'].isin(countries)

    return companies_list[long_enough & in_scope]

In [21]:
# Import listed companies
# Reads the company sheet, keeps only the four countries below, then prints
# the number of companies per country and the total
filepath_companies = './data/ner/company_names_listed.xlsx'
sheet_name = 'company_names'
countries_included = ['United States', 'Canada', 'Australia', 'United Kingdom']

companies_list = load_listed_companies(filepath_companies, sheet_name, countries_included)

print(companies_list['country'].value_counts())
print(len(companies_list))

United States     8848
Canada            3405
Australia         1873
United Kingdom    1424
Name: country, dtype: int64
15550


In [22]:
companies_list.head(3)

Unnamed: 0,id,name,common_names,ticker_symbol,country,industry,subindustry
0,35931,"The Ultimate Software Group, Inc.",,ULTI,United States,Information Technology,Software
1,35908,"U.S. Personnel, Inc.",,USPE,United States,Industrials,Professional Services
2,203152,e.Digital Corporation,,EDIG,United States,Information Technology,"Technology Hardware, Storage and Peripherals"


### Most common words in listed companies names

Use in matching algorithm

In [23]:
from collections import Counter

# Count every whitespace-separated, lower-cased token across all company names
results = Counter(
    token
    for name_tokens in companies_list['name'].str.lower().str.split()
    for token in name_tokens
)
# Tokens that occur at least 100 times are treated as common words
companies_list_common_words = sorted(token for token, count in results.items() if count > 99)

In [24]:
#print(results)

In [25]:
print(companies_list_common_words)

['&', 'american', 'and', 'bancorp,', 'bank', 'capital', 'company', 'corp.', 'corporation', 'energy', 'energy,', 'exploration', 'financial', 'first', 'global', 'gold', 'group', 'group,', 'holdings', 'holdings,', 'inc.', 'incorporated', 'international', 'international,', 'investment', 'limited', 'ltd', 'ltd.', 'metals', 'minerals', 'mining', 'of', 'oil', 'pharmaceuticals,', 'plc', 'resources', 'services', 'systems,', 'technologies', 'technologies,', 'technology', 'the', 'therapeutics,', 'trust', 'ventures']


### Checking similarity of NER word against company names in company list

Rule 1 
- When the NER name is only 1 word  
- Common company-name words (e.g. 'inc.', 'corporation') are removed from each listed company name before comparison  
- The match is accepted when the similarity ratio is 98% or higher

Rule 2
- When the NER is more than 1 word
- When part of the NER exists in the stop words list e.g. 'Capital One Financial Corp', financial and corp are words in the stop words list
- When the similarity ratio is higher than 95% it is accepted

Rule 3
- Remaining words
- The match is accepted when the similarity ratio on the best-matching substring is 96% or higher

In [26]:
def _resolve_org_stopwords(org_stopwords):
    """Return the stopwords to use for Rule 2, falling back to the notebook global."""
    if org_stopwords is not None:
        return org_stopwords
    try:
        return globals()['org_stopwords']
    except KeyError:
        raise NameError(
            "org_stopwords is not defined: pass org_stopwords=... or run load_stopwords() first"
        ) from None


def _match_name_column(df, source_col, names_col, matches_col, common_words,
                       companies, stopwords, rule3_processor, verbose):
    """Fuzzy-match every name in ``source_col`` against ``companies['name']``.

    Shared implementation behind match_companies and match_companies_titles.
    Works on a copy of ``df``; for each row the NER names that found a match
    go to ``names_col`` and the matched company names to ``matches_col``
    (both as sorted, de-duplicated lists).

    ``rule3_processor`` is the processor handed to fuzzywuzzy for Rule 3;
    ``None`` means "do not pass one", i.e. use the library default.
    """
    df = df.copy()
    df[names_col] = ''
    df[matches_col] = ''
    col_index_fn = df.columns.get_loc(names_col)
    col_index_fnm = df.columns.get_loc(matches_col)

    # Loop invariants, built once instead of once per candidate company
    common_word_set = set(common_words)
    stopword_set = set(stopwords)
    choices = companies['name']
    rule3_kwargs = {} if rule3_processor is None else {'processor': rule3_processor}

    # NOTE(review): this processor returns a set, not a string. It is kept
    # exactly as before because the saved results were produced with it -
    # confirm how fuzzywuzzy scores non-string values before changing it.
    def drop_common_words(text):
        return set(text.lower().split()) - common_word_set

    def lowercase(text):
        return text.lower()

    for idx, names in tqdm(enumerate(df[source_col]), total=len(df)):
        if verbose:
            print('\n', idx, names)
        filtered_names = []
        filtered_names_match = []

        for name in names:
            words = name.split(' ')

            if len(words) == 1:
                # Rule 1 - NER name only 1 word -> common words removed -> similarity ratio on set
                rule = 'rule1'
                best_match = process.extractOne(query=name, choices=choices, processor=drop_common_words, scorer=fuzz.token_sort_ratio, score_cutoff=98)
            elif stopword_set.intersection(words):
                # Rule 2 - NER name longer than 1 word plus part of name in stopwords list -> similarity ratio on set
                rule = 'rule2'
                best_match = process.extractOne(query=name, choices=choices, processor=lowercase, scorer=fuzz.token_set_ratio, score_cutoff=95)
            else:
                # Rule 3 - remaining names -> similarity ratio on best matching substring
                rule = 'rule3'
                best_match = process.extractOne(query=name, choices=choices, scorer=fuzz.partial_ratio, score_cutoff=96, **rule3_kwargs)

            if best_match:
                if verbose:
                    print(f'{rule}: ', name, '-', best_match[0], best_match[1])
                filtered_names.append(name)
                filtered_names_match.append(best_match[0])

        df.iat[idx, col_index_fn] = sorted(set(filtered_names))
        df.iat[idx, col_index_fnm] = sorted(set(filtered_names_match))

    return df


def match_companies(df, companies_list_common_words, companies_list, file_name='df_ner_temp',
                    org_stopwords=None, verbose=False):
    """Match the article-body NER names against the listed companies.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``org_names`` column of name lists.
    companies_list_common_words : iterable of str
        Lower-case words that are ignored in company names under Rule 1.
    companies_list : pandas.DataFrame
        Listed companies; its ``name`` column supplies the match candidates.
    file_name : str, default 'df_ner_temp'
        Currently unused; kept so existing calls keep working.
    org_stopwords : iterable of str, optional
        Words that route a multi-word name to Rule 2.  When omitted, the
        notebook-level ``org_stopwords`` is used, as before.
    verbose : bool, default False
        Print every row and every accepted match (debugging aid).

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``filtered_names`` and ``filtered_names_match``
        columns added (or reset).

    Raises
    ------
    NameError
        If no stopwords are passed and the notebook global is not defined.
    """
    return _match_name_column(
        df,
        source_col='org_names',
        names_col='filtered_names',
        matches_col='filtered_names_match',
        common_words=companies_list_common_words,
        companies=companies_list,
        stopwords=_resolve_org_stopwords(org_stopwords),
        rule3_processor=None,
        verbose=verbose,
    )


def match_companies_titles(df, file_name='df_ner_temp', org_stopwords=None, verbose=False):
    """Match the title NER names against the listed companies.

    Uses the notebook-level ``companies_list_common_words`` and
    ``companies_list``, as before.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``org_names_titles`` column of name lists.
    file_name : str, default 'df_ner_temp'
        Currently unused; kept so existing calls keep working.
    org_stopwords : iterable of str, optional
        Words that route a multi-word name to Rule 2.  When omitted, the
        notebook-level ``org_stopwords`` is used, as before.
    verbose : bool, default False
        Print every row and every accepted match (debugging aid).

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``filtered_names_titles`` and
        ``filtered_names_titles_match`` columns added (or reset).  The second
        column was previously written as ``filtered_names_match_titles``,
        which is not the name process_ner_titles looks for when resuming.

    Raises
    ------
    NameError
        If no stopwords are passed and the notebook global is not defined.
    """
    return _match_name_column(
        df,
        source_col='org_names_titles',
        names_col='filtered_names_titles',
        matches_col='filtered_names_titles_match',
        common_words=companies_list_common_words,
        companies=companies_list,
        stopwords=_resolve_org_stopwords(org_stopwords),
        # The titles variant has always lower-cased both sides for Rule 3
        rule3_processor=lambda text: text.lower(),
        verbose=verbose,
    )

In [27]:
#df_lab_ner_clean

### Process company name matching

In [28]:
# Total processing takes about 7 days, so it is run in stages: each partial
# call matches the next `articles_to_process` articles and saves a checkpoint.

def process_ner(full_data, partial_data, first_pass, articles_to_process, logger):
    """Run (or load) the company matching for article bodies.

    Parameters
    ----------
    full_data : bool
        Match the whole cleaned data set in one go and save it to
        ``./data/ner/df_ner_matched.pickle``.
    partial_data : bool
        Only consulted when ``full_data`` is False.  Match the next
        ``articles_to_process`` rows and save the checkpoint
        ``./data/ner/df_ner_matched_211120.pickle``.
    first_pass : bool
        Partial mode only: start at row 0 of the cleaned file instead of
        resuming from the checkpoint.
    articles_to_process : int
        Number of rows matched per partial call.
    logger : logging.Logger
        Receives the start / end row positions of a partial call.

    Returns
    -------
    pandas.DataFrame
        The matched frame that was saved, or - when both flags are False -
        the frame loaded from the checkpoint.
    """
    clean_path = './data/ner/df_ner_clean_articles_titles_211120.pickle'
    checkpoint_path = './data/ner/df_ner_matched_211120.pickle'

    if full_data:
        df_ner_clean = pd.read_pickle(clean_path)
        df_ner_matched = match_companies(df_ner_clean, companies_list_common_words, companies_list)
        df_ner_matched.to_pickle('./data/ner/df_ner_matched.pickle')
        return df_ner_matched

    if not partial_data:
        return pd.read_pickle(checkpoint_path)

    if first_pass:
        df_ner_matched = pd.read_pickle(clean_path) # only for first time
        start_index = 0
    else:
        df_ner_matched = pd.read_pickle(checkpoint_path)
        # Rows not yet matched still hold NaN in the result column
        pending = df_ner_matched['filtered_names_match'].isna()
        if not pending.any():
            # Nothing left: avoid the IndexError that .iloc[0] would raise
            print('all articles already matched')
            logger.info("all articles already matched")
            return df_ner_matched
        start_label = df_ner_matched[pending].index[0]
        print(f'start index name: {start_label}')
        start_index = df_ner_matched.index.get_loc(start_label)
        print(f'start index loc: {start_index}')

    end_index = start_index + articles_to_process
    print(f'end index: {end_index}')
    logger.info(f"start index: {start_index}, end_index: {end_index}")

    df_ner_matched_subset = match_companies(df_ner_matched[start_index:end_index], companies_list_common_words, companies_list)

    # pd.concat replaces DataFrame.append, which no longer exists in pandas 2
    df_ner_matched_new = pd.concat([
        df_ner_matched[:start_index],
        df_ner_matched_subset,
        df_ner_matched[end_index:],
    ])
    df_ner_matched_new.to_pickle(checkpoint_path)
    return df_ner_matched_new
        
def process_ner_titles(full_data, partial_data, first_pass, articles_to_process, logger):
    """Run (or load) the company matching for article titles.

    Parameters
    ----------
    full_data : bool
        Match all titles in one go and save the result to
        ``./data/ner/df_ner_matched_articles_titles.pickle``.
    partial_data : bool
        Only consulted when ``full_data`` is False.  Match the next
        ``articles_to_process`` rows and save to the same file.
    first_pass : bool
        Partial mode only: start at row 0 of the cleaned titles file instead
        of resuming.
    articles_to_process : int
        Number of rows matched per partial call.
    logger : logging.Logger
        Receives the start / end row positions of a partial call.

    Returns
    -------
    pandas.DataFrame
        The matched frame that was saved, or - when both flags are False -
        the frame loaded from the output file.
    """
    output_path = './data/ner/df_ner_matched_articles_titles.pickle'

    if full_data:
        df_ner_clean = pd.read_pickle('./data/ner/df_ner_matched_211120.pickle')
        # Titles need match_companies_titles; the previous call to
        # match_companies omitted two required arguments and raised TypeError.
        df_ner_matched = match_companies_titles(df_ner_clean, file_name='df_ner_matched_titles')
        df_ner_matched.to_pickle(output_path)
        return df_ner_matched

    if not partial_data:
        return pd.read_pickle(output_path)

    if first_pass:
        df_ner_matched = pd.read_pickle('./data/ner/df_ner_clean_titles.pickle') # only for first time
        start_index = 0
    else:
        # NOTE(review): resuming reads df_ner_matched_titles_211030.pickle but
        # results are written to df_ner_matched_articles_titles.pickle, so
        # repeated runs do not advance - confirm which file is the checkpoint.
        df_ner_matched = pd.read_pickle('./data/ner/df_ner_matched_titles_211030.pickle')
        # Rows not yet matched still hold NaN in the result column
        pending = df_ner_matched['filtered_names_titles_match'].isna()
        if not pending.any():
            # Nothing left: avoid the IndexError that .iloc[0] would raise
            print('all titles already matched')
            logger.info("all titles already matched")
            return df_ner_matched
        start_label = df_ner_matched[pending].index[0]
        print(f'start index name: {start_label}')
        start_index = df_ner_matched.index.get_loc(start_label)
        print(f'start index loc: {start_index}')

    end_index = start_index + articles_to_process
    print(f'end index: {end_index}')
    logger.info(f"start index: {start_index}, end_index: {end_index}")

    df_ner_matched_subset = match_companies_titles(df_ner_matched[start_index:end_index])

    # pd.concat replaces DataFrame.append, which no longer exists in pandas 2
    df_ner_matched_new = pd.concat([
        df_ner_matched[:start_index],
        df_ner_matched_subset,
        df_ner_matched[end_index:],
    ])
    df_ner_matched_new.to_pickle(output_path)
    return df_ner_matched_new

import logging

# Run configuration
process_matching = False
full_data = False
partial_data = False
articles_to_process = 100
first_pass = False
num_runs = 2

# logger was used below without ever being created in this notebook. No
# handler is configured here, so INFO messages are only emitted once logging
# is set up (e.g. with logging.basicConfig).
logger = logging.getLogger('ner_matching')

if process_matching:
    for n in range(num_runs):
        print(f'run number {n + 1} of {num_runs}')
        logger.info(f"run number {n + 1} of {num_runs}")
        # Only the very first run may start from the clean file; later runs
        # must resume from the checkpoint or they redo the same rows.
        process_ner(full_data, partial_data, first_pass and n == 0, articles_to_process, logger)
        #process_ner_titles(full_data, partial_data, first_pass, articles_to_process, logger)

# The result is read back from disk, so df_ner_matched is defined whether or
# not matching was run in this session.
matched_pickle = './data/ner/df_ner_matched.pickle' if (process_matching and full_data) else './data/ner/df_ner_matched_211120.pickle'
df_ner_matched = pd.read_pickle(matched_pickle)
#df_ner_matched = pd.read_pickle('./data/ner/df_ner_matched_articles_titles.pickle')

In [54]:
df_ner_matched

Unnamed: 0,author,date,domain,title,url,content,topic_area,org_names,org_names_titles,filtered_names,filtered_names_match
0,Thomas Hughes,2020-01-02,marketbeat,Three Industrial Giants You Should Own In 2020,https://www.marketbeat.com/originals/three-ind...,With the end of the year just around the corne...,business,"[boeing, caterpillar, cummins inc, deere & com...",[],"[boeing, caterpillar, cummins inc, deere & com...","[Caterpillar Inc., Cummins Inc., Deere & Compa..."
1,Thomas Hughes,2020-01-03,marketbeat,Labor Stocks Are Going To Break Out In 2020,https://www.marketbeat.com/originals/labor-sto...,The labor markets were one of the most closely...,business,"[cintas, cloud-based services for employers ar...",[],[cintas],[Cintas Corporation]
2,Steve Anderson,2020-01-03,marketbeat,"Tesla (TSLA) Breaks Shipment Record, Beats Est...",https://www.marketbeat.com/originals/teal-brea...,"It could be forgiven, that some might think th...",business,"[credit suisse, factset, ford, tesla, wedbush ...",[],[tesla],"[Tesla, Inc.]"
3,Roberto Torres,2020-01-03,ciodive,"On the road to AI adoption, execs grapple with...",https://www.ciodive.com/news/ai-adoption-execs...,CIOs kicked off 2019 with AI as an item to wat...,tech,"[gartner, gartner's, ibm data]",[],[gartner],"[Gartner, Inc.]"
4,Alden Wicker,2020-01-06,instyle,Red Carpet Sustainability After Coronavirus Sh...,https://www.instyle.com/fashion/red-carpet-cor...,When the coronavirus pandemic is over and life...,consumer,"[bafta, chanel, instagram, léa seydoux’s, new ...",[red carpet sustainability],[],[]
...,...,...,...,...,...,...,...,...,...,...,...
369042,Polly Harrison,2020-12-31,thefintechtimes,A Human Touch Will Be a Competitive Edge After...,https://thefintechtimes.com/53867-2/,Niels Pedersen is a Chartered Accountant and S...,finance,"[a chartered accountant and senior lecturer, b...",[],,
369043,,2020-12-31,marketscreener,Datametrex AI : Announces Deploying NexaSecuri...,https://www.marketscreener.com/quote/stock/DAT...,Datametrex AI Limited (TSXV: DM) (FSE: D4G) (O...,business,"[artificial intelligence, artificial intellige...",[datametrex ai],,
369044,Polly Harrison,2020-12-31,thefintechtimes,"US Payments: Smart Pension, Episode Six, PAAY ...",https://thefintechtimes.com/us-payments-smart-...,"This December, The Fintech Times is asking ind...",finance,"[jodan ledford, paay, paya, pr & editorials, p...","[paay, smart pension]",,
369045,,2020-12-31,marketscreener,"WESTWATER RESOURCES, INC. : Entry into a Mater...",https://www.marketscreener.com/quote/stock/WES...,Item 1.01Entry into a Material Definitive Agre...,business,"[energy\ncorp, glimpses, neutron energy, inc.'...","[financial statements and, westwater resources]",,


#### Data Review

In [21]:
# Titles Check
# Load the saved title-matching results and print the number of rows
# NOTE(review): this reads from './data/' while the matching functions use
# paths under './data/ner/' - confirm the location of this file

df_ner_matched_titles = pd.read_pickle('./data/df_ner_matched_titles_211030.pickle')
print(len(df_ner_matched_titles))

In [28]:
print(len(df_ner_matched_titles))
print(len(df_ner_matched))

369045
369045


In [22]:
# First and last row of the title results (pd.concat replaces DataFrame.append, which no longer exists in pandas 2)
pd.concat([df_ner_matched_titles.head(1), df_ner_matched_titles.tail(1)])

Unnamed: 0,author,date,domain,title,url,content,topic_area,org_names,org_names_titles,filtered_names_titles,filtered_names_titles_match
0,Thomas Hughes,2020-01-02,marketbeat,Three Industrial Giants You Should Own In 2020,https://www.marketbeat.com/originals/three-ind...,With the end of the year just around the corne...,business,"[BA, Boeing, CAT, CMI, Caterpillar, Cummins In...",[],[],[]
369046,,2020-12-31,marketscreener,FTSE 100 wraps up worst year since 2008 financ...,https://www.marketscreener.com/quote/index/FTS...,"The FTSE 100 lost 1.5%, with consumer stocks, ...",business,"[AJ Bell, British American Tobacco, Connells L...",[],[],[]


In [23]:
# One row per matched company: explode the match lists, then keep only the
# rows where a match is present (an empty list explodes to NaN)
check_titles = df_ner_matched_titles.explode("filtered_names_titles_match")
mask = check_titles['filtered_names_titles_match'].notna()
check_titles[mask]

Unnamed: 0,author,date,domain,title,url,content,topic_area,org_names,org_names_titles,filtered_names_titles,filtered_names_titles_match
46,Chris Markoch,2020-01-16,marketbeat,Why Morgan Stanley’s Earnings Report Matters,https://www.marketbeat.com/originals/why-morga...,"On January 16, Morgan Stanley (NYSE:MS) was th...",business,"[Gorman’s, Instacart Inc., Investment Strategi...",[Morgan Stanley’s Earnings Report],[Morgan Stanley’s Earnings Report],Morgan Stanley
55,Sunny Kim@sunny_newsiee,2020-01-17,cnbc,CDC and Homeland Security begins screening for...,https://www.cnbc.com/2020/01/17/cdc-and-homela...,U.S. health and Homeland Security officials wi...,finance,"[CDC, Homeland Security, The Centers for Disea...","[CDC, Coronavirus, Homeland Security]",[Homeland Security],Homeland Security Corporation
161,Chris Markoch,2020-01-21,marketbeat,Why Bad News May be Good News for Boeing,https://www.marketbeat.com/originals/why-bad-n...,There is a saying that bad news comes in three...,business,"[Airbus, BA, BAC, Bank of America, Boeing, CNB...",[Boeing],[Boeing],The Boeing Company
167,Maggie Fitzgerald@mkmfitzgerald,2020-01-21,cnbc,Stocks making the biggest moves midday: Morgan...,https://www.cnbc.com/2020/01/21/stocks-making-...,Check out the companies making headlines in mi...,finance,"[Akdag, CNBC, Citigroup, Costco, Halliburton, ...","[Morgan Stanley, Tesla]","[Morgan Stanley, Tesla]",Morgan Stanley
167,Maggie Fitzgerald@mkmfitzgerald,2020-01-21,cnbc,Stocks making the biggest moves midday: Morgan...,https://www.cnbc.com/2020/01/21/stocks-making-...,Check out the companies making headlines in mi...,finance,"[Akdag, CNBC, Citigroup, Costco, Halliburton, ...","[Morgan Stanley, Tesla]","[Morgan Stanley, Tesla]","Tesla, Inc."
...,...,...,...,...,...,...,...,...,...,...,...
369033,,2020-12-31,marketscreener,Pioneer Natural Resources Company : Announces ...,https://www.marketscreener.com/quote/stock/PIO...,Pioneer Natural Resources Company Announces Co...,business,"[LLC, NYSE, Offers, PXD, Parsley, Parsley Ener...","[LLC, Parsley Energy, Parsley Finance Corp., P...","[Parsley Energy, Pioneer Natural Resources Com...",Pioneer Natural Resources Company
369036,,2020-12-31,marketscreener,GLOBAL HEALTHCARE REIT : MANAGEMENT'S DISCUSSI...,https://www.marketscreener.com/quote/stock/GLO...,"? macroeconomic conditions, such as a prolonge...",business,"[ALF, CFO Zvi Rhine's, Co-Borrowers, Company, ...",[GLOBAL HEALTHCARE REIT],[GLOBAL HEALTHCARE REIT],"Global Healthcare REIT, Inc."
369038,,2020-12-31,marketscreener,NextEra Energy : FPL accepting applications fo...,https://www.marketscreener.com/quote/stock/NEX...,"On Jan. 4, Florida Power & Light Company will ...",business,"[Care to Share Program, FPL, FPSC, Florida Pow...","[FPL, Main Street Recovery Credit Program, Nex...",[NextEra Energy],"NextEra Energy, Inc."
369043,,2020-12-31,marketscreener,Datametrex AI : Announces Deploying NexaSecuri...,https://www.marketscreener.com/quote/stock/DAT...,Datametrex AI Limited (TSXV: DM) (FSE: D4G) (O...,business,"[Artificial Intelligence, Artificial Intellige...",[Datametrex AI],[Datametrex AI],Datametrex AI Limited


In [45]:
print(df_ner_matched[1188:1200].iloc[0]['filtered_names'])
print(df_ner_matched[1188:1200].iloc[0]['filtered_names_match'])

['AT&T', 'Advanced Micro Devices', 'Apple', 'Boeing', 'Dow', 'EBay', 'General Electric', 'L Brands', "McDonald's", 'Penn National', 'Penn National Gaming', 'Starbucks', 'Xilinx']
['AT&T Inc.', 'Advanced Micro Devices, Inc.', 'Apple Inc.', 'Dow Inc.', 'General Electric Company', 'Ignite International Brands, Ltd.', "McDonald's Corporation", 'Penn National Gaming, Inc.', 'Starbucks Corporation', 'The Boeing Company', 'Xilinx, Inc.', 'eBay Inc.']


In [31]:
# Add title companies to df
# The two commented-out lines copied the title matches into df_ner_matched and
# saved the result; they are disabled, so only the saved frame is loaded here.

#df_ner_matched['filtered_names_titles_match'] = df_ner_matched_titles['filtered_names_titles_match']
#df_ner_matched.to_pickle('./data/df_ner_matched_210913.pickle')  
df_ner_matched = pd.read_pickle('./data/df_ner_matched_210913.pickle')  

In [32]:
# Display the reloaded frame (the rendering is truncated to the first and last rows).
df_ner_matched

Unnamed: 0,author,date,domain,title,url,content,topic_area,org_names,filtered_names,filtered_names_match,filtered_names_titles_match
0,Thomas Hughes,2020-01-02,marketbeat,Three Industrial Giants You Should Own In 2020,https://www.marketbeat.com/originals/three-ind...,With the end of the year just around the corne...,business,"[BA, Boeing, CAT, CMI, Caterpillar, Cummins In...","[Boeing, Caterpillar, Cummins Inc, Deere & Com...","[Caterpillar Inc., Cummins Inc., Deere & Compa...",[]
1,Thomas Hughes,2020-01-03,marketbeat,Labor Stocks Are Going To Break Out In 2020,https://www.marketbeat.com/originals/labor-sto...,The labor markets were one of the most closely...,business,"[ADP, AKA, CTAS, Cintas, Cloud-Based Services ...",[Cintas],[Cintas Corporation],[]
2,Steve Anderson,2020-01-03,marketbeat,"Tesla (TSLA) Breaks Shipment Record, Beats Est...",https://www.marketbeat.com/originals/teal-brea...,"It could be forgiven, that some might think th...",business,"[Credit Suisse, FactSet, Ford, NYSE, Tesla, We...",[Tesla],"[Tesla, Inc.]",[]
3,Roberto Torres,2020-01-03,ciodive,"On the road to AI adoption, execs grapple with...",https://www.ciodive.com/news/ai-adoption-execs...,CIOs kicked off 2019 with AI as an item to wat...,tech,"[AI, CIO, Gartner, Gartner's, H&M, IBM, IBM Data]",[Gartner],"[Gartner, Inc.]",[]
4,Alden Wicker,2020-01-06,instyle,Red Carpet Sustainability After Coronavirus Sh...,https://www.instyle.com/fashion/red-carpet-cor...,When the coronavirus pandemic is over and life...,consumer,"[11:30pm PST, BAFTA, CDC, Chanel, Instagram, L...",[],[],[]
...,...,...,...,...,...,...,...,...,...,...,...
369042,Polly Harrison,2020-12-31,thefintechtimes,A Human Touch Will Be a Competitive Edge After...,https://thefintechtimes.com/53867-2/,Niels Pedersen is a Chartered Accountant and S...,finance,"[ATM, BBC Radio Scotland, C. Hoare & Co., Lloy...",[Lloyds Bank],[Lloyds Banking Group plc],[]
369043,,2020-12-31,marketscreener,Datametrex AI : Announces Deploying NexaSecuri...,https://www.marketscreener.com/quote/stock/DAT...,Datametrex AI Limited (TSXV: DM) (FSE: D4G) (O...,business,"[Artificial Intelligence and Machine Learning,...","[Company, Datametrex AI Limited]","[American International Holdings Corp., Datame...",[Datametrex AI Limited]
369044,Polly Harrison,2020-12-31,thefintechtimes,"US Payments: Smart Pension, Episode Six, PAAY ...",https://thefintechtimes.com/us-payments-smart-...,"This December, The Fintech Times is asking ind...",finance,"[EMV, Jodan Ledford, PAAY, PEP, PR & Editorial...",[],[],[]
369045,,2020-12-31,marketscreener,"WESTWATER RESOURCES, INC. : Entry into a Mater...",https://www.marketscreener.com/quote/stock/WES...,Item 1.01Entry into a Material Definitive Agre...,business,"[Energy\nCorp., Glimpses, Neutron Energy, Inc....","[Energy\nCorp., Westwater Resources, Inc.]","[American International Holdings Corp., Westwa...","[Westwater Resources, Inc.]"


# Labelled Data Testing

### Tag Labelled File

In [29]:
# Toggle: True re-runs NER tagging on the labelled CSV and refreshes the cached pickle;
# False just loads the cached pickle.
tag_labelled_data = False

if not tag_labelled_data:
    df_lab_ner = pd.read_pickle('./data/ner/df_lab_ner.pickle')

else:
    # These three columns are parsed with eval while reading (presumably they hold
    # stringified Python lists — confirm against the CSV).
    # NOTE(review): eval executes arbitrary expressions; ast.literal_eval would be the
    # safer parser if this CSV is ever not fully trusted.
    df_lab = pd.read_csv(
        './data/ner/df_lab_latest.csv',
        index_col=0,
        converters={
            column: eval
            for column in ('org_names_listed', 'org_names_listed_uo', 'listing_country')
        },
    )
    df_lab = clean_text(df_lab)
    df_lab = drop_long_articles(df_lab)
    df_lab_ner = ner_tagging(df_lab, 0)
    df_lab_ner.to_pickle('./data/ner/df_lab_ner.pickle')
    #df_lab_ner.to_csv('./data/df_lab_ner.csv')

In [30]:
# Preview the first three rows of the tagged labelled data.
df_lab_ner.head(3)

Unnamed: 0,author,date,domain,title,url,content,topic_area,org_names_listed,org_names_listed_uo,listed_count,listed_uo_count,listing_country,listing_country_count,org_names
272801,,2020-08-15,finance.yahoo,Julen Lopetegui: We must be at our best to bea...,https://finance.yahoo.com/news/julen-lopetegui...,Head coach Julen Lopetegui knows Sevilla will ...,business,[],[],0,0,[],0,"[Inter Milan, Jose Mourinho’s, Lopetegui, Manc..."
200452,,2020-07-11,finance.yahoo,Gallagher Premiership teams will not be punish...,https://uk.finance.yahoo.com/news/gallagher-pr...,Gallagher Premiership clubs will have flexibil...,business,[],[],0,0,[],0,"[Childs, Premiership Rugby, the British & Iris..."
360615,,2020-12-14,marketscreener,"Don't stockpile food, minister tells British s...",https://www.marketscreener.com/quote/currency/...,"Britain and the EU agreed on Sunday to ""go the...",business,[Tesco PLC],[Tesco],1,1,[United Kingdom],1,"[BBC, BRC, EU, Sharma, Sky News, Tesco, The Br..."


### Clean Labelled Names

In [31]:
# Stopword workbook and the sheet to read from it
filepath_stopwords, sheet_name = './data/ner/company_stopwords.xlsx', 'all'

# Toggle: True re-cleans the tagged names and refreshes the cached pickle;
# False just loads the cached pickle.
process_labelled_data = False

if not process_labelled_data:
    df_lab_ner_clean = pd.read_pickle('./data/ner/df_lab_ner_clean.pickle')
else:
    org_stopwords = load_stopwords(filepath_stopwords, sheet_name)
    # Always start from the tagged frame on disk rather than whatever is in the kernel
    df_lab_ner = pd.read_pickle('./data/ner/df_lab_ner.pickle')
    df_lab_ner_clean = clean_tagged_names(df_lab_ner, org_stopwords, 'org_names')
    # Substring-name removal is currently disabled:
    #df_lab_ner_clean = remove_substring_names(df_lab_ner_clean, 'org_names')
    df_lab_ner_clean.to_pickle('./data/ner/df_lab_ner_clean.pickle')

In [32]:
# Preview the first three rows after cleaning the tagged organisation names.
df_lab_ner_clean.head(3)

Unnamed: 0,author,date,domain,title,url,content,topic_area,org_names_listed,org_names_listed_uo,listed_count,listed_uo_count,listing_country,listing_country_count,org_names
272801,,2020-08-15,finance.yahoo,Julen Lopetegui: We must be at our best to bea...,https://finance.yahoo.com/news/julen-lopetegui...,Head coach Julen Lopetegui knows Sevilla will ...,business,[],[],0,0,[],0,"[inter milan, jose mourinho’s, lopetegui, roma..."
200452,,2020-07-11,finance.yahoo,Gallagher Premiership teams will not be punish...,https://uk.finance.yahoo.com/news/gallagher-pr...,Gallagher Premiership clubs will have flexibil...,business,[],[],0,0,[],0,"[childs, premiership rugby, the british & iris..."
360615,,2020-12-14,marketscreener,"Don't stockpile food, minister tells British s...",https://www.marketscreener.com/quote/currency/...,"Britain and the EU agreed on Sunday to ""go the...",business,[Tesco PLC],[Tesco],1,1,[United Kingdom],1,"[sharma, tesco]"


### Drop countries that are not of interest from the labelled file

The labelled data file contains listed companies from many countries; drop the companies listed in countries that are not being considered.

In [33]:
# Drop countries not interested in from labelled file

def drop_countries_labelled_file(df, countries=None):
    """Keep only the labelled companies whose listing country is being considered.

    For every row, the entries of ``listing_country`` are checked in order. The
    entries at the same position in ``org_names_listed`` and ``org_names_listed_uo``
    are kept only when that country is in ``countries``. ``listed_count`` is then
    set to the number of companies kept, and the ``listed_uo_count`` and
    ``listing_country_count`` columns are dropped if they are present (so the
    function can safely be re-applied to its own output).

    Parameters
    ----------
    df : pandas.DataFrame
        Labelled data with the list-valued columns ``listing_country``,
        ``org_names_listed`` and ``org_names_listed_uo``. It is not modified.
    countries : container of str, optional
        Countries to keep (anything supporting ``in``). When omitted, the
        notebook-level ``countries_included`` is used.

    Returns
    -------
    pandas.DataFrame
        A filtered copy of ``df``.

    Raises
    ------
    IndexError
        If a row's name list is shorter than the position of a kept country,
        i.e. the three lists of that row are not aligned.
    """
    if countries is None:
        # Fall back to the notebook-level configuration defined elsewhere in the notebook.
        countries = countries_included

    filtered = df.copy()

    kept_names, kept_names_uo, kept_countries = [], [], []
    rows = zip(
        filtered['listing_country'],
        filtered['org_names_listed'],
        filtered['org_names_listed_uo'],
    )
    for row_countries, row_names, row_names_uo in rows:
        # Positions of the countries to keep; the name lists are indexed by the
        # same positions because the three lists are parallel.
        keep = [pos for pos, country in enumerate(row_countries) if country in countries]
        kept_names.append([row_names[pos] for pos in keep])
        kept_names_uo.append([row_names_uo[pos] for pos in keep])
        kept_countries.append([row_countries[pos] for pos in keep])

    # dtype=object keeps every cell a plain Python list (and is well defined for
    # an empty frame); reusing the frame's own index avoids any re-alignment.
    row_index = filtered.index
    filtered['org_names_listed'] = pd.Series(kept_names, index=row_index, dtype=object)
    filtered['org_names_listed_uo'] = pd.Series(kept_names_uo, index=row_index, dtype=object)
    filtered['listing_country'] = pd.Series(kept_countries, index=row_index, dtype=object)
    filtered['listed_count'] = pd.Series(
        [len(names) for names in kept_names], index=row_index, dtype='int64'
    )

    # These counts would be stale after filtering; ignore them if already removed.
    return filtered.drop(columns=['listed_uo_count', 'listing_country_count'], errors='ignore')

In [34]:
# Process country drop
# Toggle: when True, reload the cleaned labelled data from disk and filter it with
# drop_countries_labelled_file.
# NOTE(review): when False this cell does nothing, so `df_lab_ner_clean` stays exactly
# as loaded earlier (no country filtering applied). The filtered frame is also never
# written back to disk here.
process_country_drop = False

if process_country_drop:
    df_lab_ner_clean = pd.read_pickle('./data/ner/df_lab_ner_clean.pickle')
    df_lab_ner_clean = drop_countries_labelled_file(df_lab_ner_clean)

In [35]:
# Spot-check the cleaned organisation names of the last labelled row.
df_lab_ner_clean.iloc[-1]['org_names']

['astrazeneca azn',
 'bntx',
 'fauci',
 'johns hopkins university',
 'johnson & johnson jnj',
 'merck & co. merk',
 'the national institute of allergy and infectious diseases']

### Check Matches and Calculate Performance Functions

In [36]:
def check_matches(df):
    """Count, row by row, how the matched company names compare with the labels.

    Both ``org_names_listed`` (labelled names) and ``filtered_names_match``
    (names produced by the matching step) are treated as sets, so duplicates
    within a row are counted once. Three integer columns are written to ``df``:

    * ``count_matches`` - names present in both lists (true positives)
    * ``count_lab_not_found`` - labelled names missing from the matches (false negatives)
    * ``count_found_not_lab`` - matched names that were not labelled (false positives)

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with list-valued columns ``org_names_listed`` and
        ``filtered_names_match``. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the three count columns added or overwritten.

    Raises
    ------
    KeyError
        If either input column is missing (raised before ``df`` is changed).
    """
    matched, lab_not_found, found_not_lab = [], [], []

    for labelled, found in zip(df['org_names_listed'], df['filtered_names_match']):
        # Build each set once per row and derive all three counts from them.
        labelled_set, found_set = set(labelled), set(found)
        matched.append(len(labelled_set & found_set))        # True Positives
        lab_not_found.append(len(labelled_set - found_set))  # False Negatives
        found_not_lab.append(len(found_set - labelled_set))  # False Positives

    # Write proper integer columns (no '' placeholders, no object dtype); reusing
    # the frame's own index avoids any re-alignment.
    row_index = df.index
    df['count_matches'] = pd.Series(matched, index=row_index, dtype='int64')
    df['count_lab_not_found'] = pd.Series(lab_not_found, index=row_index, dtype='int64')
    df['count_found_not_lab'] = pd.Series(found_not_lab, index=row_index, dtype='int64')

    return df

In [37]:
# Calculate performance

def calculate_performance(df):
    """Aggregate the per-row match counts into precision, recall and F1.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the numeric columns ``count_matches`` (true positives),
        ``count_lab_not_found`` (false negatives) and ``count_found_not_lab``
        (false positives).

    Returns
    -------
    tuple
        ``(true_pos, false_neg, false_pos, precision, recall, f1_score)``.
        A ratio whose denominator is zero (nothing predicted, nothing labelled,
        or no true positives at all) is reported as ``0.0`` instead of raising
        ``ZeroDivisionError`` or producing ``nan``.
    """
    true_pos = sum(df['count_matches'])
    false_neg = sum(df['count_lab_not_found'])
    false_pos = sum(df['count_found_not_lab'])

    predicted = true_pos + false_pos  # everything the matcher returned
    labelled = true_pos + false_neg   # everything that was labelled

    precision = true_pos / predicted if predicted else 0.0  # low number means lots of false positives
    recall = true_pos / labelled if labelled else 0.0  # low number means lots of false negatives

    pr_sum = precision + recall
    f1_score = 2 * (precision * recall) / pr_sum if pr_sum else 0.0

    return true_pos, false_neg, false_pos, precision, recall, f1_score

In [43]:
# Toggle: True re-runs company matching on the cleaned labelled data, scores it against
# the labels, prints the metrics and saves the matched frame to disk.
# NOTE(review): when False this cell defines no `df_lab_ner_matched`; later cells then
# depend on it being loaded from a pickle instead.
labelled_data = False

filepath_stopwords = './data/ner/company_stopwords.xlsx'
sheet_name = 'all'

if labelled_data:
    # NOTE(review): org_stopwords is loaded here but not passed to any call in this
    # cell — confirm whether match_companies reads it as a global, otherwise drop it.
    org_stopwords = load_stopwords(filepath_stopwords, sheet_name)
    df_lab_ner_matched = match_companies(df_lab_ner_clean, companies_list_common_words, companies_list)
    # Adds the per-row TP / FN / FP count columns used by calculate_performance
    df_lab_ner_matched = check_matches(df_lab_ner_matched)
    true_pos, false_neg, false_pos, precision, recall, f1_score = calculate_performance(df_lab_ner_matched)
    print(true_pos, false_neg, false_pos)
    print(f'precision = {round(precision,2)}')
    print(f'recall = {round(recall,2)}')
    print(f'f1 score = {round(f1_score,2)}')
    df_lab_ner_matched.to_pickle('./data/ner/df_lab_ner_matched.pickle')
    

In [47]:
# Load a saved matched-and-scored labelled frame.
# NOTE(review): this is a dated snapshot (`_210802`), not the
# './data/ner/df_lab_ner_matched.pickle' file that the matching cell writes, so a fresh
# matching run is overwritten here by older results — confirm which file is intended.
df_lab_ner_matched = pd.read_pickle('./data/ner/df_lab_ner_matched_210802.pickle')

In [48]:
# Preview the first row of the matched labelled data, including the count columns.
df_lab_ner_matched.head(1)

Unnamed: 0,author,date,domain,title,url,content,topic_area,org_names_listed,org_names_listed_uo,listed_count,listing_country,org_names,filtered_names,filtered_names_match,count_matches,count_lab_not_found,count_found_not_lab
272801,,2020-08-15,finance.yahoo,Julen Lopetegui: We must be at our best to bea...,https://finance.yahoo.com/news/julen-lopetegui...,Head coach Julen Lopetegui knows Sevilla will ...,business,[],[],0,[],"[inter milan, jose mourinho’s, lopetegui, roma...",[],[],0,0,0


In [49]:
# Score the loaded labelled frame and report the results: first the raw counts
# (true positives, false negatives, false positives), then precision, recall and F1,
# each rounded to two decimals.
true_pos, false_neg, false_pos, precision, recall, f1_score = calculate_performance(df_lab_ner_matched)
print(true_pos, false_neg, false_pos)
print(f'precision = {round(precision,2)}')
print(f'recall = {round(recall,2)}')
print(f'f1 score = {round(f1_score,2)}')

520 308 261
precision = 0.67
recall = 0.63
f1 score = 0.65
