In [None]:
import pandas as pd

In [3]:
#load a NER pipeline
import spacy
nlp = spacy.load('en_core_web_trf')

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
#function to extract company names
def find_org_entities(text:str):
    #text = remove_punctuation(text)
    doc = nlp(text)
    org_list = []
    for word in doc.ents:
        if(word.label_=="ORG"):
            org_list.append(word.text)
    return set(org_list)

In [22]:
#simple function to perform a look-up based entity linking
def entity_linking_simple(org_list):
    annotations_pred = {}
    companies_df = pd.read_json("./company_collection.json")
    companies_df['name_clean'] = companies_df['name'].str.lower()
    companies_df['name_clean'] = companies_df['name_clean'].str.strip()
    for org in org_list:
        org_clean = org.lower().strip()
        if org_clean in companies_df['name_clean'].to_list():
            annotations_pred[org] = companies_df[companies_df.name_clean==org_clean]["url"].to_list()[0]
        else:
            annotations_pred[org] = ""
    return annotations_pred 

In [23]:
df_new = pd.read_json("./news_articles-new.jsonl",lines=True)

In [24]:
annotations_pred_list = []
for _,row in df_new.iterrows():
    org_list = find_org_entities(row['text'])
    annotations_pred =  entity_linking_simple(org_list)
    annotations_pred_list.append(annotations_pred)
df_new['annotations'] = annotations_pred_list

In [25]:
df_new.to_json("news_articles-linked.jsonl",lines=True,orient='records')

In [30]:
df_new.iloc[3]['annotations']

{'FOSSA': 'fossa.io',
 'Index Ventures': '',
 'RethinkDB': 'rethinkdb.com',
 'SnapRoute': '',
 'Microsoft': 'azure.microsoft.com/en-gb/marketplace',
 'Salesforce': '',
 'Bloq': 'bloq.com',
 'Skry': 'skry.me',
 'the Digital Currency Group': '',
 'Confluent': 'beconfluent.com',
 'Crunchbase': 'crunchbase.com',
 'Norwest Venture Partners': '',
 'Mozilla': 'mozilla.com',
 'Blockstack': 'blockstack.org',
 'Magento': 'magentocommerce.com',
 'AT&T': 'about.att.com',
 'Hillhouse Capital Group': '',
 'Keymetrics': 'keymetrics.io',
 'Bain Capital Ventures': '',
 'Engine Yard': 'engineyard.com',
 'AngelList': 'angel.co',
 'the Cloud Native Computing Foundation': '',
 'Lightspeed Venture Partners': '',
 'Union Square Ventures': '',
 'Microsoft Ventures': 'blogs.technet.com',
 'Sequoia': 'sequoia.bio',
 'Deis': 'engineyard.com/deis'}

In [106]:
#Evaluate over the golden set

In [31]:
df_gold = pd.read_json("./news_articles-gold.jsonl",lines=True)

In [32]:
annotations_pred_list = []
for _,row in df_gold.iterrows():
    org_list = find_org_entities(row['text'])
    annotations_pred = entity_linking_simple(org_list)
    annotations_pred_list.append(annotations_pred)
df_gold['annotations_pred'] = annotations_pred_list

In [34]:
df_gold["annotations"][0]

{'WeWork': 'wework.com',
 'Benchmark': 'benchmark.com',
 'Zynga': 'zynga.com',
 'Groupon': 'groupon.com',
 'SoftBank Group': 'group.softbank',
 'WeWorks': 'wework.com',
 'SoftBank': 'group.softbank',
 'Blue Apron': 'blueapron.com'}

In [35]:
df_gold["annotations_pred"][0]

{'SoftBank': 'softbank.jp',
 'Twitter': 'twitter.com',
 'Blue Apron': 'blueapron.com',
 'The Telegraph': '',
 'WeWorks': '',
 'The Wall Street Journal': '',
 'Zynga': 'zynga.com',
 'WeWork': 'wework.com',
 'Telegraph': '',
 'AFP': '',
 'Groupon': 'groupon.com',
 'Benchmark': 'benchmark.com',
 'SoftBank Group': 'group.softbank',
 'Crunchbase': 'crunchbase.com',
 'the Crunchbase Daily': ''}

In [39]:
import numpy as np

In [40]:
def evaluate(df_gold):
    ner_score_list = []
    nel_score_list = []
    for _,row in df_gold.iterrows():
        common_keys = set(row['annotations']).intersection(row['annotations_pred'])
        ner_recall = len(common_keys)/len(df_gold['annotations'][0])
        ner_score_list.append(ner_recall)
        original_entity_list = []
        pred_entity_list = []
        tp = 0
        for common_key in common_keys:
            original_entity = row['annotations'][common_key]
            pred_entity = row['annotations_pred'][common_key]
            if (original_entity==pred_entity):
                tp += 1
            original_entity_list.append(original_entity)
            pred_entity_list.append(pred_entity)
        nel_presion = tp/len(common_keys)
        nel_score_list.append(nel_presion)       
    return np.mean(ner_score_list), np.mean(nel_score_list)

In [41]:
ner_score, nel_score= evaluate(df_gold)

In [42]:
ner_score, nel_score

(0.540625, 0.5816666666666667)