In [None]:
!pip install spacy==3.0.6
!python -m spacy download en_core_web_lg

#### Import necessary libraries

In [1]:
import pandas as pd
import spacy
from spacy.kb import InMemoryLookupKB

#### Read the 3 json files into dataframes (just for visualizing samples)

In [2]:
news_articles_new_df = pd.read_json("news_articles-new.jsonl", lines=True)

In [3]:
news_articles_gold_df = pd.read_json("news_articles-gold.jsonl", lines=True)

In [4]:
companies_df = pd.read_json("company_collection.json")

#### Assign unique ID to each company

In [5]:
companies_df['QID'] = companies_df.index
companies_df['QID'] = companies_df['QID'].apply(lambda elem: str(elem))

#### View the companies info

In [6]:
companies_df.head(2)

Unnamed: 0,name,founded,description,url,headquarters,industry_label,QID
0,100 Thieves,2016,"100 Thieves, LLC, an esports organization, com...",100thieves.com,United States,Retail & commerce | Sports & gaming,0
1,ten eleven ventures,2014,,1011vc.com,,Banking & finance,1


#### View the "news_articles-gold" data

In [7]:
news_articles_gold_df.head(2)

Unnamed: 0,title,text,annotations,source
0,"Report: Adam Neumann, Benchmark Unloaded $676....","Report: Adam Neumann, Benchmark Unloaded $676....","{'WeWork': 'wework.com', 'Benchmark': 'benchma...",https://news.crunchbase.com/news/report-adam-n...
1,Report: Palantir Could Go Public In The Next Year,Report: Palantir Could Go Public In The Next Y...,"{'Palantir Technologies': 'palantir.com', 'Pal...",https://news.crunchbase.com/news/report-palant...


#### View the "news_articles-new" data

In [8]:
news_articles_new_df.head(2)

Unnamed: 0,title,text,source
0,Pinterest May Go Public In Q2 After Growing Ar...,Pinterest May Go Public In Q2 After Growing Ar...,https://news.crunchbase.com/news/pinterest-may...
1,Arlo Opens At $18.50 After Pricing At $16,Arlo Opens At $18.50 After Pricing At $16. Mor...,https://news.crunchbase.com/news/arlo-opens-at...


#### Number of companies in total

In [9]:
len(companies_df['name'])

3330

#### Number of unique companies based on name

In [10]:
len(companies_df['name'].unique())

3293

#### Number of companies with no description

In [11]:
len([desc for desc in companies_df['description'] if desc.strip() == ''])

165

#### Number of articles with annotations

In [12]:
len(news_articles_gold_df)

40

#### Number of articles without annotations

In [13]:
len(news_articles_new_df)

60

#### Load the spacy model which will be used as base model for entity linker

In [14]:
nlp = spacy.load("en_core_web_lg")

#### Create dictionary of names, descriptions and urls for the entities (companies)

In [15]:
qid_to_names = {}
qid_to_descriptions = {}
qid_to_urls = {}

for index, row in companies_df.iterrows():
    qid_to_names[row['QID']] = row['name']
    qid_to_descriptions[row['QID']] = row['description']
    qid_to_urls[row['QID']] = row['url']

#### Create the knowledge base and save it to disk

In [16]:
kb = InMemoryLookupKB(vocab=nlp.vocab, entity_vector_length=300)

In [17]:
for qid, desc in qid_to_descriptions.items():
    desc_doc = nlp(desc)
    desc_encoding = desc_doc.vector
    kb.add_entity(entity=qid, entity_vector=desc_encoding, freq=342)

In [18]:
for qid, name in qid_to_names.items():
    kb.add_alias(alias=name, entities=[str(qid)], probabilities=[1])

  kb.add_alias(alias=name, entities=[str(qid)], probabilities=[1])
  kb.add_alias(alias=name, entities=[str(qid)], probabilities=[1])
  kb.add_alias(alias=name, entities=[str(qid)], probabilities=[1])
  kb.add_alias(alias=name, entities=[str(qid)], probabilities=[1])
  kb.add_alias(alias=name, entities=[str(qid)], probabilities=[1])
  kb.add_alias(alias=name, entities=[str(qid)], probabilities=[1])
  kb.add_alias(alias=name, entities=[str(qid)], probabilities=[1])
  kb.add_alias(alias=name, entities=[str(qid)], probabilities=[1])
  kb.add_alias(alias=name, entities=[str(qid)], probabilities=[1])
  kb.add_alias(alias=name, entities=[str(qid)], probabilities=[1])
  kb.add_alias(alias=name, entities=[str(qid)], probabilities=[1])
  kb.add_alias(alias=name, entities=[str(qid)], probabilities=[1])
  kb.add_alias(alias=name, entities=[str(qid)], probabilities=[1])
  kb.add_alias(alias=name, entities=[str(qid)], probabilities=[1])
  kb.add_alias(alias=name, entities=[str(qid)], probabilities=

In [19]:
kb.to_disk("my_kb")

#### Prepare the dataset in the appropriate format for training

In [20]:
def get_entity_info_from_text(text:str, entity:str):
    """
    Given a text and a named entity, this function returns the entity info (start index, end index, ent label)
    """
    doc = nlp(text)
    entity_info = []
    for ent in doc.ents:
        if ent.text == entity:
            entity_info.append((ent.start_char, ent.end_char, 'ORG'))
    return entity_info

In [21]:
def get_company_id_from_url(url:str, company_name):
    qids = list(qid_to_urls.keys())
    urls = list(qid_to_urls.values())
    
    try:
        # url is found in qid_to_urls dict, hence return the qid
        index = urls.index(url) 
        return qids[index]
    except ValueError as e:
        """
            url is not found in qid_to_urls, hence add the url to the qid_to_urls 
            and also update qid_to_names and qid_to_descriptions dictionaries. Return the new qid      
        """
        print(url + " not found in existing qid_to_urls dictionary")
        new_id = str(len(qid_to_urls)+1)
        qid_to_urls[new_id] = url
        qid_to_names[new_id] = company_name
        qid_to_descriptions = ""
        return new_id

In [22]:
def prepare_dataset():
    dataset = []
    
    for index, row in news_articles_gold_df.iterrows():
        annotations = row['annotations']
        text = row['text']
        links = {}
        entities = []
        
        for company_name, url in annotations.items():
            entity_id = get_company_id_from_url(url, company_name)
            
            entity_info = get_entity_info_from_text(text, company_name)
            for (start_index, end_index, ent_label) in entity_info:
                links[(start_index, end_index)] = {entity_id: 1.0}
                entities.append((start_index, end_index, ent_label))
                
        data = (
            text,
            {
                "links": links,
                "entities": entities
            },
            
        )

        dataset.append(data)
            
    return dataset

In [23]:
dataset = prepare_dataset()

sparxsif.com/en not found in existing qid_to_urls dictionary
incubatefund.com/en/aboutus not found in existing qid_to_urls dictionary
tte-net.com/english/index.html not found in existing qid_to_urls dictionary
companionfund.com not found in existing qid_to_urls dictionary
leftlanecap.com not found in existing qid_to_urls dictionary
darktrace.com/en not found in existing qid_to_urls dictionary
blackberry.com/us/en/cylance not found in existing qid_to_urls dictionary
breakthroughenergy.org not found in existing qid_to_urls dictionary
granthamtrust.org not found in existing qid_to_urls dictionary
blingcap.com not found in existing qid_to_urls dictionary
fireboltventures.com not found in existing qid_to_urls dictionary
capital-en.tcl.com not found in existing qid_to_urls dictionary
flourishventures.com not found in existing qid_to_urls dictionary
myhippo.com not found in existing qid_to_urls dictionary
eu.lululemon.com not found in existing qid_to_urls dictionary
uber.com/de/en not found i

In [24]:
dataset[34]

('Accolade Is Latest To Join Health Service IPO Bandwagon. To launch a successful IPO in the current market environment, it seems to help to be a fast-growing health care services provider. Shares of One Medical , a provider of primary care clinics and telemedicine, closed up nearly 60 percent in first-day trading a month ago. Since then, it’s largely held on to those gains. Subscribe to the Crunchbase Daily Progyny , a benefits management focusing on fertility, meanwhile, has seen its shares roughly double from its initial offer price back in October. While the broader markets have swooned, Progyny has held strong. Now, another well-funded health service company is betting investor enthusiasm for the space will trump market skittishness. Accolade , a service provider that serves as a kind of go-between for consumers, employers and health insurance companies, is seeking to raise up to $100 million in an IPO, according to a prospectus filed late Friday. Like most venture-backed companie

#### Observations

- some urls in gold data not found in companies list

- different companies with same url found in gold data

- same company name, but different URLs in companies file, e.g. Endeavor and a few others

- same company name, but different URLs in annotated file, e.g. "uber.com" and "uber.com/de/en" for company Uber
    - this was a challenge because, as part of my algorithm, to create the training data, I obtain the entity ID by matching the URL in companies list with the URL given in annotated data
     
    I am using URL and not name, because I found all URLs in the companies list to be unique, but found duplicates in company name

- "description" field has junk info e.g. "cloud-data_crunchbase_2011 worthy Appin tweetprocesor stanford group.pdf."

- for 165 companies, "description" field is empty. We need to collect those descriptions

- many cases where "name" field has URL, e.g. "name": "Andreessen Horowitz a16z.com"

- "name" field has special chars, e.g.   "name": "Alb\u00e9a Group"

#### Improvements that I would have done if I had more time 

- clean up the companies file 
    - add descriptions for companies wherever description was empty (if allowed, we can use services like CoreSignal API)
    - remove urls from company names 
            use this regex to detect such cases in the name field
            .+ (.*\.(com|io|vc))
    - clean descriptions wherever junk info was there
- gather aliases/synonyms for company names (if allowed, we can use services like Seravia API)
    - add to knowledge base
    - this could improve model performance
- gather more annotated data for training 
- try using some transformer model as base model within spacy instead of en_core_web_lg
- perform more detailed hyperparameter tuning when training the EL model
- stratify the train and test dataset based on entity_id, so that the model can learn uniformly well across entity IDs

#### Shuffle the train and test dataset

In [25]:
import random

train_dataset = dataset[0:30]
test_dataset = dataset[30:40]

random.shuffle(train_dataset)
random.shuffle(test_dataset)

#### Train the EL model

In [26]:
from spacy.training import Example

TRAIN_EXAMPLES = []

if "sentencizer" not in nlp.pipe_names:
    nlp.add_pipe("sentencizer")
    
sentencizer = nlp.get_pipe("sentencizer")
count=0
for text, annotation in train_dataset:
    example = Example.from_dict(nlp.make_doc(text), annotation)
    example.reference = sentencizer(example.reference)
    TRAIN_EXAMPLES.append(example)


In [27]:
from spacy.ml.models import load_kb

entity_linker = nlp.add_pipe("entity_linker", config={"incl_prior": False}, last=True)
entity_linker.initialize(get_examples=lambda: TRAIN_EXAMPLES, kb_loader=load_kb("my_kb"))

In [28]:
from spacy.util import minibatch, compounding

with nlp.select_pipes(enable=["entity_linker"]):   # train only the entity_linker
    optimizer = nlp.resume_training()
    for itn in range(500):   # 500 iterations 
        random.shuffle(TRAIN_EXAMPLES)
        batches = minibatch(TRAIN_EXAMPLES, size=compounding(4.0, 32.0, 1.001))  # increasing batch sizes
        losses = {}
        for batch in batches:
            nlp.update(
                batch,   
                drop=0.2,      # to prevent overfitting
                losses=losses,
                sgd=optimizer,
            )
        if itn % 50 == 0:
            print(itn, "Losses", losses)   # print the training loss
print(itn, "Losses", losses)

0 Losses {'entity_linker': 5.923153295781878}
50 Losses {'entity_linker': 1.2630623374866623}
100 Losses {'entity_linker': 1.1509348904093106}
150 Losses {'entity_linker': 1.1877119670197427}
200 Losses {'entity_linker': 0.8379911234824664}
250 Losses {'entity_linker': 1.598684724757814}
300 Losses {'entity_linker': 1.6752873020491215}
350 Losses {'entity_linker': 0.8064408239391114}
400 Losses {'entity_linker': 0.8123272939026356}
450 Losses {'entity_linker': 1.3152976576285353}
499 Losses {'entity_linker': 1.3444068133831024}


#### Test the model on the test dataset, which was unseen during training 

In [29]:
texts = [text for text, true_annot in test_dataset]
true_annotations = [true_annot['links'] for text, true_annot in test_dataset]
docs = nlp.pipe(texts)

# gathering the model predictions (entity IDs) in the right format for evaluation 
overall_model_predictions = []
for doc in docs:
    model_predictions = set()
    for ent in doc.ents:
        if ent.kb_id_ != 'NIL':
            model_predictions.add(ent.kb_id_)
    overall_model_predictions.append(model_predictions)
    
# gathering the gold entities in the right format for evaluation 
overall_gold_entity_ids = []
for true_annot in true_annotations:
    gold_entity_ids = set()
    for key, value in true_annot.items():
        for entity_id, _ in value.items():
            gold_entity_ids.add(entity_id)
    overall_gold_entity_ids.append(gold_entity_ids)

In [30]:
overall_model_predictions

[{'1761', '2448', '2488'},
 {'1023'},
 {'1021', '1224', '1660', '2042'},
 {'1068', '2771', '3036'},
 {'2077', '2336', '766'},
 {'1726', '2312', '2458', '783'},
 {'2362', '49', '561', '685'},
 {'1191', '1217', '2722', '380'},
 {'186', '2892', '907'},
 {'2636', '2873', '3218', '566'}]

In [31]:
overall_gold_entity_ids

[{'1761', '2448', '2488', '3358'},
 {'1023', '27', '3355'},
 {'1021', '1224', '1660', '2042', '27', '3357'},
 {'1068', '2080', '2771', '3036'},
 {'1504', '2077', '2336', '765'},
 {'1726', '2312', '2458', '783'},
 {'2362', '49', '561', '685'},
 {'1191', '1217', '3354', '380'},
 {'1504', '186', '3356', '907'},
 {'1132', '2636', '3218', '3353', '566'}]

#### Evaluate the model which we have trained 

In [32]:
avg_precision = 0.0
avg_recall = 0.0

for model_prediction, gold_entities in zip(overall_model_predictions, overall_gold_entity_ids):
    intersection = model_prediction.intersection(gold_entities)
    precision = float(len(intersection)/len(model_prediction)) * 100
    recall = float(len(intersection)/len(gold_entities)) * 100
    avg_precision = avg_precision + precision
    avg_recall = avg_recall + recall

avg_recall = avg_recall/len(overall_model_predictions)
avg_precision = avg_precision/len(overall_model_predictions)

print("Average precision (on unseen test data): " + str(round(avg_precision,2)))
print("Average recall (on unseen test data): " + str(round(avg_recall,2)))

Average precision (on unseen test data): 88.33
Average recall (on unseen test data): 68.5


#### Run predictions on "news_articles-new" and save to file

In [46]:
texts = news_articles_new_df['text'].tolist()
docs = nlp.pipe(texts)
annotations = []

for doc in docs:
    annotation = {}
    for ent in doc.ents:
        if ent.kb_id_ != 'NIL': # entities which have been linked to KB
            url = qid_to_urls[ent.kb_id_]
            annotation[ent.text] = url
        else: 
            if ent.label_ == 'ORG': # entities which have been identified as ORG by the model, but not found in KB
                annotation[ent.text] = ''
    annotations.append(annotation)

In [47]:
news_articles_new_df['annotation'] = annotations

In [48]:
news_articles_new_df.head()

Unnamed: 0,title,text,source,annotation
0,Pinterest May Go Public In Q2 After Growing Ar...,Pinterest May Go Public In Q2 After Growing Ar...,https://news.crunchbase.com/news/pinterest-may...,"{'Morning Markets': '', 'the Wall Street Journ..."
1,Arlo Opens At $18.50 After Pricing At $16,Arlo Opens At $18.50 After Pricing At $16. Mor...,https://news.crunchbase.com/news/arlo-opens-at...,"{'IPO': '', 'Sonos': 'sonos.com', 'Arlo Techno..."
2,"SoftBank, Kakao Give Radish $63M Series A Boos...","SoftBank, Kakao Give Radish $63M Series A Boos...",https://news.crunchbase.com/news/softbank-kaka...,"{'SoftBank': 'softbank.jp', 'Kakao Give': '', ..."
3,Open Source Software Is Big Business With Big ...,Open Source Software Is Big Business With Big ...,https://news.crunchbase.com/news/open-source-s...,"{'OSS': '', 'Microsoft': 'azure.microsoft.com/..."
4,"Deflated, Uber May Swap Anniversary Balloons F...","Deflated, Uber May Swap Anniversary Balloons F...",https://news.crunchbase.com/news/deflated-uber...,"{'Deflated': '', 'Uber': 'uber.com', 'the Crun..."


In [49]:
news_articles_new_df.to_json("news_articles-linked.jsonl", orient='records')