In [1]:
import csv
import spacy
import en_core_web_sm
from bs4 import BeautifulSoup
import requests
import re

In [2]:
# Collect every distinct URL referenced by the training data.
# Rows are read positionally: columns 0 and 1 hold the two company names
# (lower-cased here) and column 3 holds one URL, or several joined with ';'.
urls = set()
train_compA_compB_url = []
# newline='' hands newline handling to the csv module, as its documentation
# requires, so quoted fields that contain line breaks are parsed correctly.
with open("./train_data_arboretica.csv", 'r', encoding='UTF-8', newline='') as file:
    csv_reader = csv.reader(file)
    next(csv_reader, None)  # skip the header (no error if the file is empty)
    for row in csv_reader:
        row_url = row[3]
        train_compA_compB_url.append([row[0].lower(), row[1].lower(), row_url])
        # str.split returns a one-element list when there is no ';', so this
        # single call covers both single-URL and combined-URL rows.
        urls.update(row_url.split(';'))
# Sorted so that the URLs are processed in the same order on every run;
# plain set iteration order for strings can differ between kernel sessions.
urls = sorted(urls)

In [3]:
# First line: number of training rows; second line: number of distinct URLs.
print(len(train_compA_compB_url), len(urls), sep='\n')

330
277


In [4]:
def url_to_string(url, timeout=10):
    """Download a web page and return its text content as a single string.

    The HTML is parsed with BeautifulSoup's ``html5lib`` parser, every
    ``<script>``, ``<style>`` and ``<aside>`` element is removed, and each run
    of newlines/tabs in the remaining text is replaced by one space.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single unresponsive host can stall the notebook forever.

    Returns:
        The extracted text, or an empty string when the request could not be
        completed (connection error, timeout, malformed URL, ...).
    """
    try:
        res = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # One unreachable site must not abort the processing of all other URLs.
        print(f'Request failed for {url}: {exc}')
        return ""
    # The HTTP status code is intentionally not checked: whatever body the
    # server returned is parsed, exactly as before.
    soup = BeautifulSoup(res.text, 'html5lib')
    for tag in soup(["script", "style", 'aside']):
        tag.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))

In [5]:
# Load the spaCy pipeline with the listed components disabled; only the
# entity annotations (`.ents`) of each processed page are used below.
nlp = spacy.load(
    'en_core_web_sm',
    disable=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"],
)

# Map each URL to the distinct, lower-cased ORG entity texts found on its page.
url_orgs = {}
for url in urls:
    page_doc = nlp(url_to_string(url))
    org_names = {
        ent.text.lower()
        for ent in page_doc.ents
        # Entity texts of 100 characters or more are dropped.
        if ent.label_ == 'ORG' and len(ent.text) < 100
    }
    url_orgs[url] = list(org_names)

In [6]:
failure = 0
failure_data = []  # training rows where company A or company B was not recognized
# For every training row, check that both company names appear among the
# entities recognized in the article(s) the row points to.
for comp_a, comp_b, row_url in train_compA_compB_url:
    # A row may reference several ';'-separated URLs; pool their entities.
    # (Splitting a URL without ';' simply yields that one URL.)
    recognized = []
    for single_url in row_url.split(';'):
        recognized.extend(url_orgs[single_url])

    # Some company names are shorter in the training data than in the article,
    # e.g. 'fortino capital' (training data) vs 'fortino capital partners'
    # (article), so a substring test on the joined entity string is used.
    companies = ",".join(recognized)
    if not (comp_a in companies and comp_b in companies):
        failure_data.append([comp_a, comp_b, row_url, companies])
        failure += 1

In [7]:
# Summarize how many training rows had both companies recognized.
total_rows = len(train_compA_compB_url)
print(f'Number of unrecognized training data: {failure}')
print(f'Number of training data: {total_rows}')
print(f'True Positive Rate: {1 - failure/total_rows}')

Number of unrecognized training data: 127
Number of training data: 330
True Positive Rate: 0.6151515151515152


In [8]:
# Some rows could not be recognized because the HTML request was blocked.
# The count below is the number of URLs with an empty entity list, which is
# used as a proxy for such failed requests.
empty_url_count = sum(1 for found_orgs in url_orgs.values() if not found_orgs)
print(f'Number of failed requests to get html: {empty_url_count}')

Number of failed requests to get html: 11


In [9]:
# examples of entities that were not recognized
for i in range(5):
    print('Company A: ' + failure_data[i][0])
    print('Company B: ' + failure_data[i][1])
    print('Url: ' + failure_data[i][2])
    print('Recognized entities: ' + failure_data[i][3] + '\n')

Company A: fortino capital
Company B: newion
Url: https://www.eu-startups.com/2021/07/luxembourg-based-salonkee-raises-e6-2-million-for-the-european-expansion-of-it-beauty-salon-reservations-software/
Recognized entities: ecoligo,fortino capital,tattoodo,wizata nabs,eu-startups     sign,holidu,outthink,expon capital,co-founder,mission / team advertising our newsletter,gravitee.io,tripadmit,flappin’,ai june 2,thomas ohrthomas ohr,api,edition job board job board post,denmark-startups   copenhagen,salonkee

Company A: fortino capital
Company B: charles souillard
Url: https://finance.yahoo.com/news/bonitasoft-exceeds-2021-forecast-revenue-131500489.html
Recognized entities: 

Company A: fortino capital
Company B: miguel valdes
Url: https://finance.yahoo.com/news/bonitasoft-exceeds-2021-forecast-revenue-131500489.html
Recognized entities: 

Company A: fortino capital
Company B: autodesk
Url: https://tech.eu/2021/01/15/oqton-funding
Recognized entities: cnc,cam,fortino capital,newsletter,kan