In [15]:
import csv
import spacy
import en_core_web_sm
from bs4 import BeautifulSoup
import requests
import re

In [44]:
# get all the distict urls in the train data
urls = set()
train_compA_compB_url = []
with open("./train_data_arboretica.csv",'r', encoding='UTF-8') as file:
    csv_reader = csv.reader(file)
    next(csv_reader)  # skip the header
    for row in csv_reader:
        row_url = row[3]
        if row_url.lstrip().rstrip() == 'https://www.patentlyapple.com/patently-apple/2016/11/taiwans-largest-industrial-conglomerate-now-considering-opening-plants-in-america-if-trump-reintroduces-tariffs.html':
            continue
        train_compA_compB_url.append([row[0].lower(),row[1].lower(),row_url])
        if ';' in row_url:
            multiple_urls = row_url.split(';')  # split the urls which are combined together in train data
            for x in multiple_urls:
                if 'https://www.patentlyapple.com/patently-apple/2016/11/taiwans-largest-industrial-conglomerate-now-considering-opening-plants-in-america-if-trump-reintroduces-tariffs.html' in x:
                    continue
                urls.add(x)
        else:
            urls.add(row_url)
urls = list(urls)

In [45]:
print(len(train_compA_compB_url))  # number of training data
print(len(urls)) # number of distinct urls

329
275


In [35]:
def url_to_string(url):
    res = requests.get(url)
    html = res.text
    soup = BeautifulSoup(html, 'html5lib')
    for script in soup(["script", "style", 'aside']):
        script.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))

In [38]:
roberta_nlp  = spacy.load('en_core_web_trf', disable=['tok2vec','tagger','parser','attribute_ruler','lemmatizer'])
url_orgs = dict()
for url in urls:
    ny_bb = url_to_string(url)
#     if (len(ny_bb)>512):  # Token indices maximum sequence length is 512
#         ny_bb = ny_bb[:512]
    article = roberta_nlp(ny_bb)
    orgs = list(set([x.text.lower() for x in article.ents if x.label_ == 'ORG' and len(x.text) < 100]))
    url_orgs[url] = orgs

Token indices sequence length is longer than the specified maximum sequence length for this model (2792 > 512). Running this sequence through the model will result in indexing errors


In [84]:
failure = 0
failure_data = [] # store unrecognized training data entities
for data in train_compA_compB_url:  # check whether both company A and B were recognized in the corresponding article 
    company_list = []
    if ';' in data[2]:
        for url_1 in data[2].split(';'):
            if 'https://www.patentlyapple.com/patently-apple/2016/11/taiwans-largest-industrial-conglomerate-now-considering-opening-plants-in-america-if-trump-reintroduces-tariffs.html' in url_1:
                    continue
            company_list += url_orgs[url_1]
    else: 
        company_list += url_orgs[data[2]]
        
    # some company names are shorter in the traning data than in the original article, 
    # for example, 'fortino capital' in training data vs 'fortino capital partners' in the article
    companies = ",".join(company_list)  
    if (data[0] not in companies or data[1] not in companies) and len(companies)!=0:
        failure_data.append([data[0],data[1],data[2],companies])
        failure += 1

# exclude those training data for which we cannot get html or the website is invalid
num_request_failure = len([x for x in url_orgs.values() if len(x)==0])
print('Number of failed request to get html: ' + (str) (num_request_failure))

Number of failed request to get html: 30


In [85]:
print('Number of unrecognized training data: ' + (str) (failure))
print('Number of valid training data: ' + (str) (len(train_compA_compB_url) - num_request_failure))
print('True Positive Rate: ' + (str) (1 - failure/len(train_compA_compB_url)))

Number of unrecognized training data: 59
Number of valid training data: 299
True Positive Rate: 0.8206686930091185


In [86]:
# examples of entities that were not recognized
for i in range(5):
    print('Company A: ' + failure_data[i][0])
    print('Company B: ' + failure_data[i][1])
    print('Url: ' + failure_data[i][2])
    print('Recognized entities: ' + failure_data[i][3] + '\n')

Company A: fortino capital
Company B: melita
Url: https://www.capacitymedia.com/articles/3824316/eqt-adds-to-telecoms-portfolio-by-buying-german-fibre-network-inexio-for-1bn
Recognized entities: cloudfront

Company A: fortino capital
Company B: eqt
Url: https://www.capacitymedia.com/articles/3824316/eqt-adds-to-telecoms-portfolio-by-buying-german-fibre-network-inexio-for-1bn
Recognized entities: cloudfront

Company A: fortino capital
Company B: duco sickinghe
Url: https://trends.knack.be/economie/bedrijven/brusselse-start-up-aproplan-wil-groeien-met-fusie-ervaren-ceo-en-vers-kapitaal/article-normal-1423401.html
Recognized entities: bouygues immobilier,fortino capital,bouwbedrijf matexi,spreds,deense,letsbuild,aproplan,inventures,genieblet,telenet,treatwell,just eat,de deense multinational solar,het deense geniebelt,crunchbase,geniebelt,geniebelt vooral scandinavië

Company A: fortino capital
Company B: vï¿½ï¿½ï¿½ï¿½ï¿½ï¿
Url: https://www.finsmes.com/2017/03/veronique-peeters-joins-fort