# Comparison to Baseline Systems
This notebooks execution has not been tested and is meant for viewing.

In this notebook the comparison of the adapted system design which uses context graph-embeddings for entity linking to simple baseline approaches is performed.

# Imports

In [2]:
import spacy
from spacy import displacy
import en_core_web_lg
nlp = spacy.load('en_core_web_lg')
import numpy as np
import pandas as pd
import pickle
import os
from nltk.tokenize import sent_tokenize
from fuzzywuzzy import process
from collections import Counter
from tqdm.notebook import tqdm
from timeit import default_timer as timer
from datetime import timedelta



In [54]:
pd.set_option('display.max_columns', None)  # or 1000
pd.set_option('display.max_rows', None)  # or 1000
pd.set_option('display.max_colwidth', None)  # or 199

In [55]:
# Spacy Pipeline Components
import neuralcoref
neuralcoref.add_to_pipe(nlp)

<spacy.lang.en.English at 0x18d66f6d308>

In [56]:
doc1 = nlp('My sister has a dog. She loves him.')
print(doc1._.coref_clusters)

doc2 = nlp('Angela lives in Boston. She is quite happy in that city.')
for ent in doc2.ents:
    print(ent._.coref_cluster)
    

[My sister: [My sister, She], a dog: [a dog, him]]
Angela: [Angela, She]
Boston: [Boston, that city]


# spaCy Stanza
For spaCy version 2.x 
More info
https://github.com/explosion/spacy-stanza/tree/2855138759541c149e277c97b892a0a1889bb609

In [57]:
import stanza
from spacy_stanza import StanzaLanguage

snlp = stanza.Pipeline(lang="en")
nlp = StanzaLanguage(snlp)

doc = nlp("Barack Obama was born in Hawaii. He was elected president in 2008.")
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.dep_, token.ent_type_)
print(doc.ents)

2022-03-23 09:45:21 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| pos       | combined  |
| lemma     | combined  |
| depparse  | combined  |
| sentiment | sstplus   |
| ner       | ontonotes |

2022-03-23 09:45:21 INFO: Use device: cpu
2022-03-23 09:45:21 INFO: Loading: tokenize
2022-03-23 09:45:22 INFO: Loading: pos
2022-03-23 09:45:22 INFO: Loading: lemma
2022-03-23 09:45:22 INFO: Loading: depparse
2022-03-23 09:45:22 INFO: Loading: sentiment
2022-03-23 09:45:23 INFO: Loading: ner
2022-03-23 09:45:24 INFO: Done loading processors!


Barack Barack PROPN nsubj:pass PERSON
Obama Obama PROPN flat PERSON
was be AUX aux:pass 
born bear VERB root 
in in ADP case 
Hawaii Hawaii PROPN obl GPE
. . PUNCT punct 
He he PRON nsubj:pass 
was be AUX aux:pass 
elected elect VERB root 
president president NOUN xcomp 
in in ADP case 
2008 2008 NUM obl DATE
. . PUNCT punct 
(Barack Obama, Hawaii, 2008)


# Matching Documents using Document Similarity Measures
We will use Document Similarity Measures to compute the Similartiy between documents

In [3]:
# This Excel will serve as our Ontology
wikitravelLocationArticles = pd.read_excel('wikitravelLocationWikipediaArticles.xlsx')
wikitravelLocationArticles.head(5)

Unnamed: 0,Wikibase Item QID,Article Name,Wikipedia Article Link,Article Summary
0,Q445486,Altstadt (Düsseldorf),https://en.wikipedia.org/wiki/Altstadt_(D%C3%B...,"The Altstadt (literally ""old town"") is one of ..."
1,Q449266,Altstadt (Frankfurt am Main),https://en.wikipedia.org/wiki/Altstadt_(Frankf...,The Altstadt (old town) is a quarter (Stadttei...
2,Q819081,Topography of Terror,https://en.wikipedia.org/wiki/Topography_of_Te...,The Topography of Terror (German: Topographie ...
3,Q1973070,Neanderkirche,https://en.wikipedia.org/wiki/Neanderkirche,The Neanderkirche (Neander Church) is a Protes...
4,Q679491,Rheinturm,https://en.wikipedia.org/wiki/Rheinturm,The Rheinturm (pronounced [ˈʁaɪ̯ntʊʁm]) (Rhine...


In [4]:
wikitravelEntityLinking = pd.read_excel('EntityLinkingDataset.xlsx')
wikitravelEntityLinking.head(5)

Unnamed: 0,Wikitravel Article Link,Mention,Description,Wikibase Item QID,Unique Link?
0,https://wikitravel.org/en/D%C3%BCsseldorf,Altstadt,"Old town (Altstadt), (U-Bahn stop: Heinrich-He...",Q445486,N
1,https://wikitravel.org/en/D%C3%BCsseldorf,Neanderkirche,The Neander-church (Neanderkirche) has its own...,Q1973070,Y
2,https://wikitravel.org/en/D%C3%BCsseldorf,Rheinturm,"Rhine Tower (Rheinturm) , (Tram stop: Platz de...",Q679491,Y
3,https://wikitravel.org/en/London,Buckingham Palace,Buckingham Palace - The official London reside...,Q42182,Y
4,https://wikitravel.org/en/London,Marble Arch,Marble Arch is a white Carrara marble monument...,Q845529,Y


# Candidate Set Generation

In [61]:
# Candidate Generation using FuzzyWuzzy String Matching
from fuzzywuzzy import process
str2Match = "apple inc"
strOptions = ["Apple Inc.","apple park","apple incorporated","iphone", "apple south", "app store", "apple iphone", "apply", "apps", "App Incorp"]
Ratios = process.extract(str2Match,strOptions, limit=10)
print(Ratios)
truth = "Apple Inc."
truth_position = 100
for i, option in enumerate(Ratios):
    if(option[0] == truth):
        truth_position = i
        break
# You can also select the string with the highest matching percentage
highest = process.extractOne(str2Match,strOptions)
print(highest)
print(truth_position)

[('Apple Inc.', 100), ('apple incorporated', 90), ('apple iphone', 76), ('App Incorp', 74), ('apply', 72), ('apps', 68), ('apple park', 67), ('apple south', 67), ('app store', 44), ('iphone', 30)]
('Apple Inc.', 100)
0


In [62]:
# We will take the first Entry in the table as a Example
mention = wikitravelEntityLinking['Mention'][9]
# mention = 'Altstadt'

str2Match = mention
strOptions = wikitravelLocationArticles['Article Name'].tolist()
ratios = process.extract(str2Match,strOptions, limit=None)
ratios = [x for x in ratios if x[1]>50]
print(ratios)

# You can also select the string with the highest matching percentage
highest = process.extractOne(str2Match,strOptions)
print(highest)

print(len(ratios))

[('Chinese garden', 100), ('Chinese Garden, Singapore', 90), ('Orangery (Royal Garden of Prague Castle)', 86), ('Royal Botanic Garden, Sydney', 86), ('Botanic Garden Zuidas', 86), ('Englischer Garten', 58), ('Grand Arche', 53), ('Chinatown, Darwin', 52), ('Glasgow Botanic Gardens', 51), ('Hessisches Landesmuseum Darmstadt', 51), ('Rheinisches Landesmuseum', 51), ('Rheinisches Landesmuseum Bonn', 51), ('Rheinisches Landesmuseum Trier', 51)]
('Chinese garden', 100)
13


In [63]:
# Candidate set generation (no limits / not on number of entries or lowest similarity score)
wikitravelEntityLinking['Candidate Index Set'] = ""
ontology = wikitravelLocationArticles['Article Name'].tolist()

for index, row in wikitravelEntityLinking.iterrows():
    index_set = []
    mention = row['Mention']

    ratios = process.extract(mention, ontology, limit=None)
    ratios = [x for x in ratios if x[1]>50]
    for candidate in ratios:
        index_set.extend(wikitravelLocationArticles.index[wikitravelLocationArticles['Article Name'] == candidate[0]].tolist())
    row['Candidate Index Set'] = index_set

# Simple approach
The metric for according to nlp-progress is micro-precision: Fraction of correctly disambiguated named entities in the full corpus.

In the first naive approach we will first only assign the best fuzzy matching candidate from the generated candidate set.
In further steps we will multiply the string matching values with further context similarity values.

In [64]:
wikitravelLocationArticles.loc[wikitravelLocationArticles['Article Name'] == 'Neanderkirche']['Wikibase Item QID'].tolist()[0]

'Q1973070'

In [90]:
start = timer()
strOptions = wikitravelLocationArticles['Article Name'].tolist()
wikitravelEntityLinking['Simple Prediction'] = ""
predictions = []
top_10_pred = []
for index, row in wikitravelEntityLinking.iterrows():
    # We will take the first Entry in the table as a Example
    mention = row['Mention']
    # mention = 'Altstadt'

    str2Match = mention
    ratios = process.extract(mention,strOptions, limit=10)

    truth = row["Wikibase Item QID"]
    truth_position = 100
    for i, option in enumerate(ratios):
        pre_qid = wikitravelLocationArticles.loc[wikitravelLocationArticles['Article Name'] == option[0]]['Wikibase Item QID'].tolist()[0]
        if(pre_qid == truth):
            truth_position = i
            break
    top_10_pred.append(truth_position)

    highest = process.extractOne(mention,strOptions)
    pred = wikitravelLocationArticles.loc[wikitravelLocationArticles['Article Name'] == highest[0]]['Wikibase Item QID'].tolist()[0]
    predictions.append(pred)

wikitravelEntityLinking['Simple Prediction'] = predictions
wikitravelEntityLinking['Simple Top 10 Prec'] = top_10_pred
end = timer()
print(timedelta(seconds=end-start))

0:00:01.761607


In [91]:
def check_results_simple_prediction(x):    
    return True if x['Wikibase Item QID'] == x["Simple Prediction"] else False
wikitravelEntityLinking['Simple Predition Result'] = wikitravelEntityLinking.apply(check_results_simple_prediction, axis=1)

In [92]:
wikitravelEntityLinking['Simple Predition Result'].value_counts()

True     64
False    16
Name: Simple Predition Result, dtype: int64

In [93]:
wikitravelEntityLinking.loc[wikitravelEntityLinking['Unique Link?'] == 'Y']['Simple Predition Result'].value_counts()

True     52
False     2
Name: Simple Predition Result, dtype: int64

In [94]:
wikitravelEntityLinking.loc[wikitravelEntityLinking['Unique Link?'] == 'N']['Simple Predition Result'].value_counts()

True     12
False     8
Name: Simple Predition Result, dtype: int64

In [95]:
wikitravelEntityLinking.loc[wikitravelEntityLinking['Unique Link?'] == 'NIL']['Simple Predition Result'].value_counts()

False    6
Name: Simple Predition Result, dtype: int64

# Jaccard Context Comparison
In the simple approach we saw that there are predictions that do not match even if the fuzzy mention-title matching gives a value of 90 - 100% matching rate. To ensure that the selcted candidate really is the right one we need to include the context of both the mention and the candidate and ensure that those are matching as well. If it is not the case we need to adapt and lower the ranking of that specific candidate and look at the context comparison with the next one.

One very simple and basic paragraph or document comparision algorithm is the Jaccard Comparison which calculates the intersection of the word in two documents.

In [130]:
import string
import nltk

# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('punkt')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

base_document = wikitravelEntityLinking['Description'][0] # Altstadt Düsseldorf
documents = wikitravelLocationArticles.iloc[wikitravelEntityLinking['Candidate Index Set'][0]]['Article Summary'].tolist()

def preprocess(text):
    # Steps:
    # 1. lowercase
    # 2. Lammetize. (It does not stem. Try to preserve structure not to overwrap with potential acronym).
    # 3. Remove stop words.
    # 4. Remove punctuations.
    # 5. Remove character with the length size of 1.

    lowered = str.lower(text)

    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(lowered)

    words = []
    for w in word_tokens:
        if w not in stop_words:
            if w not in string.punctuation:
                if len(w) > 1:
                    lemmatized = lemmatizer.lemmatize(w)
                    words.append(lemmatized)

    return words

def calculate_jaccard(word_tokens1, word_tokens2):
    # Combine both tokens to find union.
    both_tokens = word_tokens1 + word_tokens2
    union = set(both_tokens)

    # Calculate intersection.
    intersection = set()
    for w in word_tokens1:
        if w in word_tokens2:
            intersection.add(w)

    jaccard_score = len(intersection)/len(union)
    return jaccard_score

def process_jaccard_similarity(base_document, documents):

    # Tokenize the base document we are comparing against.
    base_tokens = preprocess(base_document)

    # Tokenize each document
    all_tokens = []
    for i, document in enumerate(documents):
        tokens = preprocess(document)
        all_tokens.append(tokens)

    # print("making word tokens at index:", i)

    all_scores = []
    for tokens in all_tokens:
        score = calculate_jaccard(base_tokens, tokens)

        all_scores.append(score)

    highest_score = 0
    highest_score_index = 0
    
    sorted_index = sorted(range(len(all_scores)), key=lambda k: all_scores[k], reverse=True)
    
    highest_score_index = sorted_index[0]
    highest_score = all_scores[sorted_index[0]]
    
#     for i, score in enumerate(all_scores):
#         if highest_score < score:
#             highest_score = score
#             highest_score_index = i

    most_similar_document_idxs = []
    for idx in sorted_index:
        current_doc = documents[idx]
        most_similar_document_index = wikitravelLocationArticles.index[wikitravelLocationArticles['Article Summary'] == current_doc].tolist()[0]
        most_similar_document_idxs.append(most_similar_document_index)
        
    most_similar_document = documents[highest_score_index]

    # print("Most similar document by Jaccard with the score:", most_similar_document, highest_score)
    most_similar_document_index = wikitravelLocationArticles.index[wikitravelLocationArticles['Article Summary'] == most_similar_document].tolist()[0]
    # print('Index of most similar document ', most_similar_document_index)
#     if highest_score > 0.02: 
    return most_similar_document_idxs
#     else:
#         return "NIL"

start = timer()
process_jaccard_similarity(base_document, documents)


[0, 1, 94, 90, 12, 21]

"NIL" Prediction cannot be done on this base since the jaccard comparision values are very low and a good threshold cannot be generally defined

In [120]:
#wikitravelLocationArticles['Wikibase Item QID'][0]

In [134]:
wikitravelEntityLinking['Jaccard Context Prediction'] = ""
jaccard_context_prediction = []
top_10_pred = []
for index, row in wikitravelEntityLinking.iterrows():
    
    base_document = row['Description']
    documents = wikitravelLocationArticles.iloc[row['Candidate Index Set']]['Article Summary'].tolist()

    jaccard_pred = process_jaccard_similarity(base_document, documents)
#     print('jaccard_pred = ', jaccard_pred)
#     if jaccard_pred == "NIL":
#         jaccard_context_prediction.append(jaccard_pred)
#     else:
    truth_position = 100
    truth = row["Wikibase Item QID"]
    for i, option in enumerate(jaccard_pred):
        pre_qid = wikitravelLocationArticles['Wikibase Item QID'][option]
        if(pre_qid == truth):
            truth_position = i
            break
    top_10_pred.append(truth_position)
    

    pred = wikitravelLocationArticles['Wikibase Item QID'][jaccard_pred[0]]
        
        
#     print('pred = ', pred)
    jaccard_context_prediction.append(pred)
wikitravelEntityLinking['Jaccard Top 10 Prec'] = top_10_pred
end = timer()
print(timedelta(seconds=end-start))

0:06:02.722944


In [135]:
wikitravelEntityLinking['Jaccard Context Prediction'] = jaccard_context_prediction

In [137]:
def check_results_jaccard_context_prediction(x):    
    return True if x['Wikibase Item QID'] == x["Jaccard Context Prediction"] else False
wikitravelEntityLinking['Jaccard Context Prediction Result'] = wikitravelEntityLinking.apply(check_results_jaccard_context_prediction, axis=1)

In [138]:
wikitravelEntityLinking['Jaccard Context Prediction Result'].value_counts()

True     59
False    21
Name: Jaccard Context Prediction Result, dtype: int64

In [139]:
wikitravelEntityLinking.loc[wikitravelEntityLinking['Unique Link?'] == 'Y']['Jaccard Context Prediction Result'].value_counts()

True     47
False     7
Name: Jaccard Context Prediction Result, dtype: int64

In [140]:
wikitravelEntityLinking.loc[wikitravelEntityLinking['Unique Link?'] == 'N']['Jaccard Context Prediction Result'].value_counts()

True     12
False     8
Name: Jaccard Context Prediction Result, dtype: int64

In [141]:
wikitravelEntityLinking.loc[wikitravelEntityLinking['Unique Link?'] == 'NIL']['Jaccard Context Prediction Result'].value_counts()

False    6
Name: Jaccard Context Prediction Result, dtype: int64

# BERT Document Similarity
Now we will use approved state-of-the-art Document embeddings using sentence bert.

In [40]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from nltk import sent_tokenize

from sentence_transformers import SentenceTransformer

base_document = wikitravelEntityLinking['Description'][2] # Altstadt Düsseldorf
documents = wikitravelLocationArticles.iloc[wikitravelEntityLinking['Candidate Index Set'][2]]['Article Summary'].tolist()

def process_bert_similarity(base_document, documents):
    # This will download and load the pretrained model offered by UKPLab.
    model = SentenceTransformer('bert-base-nli-mean-tokens')

    # Although it is not explicitly stated in the official document of sentence transformer, the original BERT is meant for a shorter sentence. We will feed the model by sentences instead of the whole documents.
    sentences = sent_tokenize(base_document)
    base_embeddings_sentences = model.encode(sentences)
    base_embeddings = np.mean(np.array(base_embeddings_sentences), axis=0)

    vectors = []
    for i, document in enumerate(documents):

        sentences = sent_tokenize(document)
        embeddings_sentences = model.encode(sentences)
        embeddings = np.mean(np.array(embeddings_sentences), axis=0)
        
        vectors.append(embeddings)

#         print("making vector at index:", i)
        
    scores = cosine_similarity([base_embeddings], vectors).flatten()

    highest_score = 0
    highest_score_index = 0
    for i, score in enumerate(scores):
        if highest_score < score:
            highest_score = score
            highest_score_index = i

    most_similar_document = documents[highest_score_index]
#     print("Most similar document by BERT with the score:", most_similar_document, highest_score)
    most_similar_document_index = wikitravelLocationArticles.index[wikitravelLocationArticles['Article Summary'] == most_similar_document].tolist()[0]
    # print('Index of most similar document ', most_similar_document_index)
#     if highest_score > 0.02: 
    return most_similar_document_index
#     else:
#         return "NIL"

start = timer()
process_bert_similarity(base_document, documents)

4

In [41]:
wikitravelEntityLinking['BERT Prediction'] = ""
BERT_prediction = []
for index, row in wikitravelEntityLinking.iterrows():
    
    base_document = row['Description']
    documents = wikitravelLocationArticles.iloc[row['Candidate Index Set']]['Article Summary'].tolist()

    BERT_pred = process_bert_similarity(base_document, documents)
#     print('jaccard_pred = ', jaccard_pred)
#     if jaccard_pred == "NIL":
#         jaccard_context_prediction.append(jaccard_pred)
#     else:
    pred = wikitravelLocationArticles['Wikibase Item QID'][BERT_pred]
#     print('pred = ', pred)
    BERT_prediction.append(pred)
end = timer()
print(timedelta(seconds=end-start))

0:15:25.868161


In [42]:
wikitravelEntityLinking['BERT Prediction'] = BERT_prediction

In [44]:
def check_results_BERT_prediction(x):    
    return True if x['Wikibase Item QID'] == x["BERT Prediction"] else False
wikitravelEntityLinking['BERT Prediction Result'] = wikitravelEntityLinking.apply(check_results_BERT_prediction, axis=1)

In [45]:
wikitravelEntityLinking['BERT Prediction Result'].value_counts()

True     50
False    30
Name: BERT Prediction Result, dtype: int64

In [46]:
wikitravelEntityLinking.loc[wikitravelEntityLinking['Unique Link?'] == 'Y']['BERT Prediction Result'].value_counts()

True     41
False    13
Name: BERT Prediction Result, dtype: int64

In [47]:
wikitravelEntityLinking.loc[wikitravelEntityLinking['Unique Link?'] == 'N']['BERT Prediction Result'].value_counts()

False    11
True      9
Name: BERT Prediction Result, dtype: int64

In [48]:
wikitravelEntityLinking.loc[wikitravelEntityLinking['Unique Link?'] == 'NIL']['BERT Prediction Result'].value_counts()

False    6
Name: BERT Prediction Result, dtype: int64

# Context Graph Embedding

In [5]:
wikitravelEntityLinking['Context Graph Embedding Prediction'] = ""
context_graph_embedding_prediction = []
for index, row in wikitravelEntityLinking.iterrows():
    
    base_document = row['Description']
    documents = wikitravelLocationArticles.iloc[row['Candidate Index Set']]['Article Summary'].tolist()
    
    context_graph_embedding_prediction_pred = context_graph_embedding(base_document, documents)
    
    if context_graph_embedding_prediction_pred == "NIL":
        context_graph_embedding_prediction_prediction.append(context_graph_embedding_prediction_pred)
    else:
        pred = wikitravelLocationArticles['Wikibase Item QID'][context_graph_embedding_prediction_pred]
        context_graph_embedding_prediction_prediction.append(pred)
end = timer()
print(timedelta(seconds=end-start))

In [31]:
wikitravelEntityLinking['context_graph_embedding_prediction Prediction'] = context_graph_embedding_prediction_prediction

In [33]:
def check_results_context_graph_embedding_prediction_prediction(x):    
    return True if x['Wikibase Item QID'] == x["context_graph_embedding_prediction Prediction"] else False
wikitravelEntityLinking['context_graph_embedding_prediction Prediction Result'] = wikitravelEntityLinking.apply(check_results_context_graph_embedding_prediction_prediction, axis=1)

In [7]:
wikitravelEntityLinking['context_graph_embedding_prediction Prediction Result'].value_counts()

True     52
False    28
Name: context_graph_embedding_prediction Prediction Result, dtype: int64

In [8]:
wikitravelEntityLinking.loc[wikitravelEntityLinking['Unique Link?'] == 'Y']['context_graph_embedding_prediction Prediction Result'].value_counts()

True     41
False    13
Name: context_graph_embedding_prediction Prediction Result, dtype: int64

In [9]:
wikitravelEntityLinking.loc[wikitravelEntityLinking['Unique Link?'] == 'N']['context_graph_embedding_prediction Prediction Result'].value_counts()

True     11
False     9
Name: context_graph_embedding_prediction Prediction Result, dtype: int64

In [10]:
wikitravelEntityLinking.loc[wikitravelEntityLinking['Unique Link?'] == 'NIL']['context_graph_embedding_prediction Prediction Result'].value_counts()

False    6
Name: context_graph_embedding_prediction Prediction Result, dtype: int64