### Entity Linking with spaCy

In [3]:
import rdflib
from rdflib import Graph, Literal, RDF, URIRef
from rdflib.namespace import FOAF , XSD, Namespace
import pandas as pd
import spacy
from spacy.kb import KnowledgeBase
import os
import csv
import re
import random

In [4]:
nlp = spacy.load("en_core_web_lg")

In [3]:
! python -m spacy validate

[2K[38;5;2m✔ Loaded compatibility table[0m
[1m
[38;5;4mℹ spaCy installation:
/Users/rhythmsyed/miniconda3/envs/entitylink/lib/python3.7/site-packages/spacy[0m

TYPE      NAME                        MODEL                       VERSION                            
package   en-vectors-web-lg           en_vectors_web_lg           [38;5;1m2.3.0[0m   --> 2.1.0     
package   en-trf-bertbaseuncased-lg   en_trf_bertbaseuncased_lg   [38;5;1m2.3.0[0m   --> 2.2.0     
package   en-core-web-sm              en_core_web_sm              [38;5;1m2.1.0[0m   --> 2.2.5     
package   en-core-web-lg              en_core_web_lg              [38;5;2m2.2.5[0m   [38;5;2m✔[0m

[1m
Use the following commands to update the model packages:
python -m spacy download en_core_web_sm
python -m spacy download en_trf_bertbaseuncased_lg
python -m spacy download en_vectors_web_lg



### Skywalker is ambiguous

In [8]:
starwars_text = 'Skywalker, also known as Darth Vader, is a fictional character in the Star Wars franchise'
starwars_text

'Skywalker, also known as Darth Vader, is a fictional character in the Star Wars franchise'

In [9]:
doc = nlp(starwars_text)
for ent in doc.ents:
    print(f"Named Entity '{ent.text}' with label '{ent.label_}'")

Named Entity 'Skywalker' with label 'ORG'
Named Entity 'Darth Vader' with label 'PERSON'


### Get names related to 'Skywalker' 

In [10]:
graph = rdflib.Graph()
graph.parse('./data/starwars.ttl', format='turtle')

<Graph identifier=Ncf4b9d9f386743bdbde28c54617911b0 (<class 'rdflib.graph.Graph'>)>

In [11]:
for ent in doc.ents:
    print(ent)
    query_str = """
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        SELECT ?s ?o
        WHERE {   
            ?s rdfs:label ?o.
            FILTER regex(?o, "%s")
        }
    """ % ent
    res = graph.query(query_str)
    print(list(res))
    print()

Skywalker
[(rdflib.term.URIRef('https://swapi.co/resource/human/11'), rdflib.term.Literal('Anakin Skywalker', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#string'))), (rdflib.term.URIRef('https://swapi.co/resource/human/43'), rdflib.term.Literal('Shmi Skywalker', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#string'))), (rdflib.term.URIRef('https://swapi.co/resource/human/1'), rdflib.term.Literal('Luke Skywalker', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#string')))]

Darth Vader
[(rdflib.term.URIRef('https://swapi.co/resource/human/4'), rdflib.term.Literal('Darth Vader', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#string')))]



In [21]:
query_str = """
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX voc: <https://swapi.co/vocabulary/>
    PREFIX xml: <http://www.w3.org/XML/1998/namespace>
    PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>


    SELECT ?s ?o
    WHERE {
        ?s rdfs:label ?o.
        FILTER regex(?o, "Skywalker")
    }
"""
res = graph.query(query_str)
list(res)

[(rdflib.term.URIRef('https://swapi.co/resource/human/11'),
  rdflib.term.Literal('Anakin Skywalker', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#string'))),
 (rdflib.term.URIRef('https://swapi.co/resource/human/43'),
  rdflib.term.Literal('Shmi Skywalker', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#string'))),
 (rdflib.term.URIRef('https://swapi.co/resource/human/1'),
  rdflib.term.Literal('Luke Skywalker', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#string')))]

Throw out candidates that do not have a description

In [23]:
query_str = """
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX voc: <https://swapi.co/vocabulary/>
    PREFIX xml: <http://www.w3.org/XML/1998/namespace>
    PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>


    SELECT ?s ?p ?o
    WHERE {   
        ?s <https://swapi.co/vocabulary/desc> ?o.
        FILTER (?s=<https://swapi.co/resource/human/1>)
    }
"""
res = graph.query(query_str)
list(res)

[(rdflib.term.URIRef('https://swapi.co/resource/human/1'),
  None,
  rdflib.term.Literal('Luke Skywalker is a fictional character and the main protagonist of the original film trilogy of the Star Wars franchise created by George Lucas. The character, portrayed by Mark Hamill, is an important figure in the Rebel Alliance\\s struggle against the Galactic Empire. He is the twin brother of Rebellion leader Princess Leia Organa of Alderaan, a friend and brother-in-law of smuggler Han Solo, an apprentice to Jedi Masters Obi-Wan "Ben" Kenobi and Yoda, the son of fallen Jedi Anakin Skywalker (Darth Vader) and Queen of Naboo/Republic Senator Padmé Amidala and maternal uncle of Kylo Ren / Ben Solo. The now non-canon Star Wars expanded universe depicts him as a powerful Jedi Master, husband of Mara Jade, the father of Ben Skywalker and maternal uncle of Jaina, Jacen and Anakin Solo., In 2015, the character was selected by Empire magazine as the 50th greatest movie character of all time.2 On their

In [10]:
entities = pd.read_csv('./entity_linking_data/entities.txt', header=None, delimiter='|')
entities

Unnamed: 0,0,1,2
0,Human1,Luke Skywalker,Luke Skywalker is a fictional character and th...
1,Human11,Anakin Skywalker,"Darth Vader, also known by his birth name Anak..."
2,Human43,Shmi Skywalker,The mother of Anakin Skywalker was also a brav...


In [11]:
names = {}
descriptions = {}

for i, row in entities.iterrows():
    qid = row[0]
    name = row[1]
    desc = row[2]
    names[qid] = name
    descriptions[qid] = desc

for qid in names.keys():
    print(f"qid={qid}, name={names[qid]}, desc={descriptions[qid]}")

qid=Human1, name=Luke Skywalker, desc=Luke Skywalker is a fictional character and the main protagonist of the original film trilogy of the Star Wars franchise created by George Lucas. The character, portrayed by Mark Hamill, is an important figure in the Rebel Alliance\\s struggle against the Galactic Empire. He is the twin brother of Rebellion leader Princess Leia Organa of Alderaan, a friend and brother-in-law of smuggler Han Solo, an apprentice to Jedi Masters Obi-Wan Ben" Kenobi and Yoda, the son of fallen Jedi Anakin Skywalker (Darth Vader) and Queen of Naboo/Republic Senator Padmé Amidala and maternal uncle of Kylo Ren / Ben Solo. The now non-canon Star Wars expanded universe depicts him as a powerful Jedi Master, husband of Mara Jade, the father of Ben Skywalker and maternal uncle of Jaina, Jacen and Anakin Solo., In 2015, the character was selected by Empire magazine as the 50th greatest movie character of all time.2 On their list of the 100 Greatest Fictional Characters, Fando

### Create spaCy KB

In [12]:
kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=300)

In [13]:
for qid, desc in descriptions.items():
    desc_doc = nlp(desc)
    desc_enc = desc_doc.vector
    print(desc_enc)
    kb.add_entity(entity=qid, entity_vector=desc_enc, freq=342)
len(kb)

[ 1.64877053e-03  3.77046652e-02  5.96367754e-03 -5.53854965e-02
  7.98609853e-02 -6.66870475e-02  6.82777986e-02 -1.11688018e-01
  1.85674485e-02  1.62822092e+00 -1.16596244e-01 -9.74116102e-02
 -2.29512099e-02 -3.45174596e-02  6.73938021e-02 -8.70725587e-02
  2.79771425e-02  6.24269247e-01 -1.38549373e-01 -2.87350845e-02
 -1.68803688e-02 -1.49233624e-01 -6.45916462e-02 -1.28831773e-03
  5.45581132e-02  1.98846422e-02 -6.17490001e-02  3.04645821e-02
 -3.81340161e-02  1.52793646e-01 -1.62351709e-02  1.34893492e-01
 -9.27841216e-02  8.85419026e-02  7.03666955e-02 -1.05042279e-01
 -9.47420374e-02 -5.08919172e-02 -2.00832095e-02 -5.27302809e-02
  1.32045671e-02  6.32223263e-02 -1.21270446e-02 -4.83484305e-02
  5.68462349e-02  2.16762777e-02 -6.54969141e-02  8.30268040e-02
  4.94984761e-02 -3.19121294e-02  1.34552777e-01  1.01578012e-02
  1.95301063e-02  8.47533904e-03  2.00974885e-02  4.20924164e-02
  5.50460853e-02  2.29815021e-03  7.81608224e-02 -8.20094496e-02
 -3.50226904e-03 -1.14277

3

In [14]:
for qid, name in names.items():
    print(qid, name)
    kb.add_alias(alias=name, entities=[qid], probabilities=[1])

Human1 Luke Skywalker
Human11 Anakin Skywalker
Human43 Shmi Skywalker


In [15]:
qids = names.keys()
probs = [1/len(qids) for qid in qids]
kb.add_alias(alias="Skywalker", entities=qids, probabilities=probs)

7791551801117974099

In [16]:
print(f"Entities in the KB: {kb.get_entity_strings()}")
print(f"Aliases in the KB: {kb.get_alias_strings()}")

Entities in the KB: ['Human1', 'Human11', 'Human43']
Aliases in the KB: ['Skywalker', 'Shmi Skywalker', 'Anakin Skywalker', 'Luke Skywalker']


In [17]:
print(f"Candidates for 'Shmi Skywalker': {[c.entity_ for c in kb.get_candidates('Shmi Skywalker')]}")
print(f"Candidates for 'Anakin Skywalker': {[c.entity_ for c in kb.get_candidates('Anakin Skywalker')]}")
print(f"Candidates for 'Luke Skywalker': {[c.entity_ for c in kb.get_candidates('Luke Skywalker')]}")
print(f"Candidates for 'Skywalker': {[c.entity_ for c in kb.get_candidates('Skywalker')]}")
print(f"Candidates for 'R2-D2': {[c.entity_ for c in kb.get_candidates('R2-D2')]}")

Candidates for 'Shmi Skywalker': ['Human43']
Candidates for 'Anakin Skywalker': ['Human11']
Candidates for 'Luke Skywalker': ['Human1']
Candidates for 'Skywalker': ['Human1', 'Human11', 'Human43']
Candidates for 'R2-D2': []


In [18]:
label_to_name = {
    'Human11': 'Anakin Skywalker',
    'Human1': 'Luke Skywalker',
    'Human43': 'Shmi Skywalker'
}

### Save KB

In [19]:
output_dir = './entity_linking_data'
if not os.path.exists(output_dir):
    os.mkdir(output_dir) 
kb.dump(output_dir + "/my_kb")

In [20]:
nlp.to_disk(output_dir + "/my_nlp")

### Train EntityLinker: Create Training/Testing Datasets

Creating training data by scraping sentences with 'Skywalker' from internet

> https://starwars.fandom.com/wiki/Shmi_Skywalker_Lars

In [32]:
corpus = []
labels = []

with open ('./entity_linking_data/skywalker_input_data.txt', 'r') as f:
    for row in csv.reader(f,delimiter=','):
        text, label = row[0].strip(), row[1].strip()
        corpus.append(text)
        labels.append(label)

print(corpus[0], labels[0])
print(corpus[-1], labels[-1])

His alter ego, Darth Vader, the Dark Lord of the Sith, was created when Skywalker turned to the dark side of the Force, pledging his allegiance to the Sith Lord Darth Sidious at the end of the Republic Era. Human11
Shortly after Skywalker's death, her son went on to become a Jedi Knight, as well as a general in the Grand Army of the Republic. Human43


> Training data must be representative of ambiguous entities after running NER

In [58]:
doc = nlp(corpus[2])
print(doc.ents[0].start_char, doc.ents[0].end_char)

0 16


In [42]:
for idx, i in enumerate(corpus):
    doc = nlp(corpus[idx])
    doc.ents.index
    print(idx+1, doc.ents)

1 (Darth Vader, Skywalker, Darth Sidious, the Republic Era)
2 (Skywalker, Tatooine, the Outer Rim Territories, 41)
3 (Anakin Skywalker, Shmi)
4 (the early days, Skywalker, 501st Legion, the Confederacy of Independent Systems)
5 (Skywalker, Padmé Amidala, Naboo)
6 (19, Skywalker, Supreme, Sheev Palpatine, Darth Sidious)
7 (Skywalker, Amidala, the Jedi Order, Sidious)
8 (the Chosen One, Skywalker, one, Force)
9 (Skywalker, one, Jedi and Sith)
10 (Skywalker, Tatooine, Shmi, Skywalker)
11 (Skywalker, Jedi Master, the Galactic Civil War, the Galactic Empire)
12 (Leia Organa, Han Solo, Skywalker, the Alliance to Restore the Republic, the Galactic Empire)
13 (Skywalker, Rebellion, Obi-Wan Kenobi)
14 (Skywalker, the Battle of Endor, 4, ABY)
15 (Vader, Skywalker)
16 (Skywalker, Jakku, Rey)
17 (Skywalker, Rey, Darth Sidious)
18 (Skywalker, Rey, Jedi)
19 (Skywalker, Lars)
20 (Tikaroo, Skywalker, Kivas, Devaron)
21 (Skywalker,)
22 (Hutts, Skywalker, Zygerrian, Anakin)
23 (Skywalker,)
24 (Skywalker

In [45]:
dataset = []
for index, text in enumerate(corpus):
    offset = re.search(r'Skywalker', text).span()
    dataset.append((text, {"links": {offset: {labels[index]:1.0}}}))

print(dataset[2])

('Anakin Skywalker was the son of Shmi, a slave who conceived a child without a father.', {'links': {(7, 16): {'Human11': 1.0}}})


In [35]:
gold_ids = []
for text, annot in dataset:
    for span, links_dict in annot["links"].items():
        for link, value in links_dict.items():
            if value:
                gold_ids.append(link)

from collections import Counter
print(Counter(gold_ids))

Counter({'Human11': 10, 'Human1': 10, 'Human43': 10})


In [36]:
train_dataset = []
test_dataset = []
for QID in qids:
    indices = [i for i, j in enumerate(gold_ids) if j == QID]
    train_dataset.extend(dataset[index] for index in indices[0:8])  # first 8 in training
    test_dataset.extend(dataset[index] for index in indices[8:10])  # last 2 in test
    
random.shuffle(train_dataset)
random.shuffle(test_dataset)

In [37]:
TRAIN_DOCS = []
for text, annotation in train_dataset:
    doc = nlp(text)
    TRAIN_DOCS.append((doc, annotation))

In [38]:
TRAIN_DOCS[:5]

[(A vergence in the Force, Skywalker was born on the desert planet of Tatooine in the Outer Rim Territories in 41 BBY.,
  {'links': {(25, 34): {'Human11': 1.0}}}),
 (Skywalker was a peaceful, selfless and kind person.,
  {'links': {(0, 9): {'Human43': 1.0}}}),
 (While Skywalker was caring and compassionate, his fear of losing Amidala in childbirth caused him to turn against the Jedi Order, believing Sidious had the knowledge to cheat death.,
  {'links': {(6, 15): {'Human11': 1.0}}}),
 (Having fulfilled his destiny as the Chosen One, Skywalker made peace with his son and became one with the Force.,
  {'links': {(48, 57): {'Human11': 1.0}}}),
 (After his master's death, Skywalker participated in the Battle of Endor in 4 ABY.,
  {'links': {(26, 35): {'Human1': 1.0}}})]

### Create Entity_Linker pipeline

In [39]:
entity_linker = nlp.create_pipe("entity_linker", config={"incl_prior": False})
entity_linker.set_kb(kb)
nlp.add_pipe(entity_linker, last=True)

ValueError: [E007] 'entity_linker' already exists in pipeline. Existing names: ['tagger', 'parser', 'ner', 'entity_linker']

In [40]:
from spacy.util import minibatch, compounding

other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "entity_linker"]
with nlp.disable_pipes(*other_pipes):   # train only the entity_linker
    optimizer = nlp.begin_training()
    for itn in range(500):   # 500 iterations takes about a minute to train
        random.shuffle(TRAIN_DOCS)
        batches = minibatch(TRAIN_DOCS, size=compounding(4.0, 32.0, 1.001))  # increasing batch sizes
        losses = {}
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(
                texts,  
                annotations,   
                drop=0.5,      # prevent overfitting
                losses=losses,
                sgd=optimizer,
            )
        if itn % 50 == 0:
            print(itn, "Losses", losses)   # print the training loss
print(itn, "Losses", losses)

RuntimeError: [E188] Could not match the gold entity links to entities in the doc - make sure the gold EL data refers to valid results of the named entity recognizer in the `nlp` pipeline.

In [31]:
other_pipes

['tagger', 'parser', 'ner']

### Test the EL Model

In [59]:
text = 'Skywalker, also known as Darth Vader, is a fictional character in the Star Wars franchise'
doc = nlp(text)
for ent in doc.ents:
    print(ent.text, ent.label_, ent.kb_id_, str(label_to_name.get(ent.kb_id_)))

Skywalker ORG Human11 Anakin Skywalker
Darth Vader PERSON NIL None


> Explain why Darth Vader is None (not trained)

### Test on Training Data

In [60]:
correct = 0
for text, true_annot in train_dataset:
    print(text)
    print(f"Gold annotation: {true_annot}")
    doc = nlp(text)
    for ent in doc.ents:
        if ent.text == "Skywalker":
            print(f"Prediction: {ent.text}, {ent.label_}, {ent.kb_id_}, {str(label_to_name.get(ent.kb_id_))}")
            if ent.kb_id_ == list(true_annot['links'][list(true_annot['links'].keys())[0]].keys())[0]:
                correct += 1
    print()
print('Correct: {} out of {}, Accuracy: {}'.format(correct, len(train_dataset), correct/len(train_dataset)))

A vergence in the Force, Skywalker was born on the desert planet of Tatooine in the Outer Rim Territories in 41 BBY.
Gold annotation: {'links': {(25, 34): {'Human11': 1.0}}}
Prediction: Skywalker, PERSON, Human11, Anakin Skywalker

Skywalker was a peaceful, selfless and kind person.
Gold annotation: {'links': {(0, 9): {'Human43': 1.0}}}
Prediction: Skywalker, ORG, Human43, Shmi Skywalker

While Skywalker was caring and compassionate, his fear of losing Amidala in childbirth caused him to turn against the Jedi Order, believing Sidious had the knowledge to cheat death.
Gold annotation: {'links': {(6, 15): {'Human11': 1.0}}}
Prediction: Skywalker, PERSON, Human11, Anakin Skywalker

Having fulfilled his destiny as the Chosen One, Skywalker made peace with his son and became one with the Force.
Gold annotation: {'links': {(48, 57): {'Human11': 1.0}}}
Prediction: Skywalker, PERSON, Human11, Anakin Skywalker

After his master's death, Skywalker participated in the Battle of Endor in 4 ABY.
Go

In [61]:
correct = 0
for text, true_annot in test_dataset:
    print(text)
    print(f"Gold annotation: {true_annot}")
    doc = nlp(text)
    for ent in doc.ents:
        if ent.text == "Skywalker":
            print(f"Prediction: {ent.text}, {ent.label_}, {ent.kb_id_}, {str(label_to_name.get(ent.kb_id_))}")
            if ent.kb_id_ == list(true_annot['links'][list(true_annot['links'].keys())[0]].keys())[0]:
                correct += 1
    print()
print('Correct: {} out of {}, Accuracy: {}'.format(correct, len(test_dataset), correct/len(test_dataset)))

Shortly after Skywalker's death, her son went on to become a Jedi Knight, as well as a general in the Grand Army of the Republic.
Gold annotation: {'links': {(14, 23): {'Human43': 1.0}}}
Prediction: Skywalker, PERSON, Human11, Anakin Skywalker

Although Skywalker was born on the desert planet of Tatooine, some sources stated that Shmi and he moved to the planet when Skywalker was at a very young age.
Gold annotation: {'links': {(9, 18): {'Human11': 1.0}}}
Prediction: Skywalker, PERSON, Human43, Shmi Skywalker
Prediction: Skywalker, PERSON, Human43, Shmi Skywalker

Having seen her son again, Skywalker told him she was complete but died in his arms.
Gold annotation: {'links': {(27, 36): {'Human43': 1.0}}}
Prediction: Skywalker, PERSON, Human1, Luke Skywalker

Skywalker spent his youth on the Lars' moisture farm, where his uncle unsuccessfully tried to discourage his adventurous tendencies, telling him that his father had been a navigator on a spice freighter" during the Clone Wars."
Gold

### Save Model

In [None]:
output_dir = './entity_linking_data'
nlp.to_disk(output_dir + "/trained_el")

### Load and Predict

In [3]:
output_dir = './entity_linking_data'
nlp = spacy.load(output_dir + "/trained_el")

In [5]:
text = 'Skywalker, also known as Darth Vader, is a fictional character in the Star Wars franchise'
doc = nlp(text)
for ent in doc.ents:
    print(ent.text, ent.label_, ent.kb_id_, str(label_to_name.get(ent.kb_id_)))

KeyboardInterrupt: 

In [13]:
print(str(1) + ',' + str(2))
print(*[1,2,3], sep=',')
**kwargs

1,2
1,2,3


### Entity Linking Use Case

In [64]:
text = 'Darth Vader, also known by his birth name Anakin Skywalker, is a fictional character in the Star Wars franchise.123 Vader appears in the original film trilogy as a pivotal antagonist whose actions drive the plot, while his past as Anakin Skywalker and the story of his corruption are central to the narrative of the prequel trilogy., The character was created by George Lucas and has been portrayed by numerous actors. His appearances span the first six Star Wars films, as well as Rogue One, and his character is heavily referenced in Star Wars: The Force Awakens. He is also an important character in the Star Wars expanded universe of television series, video games, novels, literature and comic books. Originally a Jedi prophesied to bring balance to the Force, he falls to the dark side of the Force and serves the evil Galactic Empire at the right hand of his Sith master, Emperor Palpatine (also known as Darth Sidious).4 He is also the father of Luke Skywalker and Princess Leia Organa, secret husband of Padmé Amidala and grandfather of Kylo Ren., Darth Vader has become one of the most iconic villains in popular culture, and has been listed among the greatest villains and fictional characters ever.56 The American Film Institute listed him as the third greatest movie villain in cinema history on 100 Years... 100 Heroes and Villains, behind Hannibal Lecter and Norman Bates.7 However, other critics consider him a tragic hero, citing his original motivations for the greater good before his fall to the dark side.'
text

'Darth Vader, also known by his birth name Anakin Skywalker, is a fictional character in the Star Wars franchise.123 Vader appears in the original film trilogy as a pivotal antagonist whose actions drive the plot, while his past as Anakin Skywalker and the story of his corruption are central to the narrative of the prequel trilogy., The character was created by George Lucas and has been portrayed by numerous actors. His appearances span the first six Star Wars films, as well as Rogue One, and his character is heavily referenced in Star Wars: The Force Awakens. He is also an important character in the Star Wars expanded universe of television series, video games, novels, literature and comic books. Originally a Jedi prophesied to bring balance to the Force, he falls to the dark side of the Force and serves the evil Galactic Empire at the right hand of his Sith master, Emperor Palpatine (also known as Darth Sidious).4 He is also the father of Luke Skywalker and Princess Leia Organa, secr

In [65]:
doc = nlp(text)
for ent in doc.ents:
    print(ent.text, ent.label_, ent.kb_id_, str(label_to_name.get(ent.kb_id_)))

Darth Vader PERSON NIL None
Anakin Skywalker PERSON Human11 Anakin Skywalker
Anakin Skywalker PERSON Human11 Anakin Skywalker
George Lucas PERSON NIL None
first ORDINAL NIL None
six CARDINAL NIL None
Star Wars: The Force Awakens WORK_OF_ART NIL None
Star ORG NIL None
Force ORG NIL None
Emperor Palpatine PERSON NIL None
Darth PERSON NIL None
Luke Skywalker PERSON Human1 Luke Skywalker
Leia Organa PERSON NIL None
Padmé Amidala PERSON NIL None
Kylo Ren PERSON NIL None
Darth Vader PERSON NIL None
The American Film Institute ORG NIL None
third ORDINAL NIL None
100 Years DATE NIL None
100 CARDINAL NIL None
Hannibal Lecter PERSON NIL None
Norman Bates.7 PERSON NIL None


> Regex is a bottleneck

In [66]:
for ent in doc.ents:
    print(ent)
    query_str = """
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        SELECT ?s ?o
        WHERE {   
            ?s rdfs:label ?o.
            FILTER regex(?o, "%s")
        }
    """ % ent
    res = graph.query(query_str)
    print(list(res))
    print()

Darth Vader
[(rdflib.term.URIRef('https://swapi.co/resource/human/4'), rdflib.term.Literal('Darth Vader', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#string')))]

Anakin Skywalker
[(rdflib.term.URIRef('https://swapi.co/resource/human/11'), rdflib.term.Literal('Anakin Skywalker', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#string')))]

Anakin Skywalker
[(rdflib.term.URIRef('https://swapi.co/resource/human/11'), rdflib.term.Literal('Anakin Skywalker', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#string')))]

George Lucas
[]

first
[]

six
[]

Star Wars: The Force Awakens
[]

Star
[(rdflib.term.URIRef('https://swapi.co/resource/starship/3'), rdflib.term.Literal('Star Destroyer', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#string'))), (rdflib.term.URIRef('https://swapi.co/resource/starship/9'), rdflib.term.Literal('Death Star', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#string'))), (rdflib.term.URIRef