In [2]:
import spacy
import pandas as pd
from spacy import displacy
import en_core_web_lg
from pathlib import Path

In [3]:
#! python -m spacy download en_trf_bertbaseuncased_lg

In [4]:
# Load the large English spaCy pipeline used by the parsing and NER sections.
# NOTE: `nlp` is rebound to the BERT-based pipeline in the "Encoding using BERT" section,
# so cells re-run after that point use a different model.
nlp = en_core_web_lg.load()

# Parsing & Part-of-Speech Tags

<b>Text</b>: The original word text.<br>
<b>Lemma</b>: The base form of the word.<br>
<b>POS</b>: The simple UPOS part-of-speech tag.<br>
<b>Tag</b>: The detailed part-of-speech tag.<br>
<b>Dep</b>: Syntactic dependency, i.e. the relation between tokens.<br>
<b>Shape</b>: The word shape – capitalization, punctuation, digits.<br>
<b>is_alpha</b>: Does the token consist only of alphabetic characters?<br>
<b>is_stop</b>: Is the token part of a stop list, i.e. the most common words of the language?<br>

In [5]:
doc = nlp('Darth Vader is also known by his birth name Anakin Skywalker.')

# Collect one record per token and build the frame in a single call.
# Growing a DataFrame row-by-row with DataFrame.append copies the frame on every
# iteration, and the method no longer exists in pandas >= 2.0.
# `columns=` fixes the column order and keeps the header even for an empty doc.
results = pd.DataFrame(
    [
        {
            'Text': token.text,
            'Lemma': token.lemma_,
            'POS': token.pos_,
            'Tag': token.tag_,
            'Dep': token.dep_,
            'Shape': token.shape_,
            'is_alpha': token.is_alpha,
            'is_stop': token.is_stop,
        }
        for token in doc
    ],
    columns=['Text', 'Lemma', 'POS', 'Tag', 'Dep', 'Shape', 'is_alpha', 'is_stop'],
)
results

Unnamed: 0,Text,Lemma,POS,Tag,Dep,Shape,is_alpha,is_stop
0,Darth,Darth,PROPN,NNP,compound,Xxxxx,True,False
1,Vader,Vader,PROPN,NNP,nsubjpass,Xxxxx,True,False
2,is,be,AUX,VBZ,auxpass,xx,True,True
3,also,also,ADV,RB,advmod,xxxx,True,True
4,known,know,VERB,VBN,ROOT,xxxx,True,False
5,by,by,ADP,IN,prep,xx,True,True
6,his,-PRON-,DET,PRP$,poss,xxx,True,True
7,birth,birth,NOUN,NN,compound,xxxx,True,False
8,name,name,NOUN,NN,pobj,xxxx,True,True
9,Anakin,Anakin,PROPN,NNP,compound,Xxxxx,True,False


In [6]:
# Render the dependency parse of `doc` inline in the notebook.
# NOTE(review): with jupyter=True displaCy displays the markup itself and appears to
# return None, so `svg` probably does not hold the SVG string. Confirm (or render with
# jupyter=False) before enabling the file export below.
svg = displacy.render(doc, style="dep",jupyter=True)
# Optional export of the rendered markup to dep.svg (currently disabled).
#output_path = Path("dep.svg")
#output_path.open("w", encoding="utf-8").write(svg)

# Named Entities (NER)

<b>Text</b>: The original entity text.<br>
<b>Start</b>: Character offset of the start of the entity in the text.<br>
<b>End</b>: Character offset of the end of the entity in the text.<br>
<b>Label</b>: Entity label, i.e. type.<br>

In [7]:
doc = nlp('Darth Vader is also known by his birth name Anakin Skywalker.')

# One record per named entity, assembled into the frame in a single call.
# DataFrame.append in a loop is quadratic and was removed in pandas 2.0.
# Start/End are character offsets into the text (ent.start_char / ent.end_char).
results = pd.DataFrame(
    [
        {
            'Text': ent.text,
            'Start': ent.start_char,
            'End': ent.end_char,
            'Label': ent.label_,
        }
        for ent in doc.ents
    ],
    columns=['Text', 'Start', 'End', 'Label'],
)
results

Unnamed: 0,Text,Start,End,Label
0,Darth Vader,0,11,PERSON
1,Anakin Skywalker,44,60,PERSON


In [8]:
# Highlight the named entities of `doc`. Unlike the dependency render above, no
# `jupyter` flag is passed here — presumably displaCy auto-detects the notebook; confirm.
displacy.render(doc, style="ent")

# Larger Text Example

In [9]:
article = 'Darth Vader, also known by his birth name Anakin Skywalker, is a fictional character in the Star Wars franchise.123 Vader appears in the original film trilogy as a pivotal antagonist whose actions drive the plot, while his past as Anakin Skywalker and the story of his corruption are central to the narrative of the prequel trilogy., The character was created by George Lucas and has been portrayed by numerous actors. His appearances span the first six Star Wars films, as well as Rogue One, and his character is heavily referenced in Star Wars: The Force Awakens. He is also an important character in the Star Wars expanded universe of television series, video games, novels, literature and comic books. Originally a Jedi prophesied to bring balance to the Force, he falls to the dark side of the Force and serves the evil Galactic Empire at the right hand of his Sith master, Emperor Palpatine (also known as Darth Sidious).4 He is also the father of Luke Skywalker and Princess Leia Organa, secret husband of Padmé Amidala and grandfather of Kylo Ren., Darth Vader has become one of the most iconic villains in popular culture, and has been listed among the greatest villains and fictional characters ever.56 The American Film Institute listed him as the third greatest movie villain in cinema history on 100 Years... 100 Heroes and Villains, behind Hannibal Lecter and Norman Bates.7 However, other critics consider him a tragic hero, citing his original motivations for the greater good before his fall to the dark side.'
article

'Darth Vader, also known by his birth name Anakin Skywalker, is a fictional character in the Star Wars franchise.123 Vader appears in the original film trilogy as a pivotal antagonist whose actions drive the plot, while his past as Anakin Skywalker and the story of his corruption are central to the narrative of the prequel trilogy., The character was created by George Lucas and has been portrayed by numerous actors. His appearances span the first six Star Wars films, as well as Rogue One, and his character is heavily referenced in Star Wars: The Force Awakens. He is also an important character in the Star Wars expanded universe of television series, video games, novels, literature and comic books. Originally a Jedi prophesied to bring balance to the Force, he falls to the dark side of the Force and serves the evil Galactic Empire at the right hand of his Sith master, Emperor Palpatine (also known as Darth Sidious).4 He is also the father of Luke Skywalker and Princess Leia Organa, secr

In [10]:
doc = nlp(article)

# Same token table as for the single sentence, now over the whole article.
# Built from a list of records in one call: DataFrame.append inside a loop is
# quadratic in the number of tokens and was removed in pandas 2.0.
results = pd.DataFrame(
    [
        {
            'Text': token.text,
            'Lemma': token.lemma_,
            'POS': token.pos_,
            'Tag': token.tag_,
            'Dep': token.dep_,
            'Shape': token.shape_,
            'is_alpha': token.is_alpha,
            'is_stop': token.is_stop,
        }
        for token in doc
    ],
    columns=['Text', 'Lemma', 'POS', 'Tag', 'Dep', 'Shape', 'is_alpha', 'is_stop'],
)
results

Unnamed: 0,Text,Lemma,POS,Tag,Dep,Shape,is_alpha,is_stop
0,Darth,Darth,PROPN,NNP,compound,Xxxxx,True,False
1,Vader,Vader,PROPN,NNP,nsubj,Xxxxx,True,False
2,",",",",PUNCT,",",punct,",",False,False
3,also,also,ADV,RB,advmod,xxxx,True,True
4,known,know,VERB,VBN,acl,xxxx,True,False
...,...,...,...,...,...,...,...,...
277,to,to,ADP,IN,prep,xx,True,True
278,the,the,DET,DT,det,xxx,True,True
279,dark,dark,ADJ,JJ,amod,xxxx,True,False
280,side,side,NOUN,NN,pobj,xxxx,True,True


In [11]:
#displacy.render(doc, style="dep", jupyter=True)

# Stanford Open Information Extraction using CoreNLP

https://nlp.stanford.edu/software/openie.html

Extracting {entity1, relation, entity2} triples:

In [12]:
from openie import StanfordOpenIE

In [13]:
# Extract {subject, relation, object} triples from a single sentence.
with StanfordOpenIE() as client:
    text = 'Darth Vader is also known by his birth name Anakin Skywalker.'
    # The sentence already ends with a period; the previous 'Text: %s.' format
    # appended a second one ("Skywalker..").
    print(f'Text: {text}')

    for triple in client.annotate(text):
        print(triple)

Text: Darth Vader is also known by his birth name Anakin Skywalker..
Starting server with command: java -Xmx8G -cp /Users/rhythmsyed/stanfordnlp_resources/stanford-corenlp-full-2018-10-05/* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 60000 -threads 5 -maxCharLength 100000 -quiet True -serverProperties corenlp_server-535e92b723cd4e6f.props -preload openie
{'subject': 'Darth Vader', 'relation': 'is known by', 'object': 'his birth name Anakin Skywalker'}
{'subject': 'Darth Vader', 'relation': 'is', 'object': 'also known'}
{'subject': 'Darth Vader', 'relation': 'is', 'object': 'known'}
{'subject': 'Darth Vader', 'relation': 'is also known by', 'object': 'his birth name Anakin Skywalker'}


Trying longer document text:

In [14]:
# Extract triples from the full article.
# NOTE: `article` still contains footnote residue from the source page (e.g.
# "franchise.123 Vader"), which shows up in extracted subjects such as '.123 Vader'.
with StanfordOpenIE() as client:
    # The article already ends with a period, so none is appended here.
    print(f'Text: {article}')
    triples = list(client.annotate(article))

Text: Darth Vader, also known by his birth name Anakin Skywalker, is a fictional character in the Star Wars franchise.123 Vader appears in the original film trilogy as a pivotal antagonist whose actions drive the plot, while his past as Anakin Skywalker and the story of his corruption are central to the narrative of the prequel trilogy., The character was created by George Lucas and has been portrayed by numerous actors. His appearances span the first six Star Wars films, as well as Rogue One, and his character is heavily referenced in Star Wars: The Force Awakens. He is also an important character in the Star Wars expanded universe of television series, video games, novels, literature and comic books. Originally a Jedi prophesied to bring balance to the Force, he falls to the dark side of the Force and serves the evil Galactic Empire at the right hand of his Sith master, Emperor Palpatine (also known as Darth Sidious).4 He is also the father of Luke Skywalker and Princess Leia Organa,

In [15]:
triples[:3]

[{'subject': 'Darth Vader', 'relation': 'is', 'object': 'fictional character'},
 {'subject': 'fictional character',
  'relation': 'is in',
  'object': 'Star Wars franchise'},
 {'subject': 'Vader', 'relation': 'appears in', 'object': 'film trilogy'}]

In [16]:
print(f'Number of Entity-Relation triples: {len(triples)}')

Number of Entity-Relation triples: 89


# Encoding using BERT

https://explosion.ai/blog/spacy-transformers

In [17]:
# `spacy` was already imported in the first cell; repeating the import is harmless.
import spacy
import en_trf_bertbaseuncased_lg

# NOTE: this rebinds `nlp`, replacing the en_core_web_lg pipeline loaded at the top of
# the notebook. Every cell below this point runs on the BERT-based pipeline.
nlp = en_trf_bertbaseuncased_lg.load()

Use word-level embeddings from BERT to add context to tokens:

In [18]:
# Three sentences that all start with the token "Apple": the first two are about the
# company, the third about the fruit.
apple1 = nlp("Apple shares rose on the news.")
apple2 = nlp("Apple sold fewer iPhones this quarter.")
apple3 = nlp("Apple pie is delicious.")

In [19]:
# Compare the first token ("Apple") of sentence 1 with the first token of sentences 2 and 3.
print(apple1[0].similarity(apple2[0]))
print(apple1[0].similarity(apple3[0]))

0.7342852
0.433657


In [30]:
# Encode the whole article and one example sentence with the current `nlp` pipeline
# (the BERT model loaded above) so that document-level similarity can be computed.
article_enc = nlp(article)
vader_example = nlp('A popular dialogue from Darth Vader is, Luke I am your father.')

In [31]:
article_enc.similarity(vader_example)

0.7885571923391387

In [43]:
# Flatten the first triple's values into one space-separated statement and score it
# against the encoded article.
test = ' '.join(triples[0].values())
article_enc.similarity(nlp(test))

0.7175568134427756

Check cosine similarity of each triple and eliminate based on threshold:

In [52]:
# Keep only the triples whose flattened statement is similar enough to the article.
triples_filtered = []
threshold = 0.50
for triple in triples:
    # Join the triple's values with spaces to form the statement that gets scored.
    statement = ' '.join(triple.values())
    similarity = article_enc.similarity(nlp(statement))
    # Report every triple with its score, followed by a blank line.
    print(triple, similarity, end='\n\n')
    if similarity > threshold:
        triples_filtered.append(triple)

{'subject': 'Darth Vader', 'relation': 'is', 'object': 'fictional character'} 0.7175568134427756

{'subject': 'fictional character', 'relation': 'is in', 'object': 'Star Wars franchise'} 0.6769007695427789

{'subject': 'Vader', 'relation': 'appears in', 'object': 'film trilogy'} 0.6853778819379464

{'subject': 'character', 'relation': 'was', 'object': 'created'} 0.4450941733865092

{'subject': 'Vader', 'relation': 'appears as', 'object': 'pivotal antagonist'} 0.6134172752533587

{'subject': '.123 Vader', 'relation': 'appears in', 'object': 'original film trilogy'} 0.7367025108877702

{'subject': '.123 Vader', 'relation': 'appears as', 'object': 'pivotal antagonist'} 0.6415943665749564

{'subject': 'character', 'relation': 'been portrayed by', 'object': 'numerous actors'} 0.5961205702483965

{'subject': 'Darth Vader', 'relation': 'also known by', 'object': 'his birth name Anakin Skywalker'} 0.7996625754653531

{'subject': 'Vader', 'relation': 'appears as', 'object': 'antagonist'} 0.6105

{'subject': '100 Heroes Villains', 'relation': 'citing his original motivations for greater good before', 'object': 'his fall side'} 0.7441029364951708

{'subject': '100 Heroes Villains', 'relation': 'citing his original motivations before', 'object': 'his fall side'} 0.716831684026373

{'subject': '100 Heroes Villains', 'relation': 'citing his original motivations before', 'object': 'his fall dark side'} 0.732535583795387

{'subject': '100 Heroes Villains', 'relation': 'citing his original motivations for good before', 'object': 'his fall dark side'} 0.7323033631627923

{'subject': 'He', 'relation': 'is', 'object': 'father'} 0.3904485663537688

{'subject': '100 Heroes Villains', 'relation': 'citing his original motivations for greater good before', 'object': 'his fall dark side'} 0.7531449367336372

{'subject': 'American Film Institute', 'relation': 'listed Hannibal Lecter as', 'object': 'greatest movie villain'} 0.7411301725763856

{'subject': '100 Heroes Villains', 'relation': 'citi

In [53]:
print(len(triples), len(triples_filtered))

89 73


# KG Creation & Exploration

In [79]:
import rdflib

In [80]:
starwars_graph = rdflib.Graph()

In [81]:
# Add each filtered triple to the graph; subject, relation and object are all stored
# as xsd:string literals, in that order.
for triple in triples_filtered:
    starwars_graph.add(tuple(
        rdflib.Literal(triple[key], datatype=rdflib.namespace.XSD.string)
        for key in ('subject', 'relation', 'object')
    ))

In [82]:
# Print every (subject, predicate, object) triple stored in the graph.
for s, p, o in starwars_graph:
    print(s, '->', p, '->', o)

100 Heroes Villains -> citing his original motivations for good before -> his fall side
American Film Institute -> listed -> 100 Heroes
American Film Institute -> listed Hannibal Lecter as -> greatest movie villain
100 Heroes Villains -> citing -> his motivations for greater good
other critics -> consider -> him tragic hero
100 Heroes Villains -> citing his original motivations before -> his fall
fictional character -> is in -> Star Wars franchise
his character -> is referenced in -> Star Wars
Vader -> appears as -> pivotal antagonist
he -> been listed among -> greatest villains
character -> been portrayed by -> numerous actors
iconic villains -> is in -> popular culture
100 Heroes Villains -> citing his original motivations for greater good before -> his fall side
.123 Vader -> appears in -> film trilogy
100 Heroes Villains -> citing his motivations for good before -> his fall dark side
American Film Institute -> listed -> Hannibal Lecter However
third greatest movie villain -> is in 

Sample queries:

In [113]:
# SPARQL: select every triple whose subject equals the string 'Darth Vader'.
query_str = """
    SELECT ?s ?p ?o
    WHERE {   
        ?s ?p ?o.
        FILTER(?s='Darth Vader')
    }
"""

# Run the query against the OpenIE-derived graph and print each matching triple.
res = starwars_graph.query(query_str)
for s,p,o in res:
    print(s, '->', p, '->', o)

Darth Vader -> is fictional character in -> Star Wars franchise
Darth Vader -> is -> fictional character
Darth Vader -> also known by -> his birth name Anakin Skywalker
Darth Vader -> known by -> his birth name Anakin Skywalker


In [114]:
# SPARQL: select every triple whose subject equals the string 'Vader'.
# The filter is an exact match, so triples stored under 'Darth Vader' are not returned.
query_str = """
    SELECT ?s ?p ?o
    WHERE {   
        ?s ?p ?o.
        FILTER(?s='Vader')
    }
"""

# Run the query against the OpenIE-derived graph and print each matching triple.
res = starwars_graph.query(query_str)
for s,p,o in res:
    print(s, '->', p, '->', o)

Vader -> appears as -> pivotal antagonist
Vader -> appears as -> antagonist
Vader -> appears in -> film trilogy
Vader -> appears in -> original film trilogy


In [115]:
# SPARQL: select every triple whose subject is the pronoun 'He'. These triples are
# stored separately from the 'Darth Vader' ones because the subjects are plain strings.
query_str = """
    SELECT ?s ?p ?o
    WHERE {   
        ?s ?p ?o.
        FILTER(?s='He')
    }
"""
# Run the query against the OpenIE-derived graph and print each matching triple.
res = starwars_graph.query(query_str)

for s,p,o in res:
    print(s, '->', p, '->', o)

He -> is character in -> Star Wars
He -> is important character in -> Star Wars
He -> is also father of -> Luke Skywalker
He -> is father of -> Luke Skywalker
He -> is also character in -> Star Wars


### TODO: Need to "contextualize" key entities, i.e. group together mentions such as "He" and "Darth Vader" that refer to the same entity

# starwars.ttl Exploration

In [117]:
# Load the Star Wars Turtle file into a fresh graph. The path is relative to the
# notebook's working directory. `parse` is the last expression, so its return value
# is echoed as the cell output.
graph = rdflib.Graph()
graph.parse('../data/starwars.ttl', format='turtle')

<Graph identifier=Na5101f4fba1e42979f44bfe9db4956bc (<class 'rdflib.graph.Graph'>)>

In [119]:
# Print every triple in the parsed graph. NOTE: this dumps the whole graph, so the
# output is very long; slice the iteration if only a sample is needed.
for s,p,o in graph:
    print(s, '->', p, '->', o)

https://swapi.co/resource/starship/61 -> https://swapi.co/vocabulary/maxAtmospheringSpeed -> 2000
https://swapi.co/resource/planet/41 -> http://www.w3.org/1999/02/22-rdf-syntax-ns#type -> https://swapi.co/vocabulary/Planet
https://swapi.co/resource/starship/17 -> https://swapi.co/vocabulary/consumables -> 6 months
https://swapi.co/resource/human/25 -> https://swapi.co/vocabulary/skinColor -> dark
https://swapi.co/resource/planet/31 -> http://www.w3.org/1999/02/22-rdf-syntax-ns#type -> https://swapi.co/vocabulary/Planet
https://swapi.co/resource/starship/47 -> https://swapi.co/vocabulary/passengers -> 30000
https://swapi.co/resource/starship/17 -> https://swapi.co/vocabulary/crew -> 6
https://swapi.co/resource/vehicle/16 -> http://www.w3.org/1999/02/22-rdf-syntax-ns#type -> https://swapi.co/vocabulary/Vehicle
https://swapi.co/resource/human/51 -> https://swapi.co/vocabulary/film -> https://swapi.co/resource/film/6
https://swapi.co/resource/vehicle/18 -> https://swapi.co/vocabulary/vehic

https://swapi.co/resource/quermian/57 -> http://www.w3.org/1999/02/22-rdf-syntax-ns#type -> https://swapi.co/vocabulary/Character
https://swapi.co/resource/planet/5 -> https://swapi.co/vocabulary/film -> https://swapi.co/resource/film/6
https://swapi.co/vocabulary/Toydarian -> https://swapi.co/vocabulary/film -> https://swapi.co/resource/film/5
https://swapi.co/resource/planet/14 -> https://swapi.co/vocabulary/rotationPeriod -> 26
https://swapi.co/resource/planet/45 -> http://www.w3.org/2000/01/rdf-schema#label -> Iridonia
https://swapi.co/resource/vehicle/24 -> https://swapi.co/vocabulary/model -> Modified Luxury Sail Barge
https://swapi.co/resource/yodasspecies/20 -> http://www.w3.org/1999/02/22-rdf-syntax-ns#type -> https://swapi.co/vocabulary/Character
https://swapi.co/resource/planet/23 -> https://swapi.co/vocabulary/desc -> None
https://swapi.co/resource/human/51 -> https://swapi.co/vocabulary/height -> 188.0
https://swapi.co/resource/planet/27 -> https://swapi.co/vocabulary/desc

https://swapi.co/resource/vehicle/14 -> http://www.w3.org/1999/02/22-rdf-syntax-ns#type -> https://swapi.co/vocabulary/Vehicle
https://swapi.co/resource/planet/15 -> http://www.w3.org/1999/02/22-rdf-syntax-ns#type -> https://swapi.co/vocabulary/Planet
https://swapi.co/resource/vehicle/72 -> https://swapi.co/vocabulary/passengers -> 4
https://swapi.co/resource/planet/13 -> https://swapi.co/vocabulary/film -> https://swapi.co/resource/film/6
https://swapi.co/resource/iktotchi/56 -> https://swapi.co/vocabulary/homeworld -> https://swapi.co/resource/planet/47
https://swapi.co/resource/vehicle/34 -> https://swapi.co/vocabulary/cargoCapacity -> 12000
https://swapi.co/resource/droid/75 -> https://swapi.co/vocabulary/gender -> female
https://swapi.co/resource/planet/59 -> https://swapi.co/vocabulary/climate -> arid, temperate, tropical
https://swapi.co/vocabulary/Moncalamari -> https://swapi.co/vocabulary/skinColor -> brown
https://swapi.co/resource/starship/41 -> https://swapi.co/vocabulary/m

https://swapi.co/resource/planet/40 -> https://swapi.co/vocabulary/terrain -> desert, tundra, rainforests, mountains
https://swapi.co/resource/film/7 -> https://swapi.co/vocabulary/planet -> https://swapi.co/resource/planet/61
https://swapi.co/resource/human/22 -> https://swapi.co/vocabulary/starship -> https://swapi.co/resource/starship/21
https://swapi.co/vocabulary/Trandoshan -> https://swapi.co/vocabulary/film -> https://swapi.co/resource/film/2
https://swapi.co/resource/planet/42 -> https://swapi.co/vocabulary/climate -> temperate
https://swapi.co/resource/planet/32 -> https://swapi.co/vocabulary/surfaceWater -> 40
https://swapi.co/vocabulary/Toydarian -> https://swapi.co/vocabulary/averageHeight -> 120.0
https://swapi.co/resource/human/11 -> http://www.w3.org/1999/02/22-rdf-syntax-ns#type -> https://swapi.co/vocabulary/Human
https://swapi.co/resource/starship/41 -> https://swapi.co/vocabulary/crew -> 1
https://swapi.co/resource/planet/4 -> https://swapi.co/vocabulary/orbitalPerio

https://swapi.co/resource/vehicle/55 -> https://swapi.co/vocabulary/desc -> None
https://swapi.co/resource/planet/31 -> http://www.w3.org/2000/01/rdf-schema#label -> Mon Cala
https://swapi.co/resource/starship/66 -> https://swapi.co/vocabulary/cargoCapacity -> 110
https://swapi.co/resource/vehicle/35 -> http://www.w3.org/1999/02/22-rdf-syntax-ns#type -> https://swapi.co/vocabulary/Vehicle
https://swapi.co/resource/clawdite/70 -> https://swapi.co/vocabulary/homeworld -> https://swapi.co/resource/planet/54
https://swapi.co/resource/vehicle/55 -> https://swapi.co/vocabulary/pilot -> https://swapi.co/resource/human/67
https://swapi.co/vocabulary/Nautolan -> https://swapi.co/vocabulary/skinColor -> green
https://swapi.co/resource/mirialan/65 -> https://swapi.co/vocabulary/homeworld -> https://swapi.co/resource/planet/51
https://swapi.co/resource/skakoan/76 -> https://swapi.co/vocabulary/mass -> 48.0
https://swapi.co/resource/film/4 -> https://swapi.co/vocabulary/vehicle -> https://swapi.co/

https://swapi.co/resource/kaleesh/79 -> https://swapi.co/vocabulary/skinColor -> brown, white
https://swapi.co/resource/film/6 -> https://swapi.co/vocabulary/character -> https://swapi.co/resource/human/5
https://swapi.co/resource/vehicle/70 -> https://swapi.co/vocabulary/film -> https://swapi.co/resource/film/6
https://swapi.co/resource/human/12 -> https://swapi.co/vocabulary/height -> 180.0
https://swapi.co/resource/planet/27 -> https://swapi.co/vocabulary/gravity -> 1 standard
https://swapi.co/vocabulary/Rodian -> http://www.w3.org/2000/01/rdf-schema#subClassOf -> https://swapi.co/vocabulary/Sentient
https://swapi.co/resource/planet/24 -> https://swapi.co/vocabulary/diameter -> 12150
https://swapi.co/resource/vehicle/33 -> https://swapi.co/vocabulary/film -> https://swapi.co/resource/film/6
https://swapi.co/resource/twilek/45 -> https://swapi.co/vocabulary/homeworld -> https://swapi.co/resource/planet/37
https://swapi.co/resource/film/2 -> https://swapi.co/vocabulary/director -> Irv

https://swapi.co/resource/film/1 -> https://swapi.co/vocabulary/character -> https://swapi.co/resource/human/14
https://swapi.co/vocabulary/Hutt -> https://swapi.co/vocabulary/desc -> The Hutts are a fictional alien race in the Star Wars universe. They appear in The Phantom Menace, Return of the Jedi and The Clone Wars, as well as the special edition release of A New Hope. They also appear in various Star Wars games, including those based on the movies, and the Knights of the Old Republic series. None of these are very friendly and all are criminally involved.1 In the comic book series Tales of the Jedi: Golden Age of the Sith and Tales of the Jedi: The Fall of the Sith Empire, however, there is a Hutt character named Aarrba who is sympathetic to the main characters, Gav and Jori Daragon.
https://swapi.co/resource/planet/31 -> https://swapi.co/vocabulary/resident -> https://swapi.co/resource/moncalamari/27
https://swapi.co/vocabulary/Chagrian -> https://swapi.co/vocabulary/character ->

https://swapi.co/vocabulary/Chagrian -> https://swapi.co/vocabulary/film -> https://swapi.co/resource/film/6
https://swapi.co/resource/planet/15 -> https://swapi.co/vocabulary/orbitalPeriod -> 590
https://swapi.co/resource/vehicle/24 -> https://swapi.co/vocabulary/manufacturer -> Ubrikkian Industries Custom Vehicle Division
https://swapi.co/resource/planet/20 -> http://www.w3.org/1999/02/22-rdf-syntax-ns#type -> https://swapi.co/vocabulary/Planet
https://swapi.co/resource/vehicle/26 -> http://www.w3.org/2000/01/rdf-schema#label -> TIE/IN interceptor
https://swapi.co/resource/starship/5 -> https://swapi.co/vocabulary/mglt -> 70
https://swapi.co/resource/planet/8 -> https://swapi.co/vocabulary/climate -> temperate
https://swapi.co/resource/film/6 -> https://swapi.co/vocabulary/planet -> https://swapi.co/resource/planet/19
https://swapi.co/resource/film/2 -> https://swapi.co/vocabulary/openingCrawl -> It is a dark time for the
Rebellion. Although the Death
Star has been destroyed,
I

https://swapi.co/resource/film/2 -> https://swapi.co/vocabulary/character -> https://swapi.co/resource/human/5
https://swapi.co/vocabulary/Skakoan -> http://www.w3.org/1999/02/22-rdf-syntax-ns#type -> https://swapi.co/vocabulary/Species
https://swapi.co/vocabulary/Kaleesh -> https://swapi.co/vocabulary/averageHeight -> 170.0
https://swapi.co/resource/human/61 -> https://swapi.co/vocabulary/eyeColor -> brown
https://swapi.co/resource/starship/3 -> https://swapi.co/vocabulary/costInCredits -> 150000000
https://swapi.co/resource/planet/43 -> http://www.w3.org/2000/01/rdf-schema#label -> Cerea
https://swapi.co/resource/planet/17 -> https://swapi.co/vocabulary/terrain -> fungus forests
https://swapi.co/resource/vehicle/54 -> https://swapi.co/vocabulary/manufacturer -> Rothana Heavy Engineering
https://swapi.co/resource/besalisk/71 -> http://www.w3.org/2000/01/rdf-schema#label -> Dexter Jettster
https://swapi.co/resource/droid/8 -> http://www.w3.org/1999/02/22-rdf-syntax-ns#type -> https://s

https://swapi.co/resource/vehicle/4 -> https://swapi.co/vocabulary/vehicleClass -> wheeled
https://swapi.co/resource/human/34 -> https://swapi.co/vocabulary/film -> https://swapi.co/resource/film/4
https://swapi.co/resource/starship/27 -> https://swapi.co/vocabulary/mglt -> 60
https://swapi.co/resource/twilek/46 -> http://www.w3.org/1999/02/22-rdf-syntax-ns#type -> https://swapi.co/vocabulary/Twilek
https://swapi.co/resource/starship/27 -> https://swapi.co/vocabulary/crew -> 5400
https://swapi.co/resource/aleena/47 -> https://swapi.co/vocabulary/hairColor -> none
https://swapi.co/resource/starship/5 -> https://swapi.co/vocabulary/manufacturer -> Sienar Fleet Systems, Cyngus Spaceworks
https://swapi.co/resource/toong/50 -> https://swapi.co/vocabulary/eyeColor -> orange
https://swapi.co/resource/gungan/36 -> http://www.w3.org/1999/02/22-rdf-syntax-ns#type -> https://swapi.co/vocabulary/Character
https://swapi.co/resource/planet/60 -> https://swapi.co/vocabulary/desc -> None
https://swapi

https://swapi.co/resource/starship/41 -> https://swapi.co/vocabulary/hyperdriveRating -> 1.5
https://swapi.co/resource/planet/47 -> https://swapi.co/vocabulary/rotationPeriod -> 22
https://swapi.co/resource/human/34 -> http://www.w3.org/1999/02/22-rdf-syntax-ns#type -> https://swapi.co/vocabulary/Character
https://swapi.co/resource/mirialan/64 -> https://swapi.co/vocabulary/film -> https://swapi.co/resource/film/6
https://swapi.co/resource/film/3 -> https://swapi.co/vocabulary/vehicle -> https://swapi.co/resource/vehicle/8
https://swapi.co/resource/planet/3 -> https://swapi.co/vocabulary/film -> https://swapi.co/resource/film/1
https://swapi.co/resource/xexto/49 -> https://swapi.co/vocabulary/homeworld -> https://swapi.co/resource/planet/40
https://swapi.co/vocabulary/Yodasspecies -> http://www.w3.org/2000/01/rdf-schema#subClassOf -> https://swapi.co/vocabulary/Sentient
https://swapi.co/resource/human/34 -> http://www.w3.org/2000/01/rdf-schema#label -> Finis Valorum
https://swapi.co/re

https://swapi.co/resource/planet/3 -> https://swapi.co/vocabulary/altDesc -> Yavin (also known as Yavin Prime) is a fictional planet in the Star Wars universe. It first appeared in the 1977 film Star Wars and is depicted as a large red gas giant with an extensive satellite system of moons. Within the Star Wars narrative, Yavin is noted as the hidden military base of the Rebel Alliance located on its fourth moon, known as Yavin\xa0IV., The climactic space battle at the end of the film, in which the Rebel Alliance destroys the Death Star, takes place in orbit around the planet Yavin. In Star Wars fandom and the Star Wars expanded universe, this event is especially significant as it is used to mark an epoch in the fictional Star Wars universe. Events in Star Wars stories are typically dated in terms of years BBY (Before the Battle of Yavin) or ABY (After the Battle of Yavin).12
https://swapi.co/resource/planet/20 -> https://swapi.co/vocabulary/gravity -> 1 standard
https://swapi.co/resour

# Parse articles from Wikipedia and construct KG

https://stackabuse.com/getting-started-with-pythons-wikipedia-api/

In [127]:
import wikipedia

In [133]:
wikipedia.search('Millennium Falcon')

['Millennium Falcon',
 'Star Wars: Millennium Falcon – Smugglers Run',
 'Millennium Falcon (novel)',
 'Chewbacca',
 'Falcon 9',
 'Berserk: Millennium Falcon Hen Seima Senki no Shō',
 'Lando Calrissian',
 'Solo: A Star Wars Story',
 'Han Solo',
 "Star Wars: Galaxy's Edge"]

In [134]:
wikipedia.summary('Millennium Falcon')

"The Millennium Falcon is a fictional starship in the Star Wars franchise. Designed by Joe Johnston for the movie Star Wars (1977), it has subsequently appeared in The Star Wars Holiday Special (1978), The Empire Strikes Back (1980), Return of the Jedi (1983), The Force Awakens (2015), The Last Jedi (2017), Solo: A Star Wars Story (2018), and The Rise of Skywalker (2019). The starship, or a similar one, also has a cameo in Revenge of the Sith (2005). Additionally, the Falcon appears in a variety of Star Wars expanded universe materials, including books, comics, and games; James Luceno's novel Millennium Falcon focuses on the titular ship. It also appears in the 2014 animated film The Lego Movie in Lego form, with Billy Dee Williams and Anthony Daniels reprising their roles of Lando Calrissian and C-3PO, and Keith Ferguson voicing Han Solo.\nSolo: A Star Wars Story depicts the YT-1300 Corellian light freighter being embodied by L3-37 (Phoebe Waller-Bridge). The ship is primarily command

In [136]:
wikipedia.page('Millennium Falcon').content

'The Millennium Falcon is a fictional starship in the Star Wars franchise. Designed by Joe Johnston for the movie Star Wars (1977), it has subsequently appeared in The Star Wars Holiday Special (1978), The Empire Strikes Back (1980), Return of the Jedi (1983), The Force Awakens (2015), The Last Jedi (2017), Solo: A Star Wars Story (2018), and The Rise of Skywalker (2019). The starship, or a similar one, also has a cameo in Revenge of the Sith (2005). Additionally, the Falcon appears in a variety of Star Wars expanded universe materials, including books, comics, and games; James Luceno\'s novel Millennium Falcon focuses on the titular ship. It also appears in the 2014 animated film The Lego Movie in Lego form, with Billy Dee Williams and Anthony Daniels reprising their roles of Lando Calrissian and C-3PO, and Keith Ferguson voicing Han Solo.\nSolo: A Star Wars Story depicts the YT-1300 Corellian light freighter being embodied by L3-37 (Phoebe Waller-Bridge). The ship is primarily comman

In [137]:
# Keep the full article text for graph construction.
# NOTE: this repeats the wikipedia.page(...) call from the previous cell rather than
# reusing its result.
mfalcon_article = wikipedia.page('Millennium Falcon').content

In [146]:
def text_to_graph(article, similarity_threshold):
    """Build an RDF graph from the OpenIE triples extracted from ``article``.

    The text is annotated with Stanford OpenIE, each resulting triple is flattened
    into a space-separated statement, and only triples whose statement similarity to
    the whole article is strictly greater than ``similarity_threshold`` are kept.
    The triple counts before and after filtering are printed.

    Args:
        article: Text to extract triples from.
        similarity_threshold: Minimum similarity (exclusive) between a triple's
            statement and the article for the triple to be kept.

    Returns:
        An ``rdflib.Graph`` in which the subject, relation and object of every kept
        triple are stored as ``xsd:string`` literals.
    """
    # Convert to triples
    with StanfordOpenIE() as client:
        triples = list(client.annotate(article))
    print(f'Num of Triples: {len(triples)}')

    # Load BERT (a fresh pipeline is loaded on every call)
    nlp = en_trf_bertbaseuncased_lg.load()
    article_enc = nlp(article)

    # Similarity Thresholding
    triples_filtered = []
    for triple in triples:
        statement = ' '.join(triple.values())
        similarity = article_enc.similarity(nlp(statement))
        if similarity > similarity_threshold:
            triples_filtered.append(triple)
    print(f'Filtered Triples: {len(triples_filtered)}')

    # TODO: add a contextualization step here

    # Create RDF graph.
    # `Graph` is never imported on its own in this notebook (only `import rdflib`),
    # so it must be referenced through the module to work on a fresh kernel.
    graph = rdflib.Graph()
    for triple in triples_filtered:
        graph.add((
            rdflib.Literal(triple['subject'], datatype=rdflib.namespace.XSD.string),
            rdflib.Literal(triple['relation'], datatype=rdflib.namespace.XSD.string),
            rdflib.Literal(triple['object'], datatype=rdflib.namespace.XSD.string)
        ))

    return graph
    

In [147]:
# Build the Millennium Falcon graph, keeping only triples scoring above 0.75.
# NOTE: this rebinds `graph`, which previously held the parsed starwars.ttl graph.
graph = text_to_graph(article=mfalcon_article, similarity_threshold=0.75)

Starting server with command: java -Xmx8G -cp /Users/rhythmsyed/stanfordnlp_resources/stanford-corenlp-full-2018-10-05/* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 60000 -threads 5 -maxCharLength 100000 -quiet True -serverProperties corenlp_server-712f785371b34522.props -preload openie
Num of Triples: 1008
Filtered Triples: 65


In [149]:
# SPARQL: select every triple in the graph (no filter).
query_str = """
    SELECT ?s ?p ?o
    WHERE {   
        ?s ?p ?o.
    }
"""
# Run the query against the Millennium Falcon graph and print each triple.
res = graph.query(query_str)

for s,p,o in res:
    print(s, '->', p, '->', o)

Star Wars Story -> depicts -> YT-1300 light freighter
original model -> used as -> Princess Leia 's ship
Chewbacca -> take possession after -> Solo 's hibernation
Hot Wheels playset.Lego -> has released -> multiple versions of Millennium Falcon
Calrissian captains -> is with -> Nien Nunb as co-pilot
Hot Wheels playset.Lego -> has released -> versions of Millennium Falcon
Lego -> released -> Millennium Falcon set in Star Wars Ultimate Collectors Series
it -> has subsequently appeared in -> Return of Jedi
Obi-Wan Kenobi -> charter ship In -> Star Wars
Chewbacca -> Back take possession after -> Solo 's hibernation by Empire
Creating Lando -> has -> Millennium Falcon for Solo Behind Magic of Kessel Run Millennium Falcon on Wookieepedia
James Luceno 's Millennium Falcon -> focuses on -> ship
Chewbacca -> Back take possession after -> Solo 's hibernation in carbonite by Empire
Star Wars Story -> depicts -> YT-1300 freighter
version -> appears -> briefly on Coruscant in Revenge of Sith
update