In [1]:
import spacy
import pandas as pd
from spacy import displacy
import en_core_web_lg
from pathlib import Path

In [2]:
#! python -m spacy download en_trf_bertbaseuncased_lg

In [3]:
nlp = en_core_web_lg.load()

# Parsing & Part-of-Speech Tags

<b>Text</b>: The original word text.<br>
<b>Lemma</b>: The base form of the word.<br>
<b>POS</b>: The simple UPOS part-of-speech tag.<br>
<b>Tag</b>: The detailed part-of-speech tag.<br>
<b>Dep</b>: Syntactic dependency, i.e. the relation between tokens.<br>
<b>Shape</b>: The word shape – capitalization, punctuation, digits.<br>
<b>is_alpha</b>: Is the token an alpha character?<br>
<b>is_stop</b>: Is the token part of a stop list, i.e. the most common words of the language?<br>

In [4]:
doc = nlp('Darth Vader is also known by his birth name Anakin Skywalker.')
results = pd.DataFrame(columns=['Text', 'Lemma', 'POS', 'Tag', 'Dep', 'Shape', 'is_alpha', 'is_stop'])

for token in doc:  
    results = results.append({'Text':token.text, 'Lemma':token.lemma_, 'POS':token.pos_, 'Tag':token.tag_, 'Dep':token.dep_, 'Shape':token.shape_, 'is_alpha':token.is_alpha, 'is_stop':token.is_stop}, ignore_index=True)
results

Unnamed: 0,Text,Lemma,POS,Tag,Dep,Shape,is_alpha,is_stop
0,Darth,Darth,PROPN,NNP,compound,Xxxxx,True,False
1,Vader,Vader,PROPN,NNP,nsubjpass,Xxxxx,True,False
2,is,be,AUX,VBZ,auxpass,xx,True,True
3,also,also,ADV,RB,advmod,xxxx,True,True
4,known,know,VERB,VBN,ROOT,xxxx,True,False
5,by,by,ADP,IN,prep,xx,True,True
6,his,-PRON-,DET,PRP$,poss,xxx,True,True
7,birth,birth,NOUN,NN,compound,xxxx,True,False
8,name,name,NOUN,NN,pobj,xxxx,True,True
9,Anakin,Anakin,PROPN,NNP,compound,Xxxxx,True,False


In [5]:
svg = displacy.render(doc, style="dep",jupyter=True)
#output_path = Path("dep.svg")
#output_path.open("w", encoding="utf-8").write(svg)

# Named Entities (NER)

<b>Text</b>: The original entity text.<br>
<b>Start</b>: Index of start of entity in the Doc.<br>
<b>End</b>: Index of end of entity in the Doc.<br>
<b>Label</b>: Entity label, i.e. type.<br>

In [6]:
doc = nlp('Darth Vader is also known by his birth name Anakin Skywalker.')
results = pd.DataFrame(columns=['Text', 'Start', 'End', 'Label'])

for ent in doc.ents:  
    results = results.append({'Text':ent.text, 'Start':ent.start_char, 'End':ent.end_char, 'Label':ent.label_}, ignore_index=True)
results

Unnamed: 0,Text,Start,End,Label
0,Darth Vader,0,11,PERSON
1,Anakin Skywalker,44,60,PERSON


In [7]:
displacy.render(doc, style="ent")

# Larger Text Example

In [8]:
article = 'Darth Vader, also known by his birth name Anakin Skywalker, is a fictional character in the Star Wars franchise.123 Vader appears in the original film trilogy as a pivotal antagonist whose actions drive the plot, while his past as Anakin Skywalker and the story of his corruption are central to the narrative of the prequel trilogy., The character was created by George Lucas and has been portrayed by numerous actors. His appearances span the first six Star Wars films, as well as Rogue One, and his character is heavily referenced in Star Wars: The Force Awakens. He is also an important character in the Star Wars expanded universe of television series, video games, novels, literature and comic books. Originally a Jedi prophesied to bring balance to the Force, he falls to the dark side of the Force and serves the evil Galactic Empire at the right hand of his Sith master, Emperor Palpatine (also known as Darth Sidious).4 He is also the father of Luke Skywalker and Princess Leia Organa, secret husband of Padmé Amidala and grandfather of Kylo Ren., Darth Vader has become one of the most iconic villains in popular culture, and has been listed among the greatest villains and fictional characters ever.56 The American Film Institute listed him as the third greatest movie villain in cinema history on 100 Years... 100 Heroes and Villains, behind Hannibal Lecter and Norman Bates.7 However, other critics consider him a tragic hero, citing his original motivations for the greater good before his fall to the dark side.'
article

'Darth Vader, also known by his birth name Anakin Skywalker, is a fictional character in the Star Wars franchise.123 Vader appears in the original film trilogy as a pivotal antagonist whose actions drive the plot, while his past as Anakin Skywalker and the story of his corruption are central to the narrative of the prequel trilogy., The character was created by George Lucas and has been portrayed by numerous actors. His appearances span the first six Star Wars films, as well as Rogue One, and his character is heavily referenced in Star Wars: The Force Awakens. He is also an important character in the Star Wars expanded universe of television series, video games, novels, literature and comic books. Originally a Jedi prophesied to bring balance to the Force, he falls to the dark side of the Force and serves the evil Galactic Empire at the right hand of his Sith master, Emperor Palpatine (also known as Darth Sidious).4 He is also the father of Luke Skywalker and Princess Leia Organa, secr

In [9]:
doc = nlp(article)
results = pd.DataFrame(columns=['Text', 'Lemma', 'POS', 'Tag', 'Dep', 'Shape', 'is_alpha', 'is_stop'])

for token in doc:  
    results = results.append({'Text':token.text, 'Lemma':token.lemma_, 'POS':token.pos_, 'Tag':token.tag_, 'Dep':token.dep_, 'Shape':token.shape_, 'is_alpha':token.is_alpha, 'is_stop':token.is_stop}, ignore_index=True)
results

Unnamed: 0,Text,Lemma,POS,Tag,Dep,Shape,is_alpha,is_stop
0,Darth,Darth,PROPN,NNP,compound,Xxxxx,True,False
1,Vader,Vader,PROPN,NNP,nsubj,Xxxxx,True,False
2,",",",",PUNCT,",",punct,",",False,False
3,also,also,ADV,RB,advmod,xxxx,True,True
4,known,know,VERB,VBN,acl,xxxx,True,False
...,...,...,...,...,...,...,...,...
277,to,to,ADP,IN,prep,xx,True,True
278,the,the,DET,DT,det,xxx,True,True
279,dark,dark,ADJ,JJ,amod,xxxx,True,False
280,side,side,NOUN,NN,pobj,xxxx,True,True


In [10]:
#displacy.render(doc, style="dep", jupyter=True)

# Stanford Open Information Extraction using CoreNLP

https://nlp.stanford.edu/software/openie.html

Extracting {entity1, relation, entity2} triples:

In [11]:
from openie import StanfordOpenIE

In [12]:
with StanfordOpenIE() as client:
    text = 'Darth Vader is also known by his birth name Anakin Skywalker.'
    print('Text: %s.' % text)
    
    for triple in client.annotate(text):
        print(triple)

Text: Darth Vader is also known by his birth name Anakin Skywalker..
Starting server with command: java -Xmx8G -cp /Users/rhythmsyed/stanfordnlp_resources/stanford-corenlp-full-2018-10-05/* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 60000 -threads 5 -maxCharLength 100000 -quiet True -serverProperties corenlp_server-c22e1aaae0714918.props -preload openie
{'subject': 'Darth Vader', 'relation': 'is known by', 'object': 'his birth name Anakin Skywalker'}
{'subject': 'Darth Vader', 'relation': 'is', 'object': 'also known'}
{'subject': 'Darth Vader', 'relation': 'is', 'object': 'known'}
{'subject': 'Darth Vader', 'relation': 'is also known by', 'object': 'his birth name Anakin Skywalker'}


Trying longer document text:

In [13]:
triples = []
with StanfordOpenIE() as client:
    print('Text: %s.' % article)
    for triple in client.annotate(article):
        triples.append(triple)

Text: Darth Vader, also known by his birth name Anakin Skywalker, is a fictional character in the Star Wars franchise.123 Vader appears in the original film trilogy as a pivotal antagonist whose actions drive the plot, while his past as Anakin Skywalker and the story of his corruption are central to the narrative of the prequel trilogy., The character was created by George Lucas and has been portrayed by numerous actors. His appearances span the first six Star Wars films, as well as Rogue One, and his character is heavily referenced in Star Wars: The Force Awakens. He is also an important character in the Star Wars expanded universe of television series, video games, novels, literature and comic books. Originally a Jedi prophesied to bring balance to the Force, he falls to the dark side of the Force and serves the evil Galactic Empire at the right hand of his Sith master, Emperor Palpatine (also known as Darth Sidious).4 He is also the father of Luke Skywalker and Princess Leia Organa,

In [14]:
triples[:3]

[{'subject': 'Darth Vader', 'relation': 'is', 'object': 'fictional character'},
 {'subject': 'fictional character',
  'relation': 'is in',
  'object': 'Star Wars franchise'},
 {'subject': 'Vader', 'relation': 'appears in', 'object': 'film trilogy'}]

In [15]:
print(f'Number of Entity-Relation triples: {len(triples)}')

Number of Entity-Relation triples: 89


# Encoding using BERT

https://explosion.ai/blog/spacy-transformers

In [16]:
import spacy
import en_trf_bertbaseuncased_lg

nlp = en_trf_bertbaseuncased_lg.load()

Use word-level embeddings from BERT to add context to tokens:

In [17]:
apple1 = nlp("Apple shares rose on the news.")
apple2 = nlp("Apple sold fewer iPhones this quarter.")
apple3 = nlp("Apple pie is delicious.")

In [18]:
print(apple1[0].similarity(apple2[0]))
print(apple1[0].similarity(apple3[0]))

0.7342852
0.433657


In [19]:
article_enc = nlp(article)
vader_example = nlp('A popular dialogue from Darth Vader is, Luke I am your father.')

In [20]:
article_enc.similarity(vader_example)

0.7885571923391387

In [21]:
test = ' '.join(x for x in triples[0].values())
article_enc.similarity(nlp(test))

0.7175568134427756

Check cosine similarity of each triple and original article and eliminate based on threshold:

In [22]:
triples_filtered = []
threshold = 0.50
for triple in triples:
    statement = ' '.join(x for x in triple.values())
    similarity = article_enc.similarity(nlp(statement))
    if similarity > threshold:
        triples_filtered.append(triple)
    print(triple, similarity)
    print()

{'subject': 'Darth Vader', 'relation': 'is', 'object': 'fictional character'} 0.7175568134427756

{'subject': 'fictional character', 'relation': 'is in', 'object': 'Star Wars franchise'} 0.6769007695427789

{'subject': 'Vader', 'relation': 'appears in', 'object': 'film trilogy'} 0.6853778819379464

{'subject': 'character', 'relation': 'was', 'object': 'created'} 0.4450941733865092

{'subject': 'Vader', 'relation': 'appears as', 'object': 'pivotal antagonist'} 0.6134172752533587

{'subject': '.123 Vader', 'relation': 'appears in', 'object': 'original film trilogy'} 0.7367025108877702

{'subject': '.123 Vader', 'relation': 'appears as', 'object': 'pivotal antagonist'} 0.6415943665749564

{'subject': 'character', 'relation': 'been portrayed by', 'object': 'numerous actors'} 0.5961205702483965

{'subject': 'Darth Vader', 'relation': 'also known by', 'object': 'his birth name Anakin Skywalker'} 0.7996625754653531

{'subject': 'Vader', 'relation': 'appears as', 'object': 'antagonist'} 0.6105

{'subject': '100 Heroes Villains', 'relation': 'citing his original motivations for greater good before', 'object': 'his fall side'} 0.7441029364951708

{'subject': '100 Heroes Villains', 'relation': 'citing his original motivations before', 'object': 'his fall side'} 0.716831684026373

{'subject': '100 Heroes Villains', 'relation': 'citing his original motivations before', 'object': 'his fall dark side'} 0.732535583795387

{'subject': '100 Heroes Villains', 'relation': 'citing his original motivations for good before', 'object': 'his fall dark side'} 0.7323033631627923

{'subject': 'He', 'relation': 'is', 'object': 'father'} 0.3904485663537688

{'subject': '100 Heroes Villains', 'relation': 'citing his original motivations for greater good before', 'object': 'his fall dark side'} 0.7531449367336372

{'subject': 'American Film Institute', 'relation': 'listed Hannibal Lecter as', 'object': 'greatest movie villain'} 0.7411301725763856

{'subject': '100 Heroes Villains', 'relation': 'citi

In [23]:
print(len(triples), len(triples_filtered))

89 73


# KG Creation & Exploration

In [24]:
import rdflib

In [25]:
starwars_graph = rdflib.Graph()

In [26]:
for triple in triples_filtered:
    starwars_graph.add((
        rdflib.Literal(triple['subject'], datatype=rdflib.namespace.XSD.string),
        rdflib.Literal(triple['relation'], datatype=rdflib.namespace.XSD.string),
        rdflib.Literal(triple['object'], datatype=rdflib.namespace.XSD.string)
    ))

In [27]:
for s, p, o in starwars_graph:
    print(s, '->', p, '->', o)

Darth Vader -> also known by -> his birth name Anakin Skywalker
his character -> is heavily referenced in -> Star Wars
Vader -> appears as -> pivotal antagonist
100 Heroes Villains -> citing his original motivations for greater good before -> his fall side
his past -> are central to -> narrative of prequel trilogy.
100 Heroes Villains -> citing his original motivations for good before -> his fall
He -> is also father of -> Luke Skywalker
Vader -> appears as -> antagonist
His appearances -> span -> six Star Wars films
100 Heroes Villains -> citing his motivations for greater good before -> his fall
100 Heroes Villains -> citing his original motivations before -> his fall dark side
100 Heroes Villains -> citing his original motivations for good before -> his fall side
100 Heroes Villains -> citing his motivations for greater good before -> his fall side
third greatest movie villain -> is in -> cinema history
He -> is father of -> Luke Skywalker
.123 Vader -> appears in -> original film t

Sample queries:

In [28]:
query_str = """
    SELECT ?s ?p ?o
    WHERE {   
        ?s ?p ?o.
        FILTER(?s='Darth Vader')
    }
"""

res = starwars_graph.query(query_str)
for s,p,o in res:
    print(s, '->', p, '->', o)

Darth Vader -> also known by -> his birth name Anakin Skywalker
Darth Vader -> is -> fictional character
Darth Vader -> known by -> his birth name Anakin Skywalker
Darth Vader -> is fictional character in -> Star Wars franchise


In [29]:
query_str = """
    SELECT ?s ?p ?o
    WHERE {   
        ?s ?p ?o.
        FILTER(?s='Vader')
    }
"""

res = starwars_graph.query(query_str)
for s,p,o in res:
    print(s, '->', p, '->', o)

Vader -> appears as -> pivotal antagonist
Vader -> appears as -> antagonist
Vader -> appears in -> film trilogy
Vader -> appears in -> original film trilogy


In [30]:
query_str = """
    SELECT ?s ?p ?o
    WHERE {   
        ?s ?p ?o.
        FILTER(?s='He')
    }
"""
res = starwars_graph.query(query_str)

for s,p,o in res:
    print(s, '->', p, '->', o)

He -> is also father of -> Luke Skywalker
He -> is father of -> Luke Skywalker
He -> is also character in -> Star Wars
He -> is important character in -> Star Wars
He -> is character in -> Star Wars


### TODO: Need to "contexualize" key entities that group together "He" and "Darth Vader" for example

# starwars.ttl Exploration

In [31]:
graph = rdflib.Graph()
graph.parse('../data/starwars.ttl', format='turtle')

<Graph identifier=Nbdcd64e68f004f6ca1674bf5c1961216 (<class 'rdflib.graph.Graph'>)>

In [32]:
query_str = """
    SELECT ?s ?p ?o
    WHERE {   
        ?s ?p ?o.
    }
    LIMIT 10
"""
res = starwars_graph.query(query_str)

for s,p,o in res:
    print(s, '->', p, '->', o)

Darth Vader -> also known by -> his birth name Anakin Skywalker
his character -> is heavily referenced in -> Star Wars
Vader -> appears as -> pivotal antagonist
100 Heroes Villains -> citing his original motivations for greater good before -> his fall side
his past -> are central to -> narrative of prequel trilogy.
100 Heroes Villains -> citing his original motivations for good before -> his fall
He -> is also father of -> Luke Skywalker
Vader -> appears as -> antagonist
His appearances -> span -> six Star Wars films
100 Heroes Villains -> citing his motivations for greater good before -> his fall


# Parse articles from Wikipedia and construct KG

https://stackabuse.com/getting-started-with-pythons-wikipedia-api/

In [127]:
import wikipedia

In [133]:
wikipedia.search('Millennium Falcon')

['Millennium Falcon',
 'Star Wars: Millennium Falcon – Smugglers Run',
 'Millennium Falcon (novel)',
 'Chewbacca',
 'Falcon 9',
 'Berserk: Millennium Falcon Hen Seima Senki no Shō',
 'Lando Calrissian',
 'Solo: A Star Wars Story',
 'Han Solo',
 "Star Wars: Galaxy's Edge"]

In [134]:
wikipedia.summary('Millennium Falcon')

"The Millennium Falcon is a fictional starship in the Star Wars franchise. Designed by Joe Johnston for the movie Star Wars (1977), it has subsequently appeared in The Star Wars Holiday Special (1978), The Empire Strikes Back (1980), Return of the Jedi (1983), The Force Awakens (2015), The Last Jedi (2017), Solo: A Star Wars Story (2018), and The Rise of Skywalker (2019). The starship, or a similar one, also has a cameo in Revenge of the Sith (2005). Additionally, the Falcon appears in a variety of Star Wars expanded universe materials, including books, comics, and games; James Luceno's novel Millennium Falcon focuses on the titular ship. It also appears in the 2014 animated film The Lego Movie in Lego form, with Billy Dee Williams and Anthony Daniels reprising their roles of Lando Calrissian and C-3PO, and Keith Ferguson voicing Han Solo.\nSolo: A Star Wars Story depicts the YT-1300 Corellian light freighter being embodied by L3-37 (Phoebe Waller-Bridge). The ship is primarily command

In [136]:
wikipedia.page('Millennium Falcon').content

'The Millennium Falcon is a fictional starship in the Star Wars franchise. Designed by Joe Johnston for the movie Star Wars (1977), it has subsequently appeared in The Star Wars Holiday Special (1978), The Empire Strikes Back (1980), Return of the Jedi (1983), The Force Awakens (2015), The Last Jedi (2017), Solo: A Star Wars Story (2018), and The Rise of Skywalker (2019). The starship, or a similar one, also has a cameo in Revenge of the Sith (2005). Additionally, the Falcon appears in a variety of Star Wars expanded universe materials, including books, comics, and games; James Luceno\'s novel Millennium Falcon focuses on the titular ship. It also appears in the 2014 animated film The Lego Movie in Lego form, with Billy Dee Williams and Anthony Daniels reprising their roles of Lando Calrissian and C-3PO, and Keith Ferguson voicing Han Solo.\nSolo: A Star Wars Story depicts the YT-1300 Corellian light freighter being embodied by L3-37 (Phoebe Waller-Bridge). The ship is primarily comman

In [137]:
mfalcon_article = wikipedia.page('Millennium Falcon').content

In [146]:
def text_to_graph(article, similarity_threshold):
    # Convert to triples
    triples = []
    with StanfordOpenIE() as client:
        for triple in client.annotate(article):
            triples.append(triple)
    print(f'Num of Triples: {len(triples)}')
    
    # Load BERT
    nlp = en_trf_bertbaseuncased_lg.load()
    article_enc = nlp(article)
    
    # Similarity Thresholding
    triples_filtered = []
    for triple in triples:
        statement = ' '.join(x for x in triple.values())
        similarity = article_enc.similarity(nlp(statement))
        if similarity > similarity_threshold:
            triples_filtered.append(triple)
    print(f'Filtered Triples: {len(triples_filtered)}')
    
    # Need to add step here for contexualization
    
    # Create RDF graph
    graph = Graph()
    for triple in triples_filtered:
        graph.add((
            rdflib.Literal(triple['subject'], datatype=rdflib.namespace.XSD.string),
            rdflib.Literal(triple['relation'], datatype=rdflib.namespace.XSD.string),
            rdflib.Literal(triple['object'], datatype=rdflib.namespace.XSD.string)
        ))
    
    return graph
    

In [147]:
graph = text_to_graph(article=mfalcon_article, similarity_threshold=0.75)

Starting server with command: java -Xmx8G -cp /Users/rhythmsyed/stanfordnlp_resources/stanford-corenlp-full-2018-10-05/* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 60000 -threads 5 -maxCharLength 100000 -quiet True -serverProperties corenlp_server-712f785371b34522.props -preload openie
Num of Triples: 1008
Filtered Triples: 65


In [149]:
query_str = """
    SELECT ?s ?p ?o
    WHERE {   
        ?s ?p ?o.
    }
"""
res = graph.query(query_str)

for s,p,o in res:
    print(s, '->', p, '->', o)

Star Wars Story -> depicts -> YT-1300 light freighter
original model -> used as -> Princess Leia 's ship
Chewbacca -> take possession after -> Solo 's hibernation
Hot Wheels playset.Lego -> has released -> multiple versions of Millennium Falcon
Calrissian captains -> is with -> Nien Nunb as co-pilot
Hot Wheels playset.Lego -> has released -> versions of Millennium Falcon
Lego -> released -> Millennium Falcon set in Star Wars Ultimate Collectors Series
it -> has subsequently appeared in -> Return of Jedi
Obi-Wan Kenobi -> charter ship In -> Star Wars
Chewbacca -> Back take possession after -> Solo 's hibernation by Empire
Creating Lando -> has -> Millennium Falcon for Solo Behind Magic of Kessel Run Millennium Falcon on Wookieepedia
James Luceno 's Millennium Falcon -> focuses on -> ship
Chewbacca -> Back take possession after -> Solo 's hibernation in carbonite by Empire
Star Wars Story -> depicts -> YT-1300 freighter
version -> appears -> briefly on Coruscant in Revenge of Sith
update