In [1]:
import spacy
import pandas as pd
import en_core_web_lg
from stanfordcorenlp import StanfordCoreNLP
import json
from collections import defaultdict
import nltk

<img src="diagram.png" style="height: 50px; width: 1000px;"/>

In [2]:
starwars_text = 'Darth Vader, also known by his birth name Anakin Skywalker, is a fictional character in the Star Wars franchise.123 Vader appears in the original film trilogy as a pivotal antagonist whose actions drive the plot, while his past as Anakin Skywalker and the story of his corruption are central to the narrative of the prequel trilogy., The character was created by George Lucas and has been portrayed by numerous actors. His appearances span the first six Star Wars films, as well as Rogue One, and his character is heavily referenced in Star Wars: The Force Awakens. He is also an important character in the Star Wars expanded universe of television series, video games, novels, literature and comic books. Originally a Jedi prophesied to bring balance to the Force, he falls to the dark side of the Force and serves the evil Galactic Empire at the right hand of his Sith master, Emperor Palpatine (also known as Darth Sidious).'
starwars_text

'Darth Vader, also known by his birth name Anakin Skywalker, is a fictional character in the Star Wars franchise.123 Vader appears in the original film trilogy as a pivotal antagonist whose actions drive the plot, while his past as Anakin Skywalker and the story of his corruption are central to the narrative of the prequel trilogy., The character was created by George Lucas and has been portrayed by numerous actors. His appearances span the first six Star Wars films, as well as Rogue One, and his character is heavily referenced in Star Wars: The Force Awakens. He is also an important character in the Star Wars expanded universe of television series, video games, novels, literature and comic books. Originally a Jedi prophesied to bring balance to the Force, he falls to the dark side of the Force and serves the evil Galactic Empire at the right hand of his Sith master, Emperor Palpatine (also known as Darth Sidious).'

# Spacy NER

In [3]:
nlp = en_core_web_lg.load()
doc = nlp(starwars_text)

ner_dict = {}
for x in doc.ents:
    ner_dict[x.text] = x.label_
ner_dict

{'Darth Vader': 'PERSON',
 'Anakin Skywalker': 'PERSON',
 'the Star Wars': 'WORK_OF_ART',
 'franchise.123 Vader': 'PERSON',
 'George Lucas': 'PERSON',
 'the first': 'DATE',
 'six': 'CARDINAL',
 'Star Wars': 'WORK_OF_ART',
 'Rogue One': 'PRODUCT',
 'Jedi': 'PERSON',
 'Force': 'ORG',
 'Sith': 'PRODUCT',
 'Palpatine': 'PERSON',
 'Darth Sidious': 'PERSON'}

# Stanford NER

In [4]:
# # NLTK
# # sentences = nltk.sent_tokenize(starwars_text)

# # spaCy
# nlp = spacy.lang.en.English()
# nlp.add_pipe(nlp.create_pipe('sentencizer'))
# doc = nlp(starwars_text)
# sentences = [sent.string.strip() for sent in doc.sents]

# ner_tagger = nltk.tag.StanfordNERTagger("./stanford-ner-2018-10-16/classifiers/english.all.3class.distsim.crf.ser.gz", "./stanford-ner-2018-10-16/stanford-ner.jar")

# ner_dict = {}
# results = []
# for sent in sentences:
#     words = nltk.word_tokenize(sent)
#     tagged = ner_tagger.tag(words)
#     results += tagged

# for res in results:
#     ner_dict[res[0]] = res[1]

# Coreference Resolution

> fixed root permission error: https://github.com/Lynten/stanford-corenlp/issues/26

### Generate coreferences

In [5]:
nlp = StanfordCoreNLP("./stanford-corenlp-4.2.0", quiet=False)
annotated = nlp.annotate(starwars_text, properties={'annotators': 'coref', 'pipelineLanguage': 'en'})
result = json.loads(annotated)

### Resolve coreferences

In [6]:
corefs = result['corefs']
print("Coreferences found: ",len(corefs))
print("Named entities: " , ner_dict.keys())

Coreferences found:  3
Named entities:  dict_keys(['Darth Vader', 'Anakin Skywalker', 'the Star Wars', 'franchise.123 Vader', 'George Lucas', 'the first', 'six', 'Star Wars', 'Rogue One', 'Jedi', 'Force', 'Sith', 'Palpatine', 'Darth Sidious'])


In [7]:
replace_coref_with = []
sentence_wise_replacements = defaultdict(list) 
sentence_wise_replacements

defaultdict(list, {})

In [8]:
sentences = nltk.sent_tokenize(starwars_text)
# nlp = spacy.lang.en.English()
# nlp.add_pipe(nlp.create_pipe('sentencizer'))
# doc = nlp(starwars_text)
# sentences = [sent.string.strip() for sent in doc.sents]

print('Number of sentences: ', len(sentences))
print(sentences)

Number of sentences:  4
['Darth Vader, also known by his birth name Anakin Skywalker, is a fictional character in the Star Wars franchise.123 Vader appears in the original film trilogy as a pivotal antagonist whose actions drive the plot, while his past as Anakin Skywalker and the story of his corruption are central to the narrative of the prequel trilogy., The character was created by George Lucas and has been portrayed by numerous actors.', 'His appearances span the first six Star Wars films, as well as Rogue One, and his character is heavily referenced in Star Wars: The Force Awakens.', 'He is also an important character in the Star Wars expanded universe of television series, video games, novels, literature and comic books.', 'Originally a Jedi prophesied to bring balance to the Force, he falls to the dark side of the Force and serves the evil Galactic Empire at the right hand of his Sith master, Emperor Palpatine (also known as Darth Sidious).']


In [9]:
for index,coreferences in enumerate(corefs.values()):
    replace_with = coreferences[0]
    for reference in coreferences:
        if reference["text"] in ner_dict.keys() or reference["text"][reference["headIndex"]-reference["startIndex"]] in ner_dict.keys():
            replace_with = reference
        sentence_wise_replacements[reference["sentNum"]-1].append((reference,index))
    replace_coref_with.append(replace_with["text"])  

sentence_wise_replacements[0].sort(key=lambda tup: tup[0]["startIndex"]) 

### Replacement

In [10]:
# nlp = spacy.lang.en.English()
# tokenizer = spacy.tokenizer.Tokenizer(nlp.vocab)
tokenizer = nltk.word_tokenize

# print(list(tokenizer(sentences[0])))
# print(nltk.word_tokenize(sentences[0]))

In [11]:
#Carry out replacement
for index,sent in enumerate(sentences):
    replacement_list = sentence_wise_replacements[index]    # replacement_list : [({},int)]
    for item in replacement_list[::-1]:                     # item : ({},int)
        to_replace = item[0]                                # to_replace: {}
        replace_with = replace_coref_with[item[1]]
        replaced_sent = ""
        words = tokenizer(sent)
        
        
        for i in range(len(words)-1,to_replace["endIndex"]-2,-1):
            replaced_sent = words[i] + " "+ replaced_sent
        
        replaced_sent = replace_with + " " + replaced_sent
        
        for i in range(to_replace["startIndex"]-2,-1,-1):
            replaced_sent = words[i] + " "+ replaced_sent
            
        sentences[index] = replaced_sent

result = ""
for sent in sentences:
    result += sent

### Original Text

In [12]:
starwars_text

'Darth Vader, also known by his birth name Anakin Skywalker, is a fictional character in the Star Wars franchise.123 Vader appears in the original film trilogy as a pivotal antagonist whose actions drive the plot, while his past as Anakin Skywalker and the story of his corruption are central to the narrative of the prequel trilogy., The character was created by George Lucas and has been portrayed by numerous actors. His appearances span the first six Star Wars films, as well as Rogue One, and his character is heavily referenced in Star Wars: The Force Awakens. He is also an important character in the Star Wars expanded universe of television series, video games, novels, literature and comic books. Originally a Jedi prophesied to bring balance to the Force, he falls to the dark side of the Force and serves the evil Galactic Empire at the right hand of his Sith master, Emperor Palpatine (also known as Darth Sidious).'

### New Text

In [13]:
result

'Anakin Skywalker , is a fictional character in the Star Wars franchise.123 Vader appears in the original film trilogy as a pivotal antagonist whose actions drive the plot , while his past as Anakin Skywalker and the story of his corruption are central to the narrative of the prequel trilogy. , The character was created by George Lucas and has been portrayed by numerous actors . Anakin Skywalker appearances span the first six Star Wars films , as well as Rogue One , and his character is heavily referenced in Star Wars : The Force Awakens . Anakin Skywalker is also an important character in the Star Wars expanded universe of television series , video games , novels , literature and comic books . Originally a Jedi prophesied to bring balance to the Force , he falls to the dark side of the Force and serves the evil Galactic Empire at the right hand of his Sith master , Emperor Palpatine ( also known as Darth Sidious ) . '

# Relation Extraction of using Stanford OpenIE

In [14]:
from openie import StanfordOpenIE

triples = []
with StanfordOpenIE() as client:
    for triple in client.annotate(result):
        triples.append(triple)
        
triples = pd.DataFrame(triples)
triples.head(20)

Starting server with command: java -Xmx8G -cp /Users/rhythmsyed/stanfordnlp_resources/stanford-corenlp-full-2018-10-05/* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 60000 -threads 5 -maxCharLength 100000 -quiet True -serverProperties corenlp_server-85101ce922324d7f.props -preload openie


Unnamed: 0,subject,relation,object
0,Vader,appears in,film trilogy
1,.123 Vader,appears as,pivotal antagonist
2,his past,are,while central to narrative
3,Vader,appears as,antagonist
4,.123 Vader,appears,central
5,his past,are central to,narrative of prequel trilogy
6,Vader,appears as,pivotal antagonist
7,.123 Vader,appears in,original film trilogy
8,.123 Vader,appears in,film trilogy
9,his past,are central to,narrative


# Named Entity and Triple Matching

In [15]:
entity_set = set(ner_dict.keys())
entity_set

{'Anakin Skywalker',
 'Darth Sidious',
 'Darth Vader',
 'Force',
 'George Lucas',
 'Jedi',
 'Palpatine',
 'Rogue One',
 'Sith',
 'Star Wars',
 'franchise.123 Vader',
 'six',
 'the Star Wars',
 'the first'}

In [16]:
final_triples = []
for row, col in triples.iterrows():
    col['subject'] = col['subject'].strip()
    
    if col['subject'] in entity_set:
        added = False
        entity2_sent = col['object'].split(' ')
        for entity in entity2_sent:
            if entity in entity_set:
                final_triples.append((ner_dict[col['subject']], col['subject'], col['relation'], ner_dict[entity], col['object']))
                added = True
        if not added:
            final_triples.append((ner_dict[col['subject']], col['subject'], col['relation'], 'O', col['object']))

In [17]:
final_triples

[('PERSON', 'Anakin Skywalker', 'is', 'O', 'fictional character'),
 ('PRODUCT', 'Rogue One', 'is', 'O', 'heavily referenced'),
 ('PRODUCT', 'Rogue One', 'is', 'O', 'heavily referenced in Star Wars'),
 ('PRODUCT', 'Rogue One', 'is', 'O', 'referenced'),
 ('PRODUCT', 'Rogue One', 'is', 'O', 'referenced in Star Wars'),
 ('PERSON', 'Anakin Skywalker', 'is', 'O', 'also important character'),
 ('PERSON', 'Anakin Skywalker', 'is character in', 'O', 'Star Wars'),
 ('PERSON', 'Anakin Skywalker', 'is', 'O', 'character'),
 ('PERSON', 'Anakin Skywalker', 'is', 'O', 'also character'),
 ('PERSON', 'Anakin Skywalker', 'is', 'O', 'important character'),
 ('PERSON', 'Anakin Skywalker', 'is important character in', 'O', 'Star Wars'),
 ('PERSON',
  'Anakin Skywalker',
  'is also important character in',
  'O',
  'Star Wars'),
 ('PERSON', 'Anakin Skywalker', 'is also character in', 'O', 'Star Wars'),
 ('PERSON', 'Jedi', 'bring', 'O', 'balance'),
 ('PERSON', 'Jedi', 'bring balance to', 'ORG', 'Force')]

In [18]:
final_df = pd.DataFrame(final_triples, columns=['Type','Entity 1','Relationship','Type', 'Entity2'])
final_df.to_csv('starwars_StanfordNER_processed.csv', encoding='utf-8', index=False)

> Graph Visualization: https://graphcommons.com/graphs/f1d02e69-3b22-4aea-a981-f8b56dfc7323

> Graph, using Stanford NER: https://graphcommons.com/graphs/1f5d76f7-69ad-4575-83dd-4c90afab7059

# Post Process & Entity Linking to existing Turtle Knowledge Bases

In [19]:
import rdflib
graph = rdflib.Graph()
graph.parse('../data/starwars.ttl', format='turtle')

<Graph identifier=Ne5d13df5d0ce4f66ac0150dc5e74d199 (<class 'rdflib.graph.Graph'>)>

In [22]:
query_str = """
    SELECT ?s ?p ?o
    WHERE {   
        ?s ?p ?o.
        #FILTER(?s="Darth Vader")
    }
    #LIMIT 10
"""
res = graph.query(query_str)

# for s,p,o in res:
#     print(s, '->', p, '->', o)