In [None]:
import spacy
import pandas as pd
import en_core_web_lg
from stanfordcorenlp import StanfordCoreNLP
import json
from collections import defaultdict
import nltk

<img src="diagram.png" style="height: 50px; width: 1000px;"/>

In [None]:
# Input passage fed to the NER, coreference-resolution and sentence-splitting cells below.
starwars_text = 'Darth Vader, also known by his birth name Anakin Skywalker, is a fictional character in the Star Wars franchise. Darth Vader appears in the original film trilogy as a pivotal antagonist whose actions drive the plot, while his past as Anakin Skywalker and the story of his corruption are central to the narrative of the prequel trilogy. The character was created by George Lucas and has been portrayed by numerous actors. His appearances span the first six Star Wars films, as well as Rogue One, and his character is heavily referenced in Star Wars: The Force Awakens. He is also an important character in the Star Wars expanded universe of television series, video games, novels, literature and comic books. Originally a Jedi prophesied to bring balance to the Force, he falls to the dark side of the Force and serves the evil Galactic Empire at the right hand of his Sith master, Emperor Palpatine (also known as Darth Sidious).'
starwars_text

> Note: the passage above contains 6 sentences.

# SpaCy NER

In [None]:
# Run the large English spaCy pipeline over the passage.
nlp = en_core_web_lg.load()
doc = nlp(starwars_text)

# Map each entity's surface text to its label; when a surface form occurs
# more than once, the label of its last occurrence wins.
ner_dict = {ent.text: ent.label_ for ent in doc.ents}
ner_dict

# Stanford NER

> Note: Stanford NER performs much worse than spaCy NER on this text. It recognizes PERSON entities, but not when only a fragment of the name appears, and it does not recognize entity types other than LOCATION or PERSON (e.g. WORK_OF_ART, DATE).

In [None]:
# Optional alternative: rebuild `ner_dict` with the Stanford NER tagger (through
# NLTK) instead of spaCy. Disabled by default; set run_this to 1 to enable it.
# (The sentences could also come from nltk.sent_tokenize(starwars_text).)
run_this=0
if run_this==1:
    # Sentence-split the passage with spaCy's rule-based sentencizer.
    nlp = spacy.lang.en.English()
    nlp.add_pipe(nlp.create_pipe('sentencizer'))
    doc = nlp(starwars_text)
    sentences = [span.string.strip() for span in doc.sents]

    ner_tagger = nltk.tag.StanfordNERTagger(
        "./stanford-ner-2018-10-16/classifiers/english.all.3class.distsim.crf.ser.gz",
        "./stanford-ner-2018-10-16/stanford-ner.jar",
    )

    ner_dict = {}
    results = []

    # A fresh blank English pipeline whose vocab backs a bare tokenizer.
    nlp = spacy.lang.en.English()
    tokenizer = spacy.tokenizer.Tokenizer(nlp.vocab)
    for sentence in sentences:
        words = [token.orth_ for token in tokenizer(sentence)]
        print(words)
        results.extend(ner_tagger.tag(words))

    # token -> tag; a later occurrence of a token overwrites an earlier one.
    ner_dict.update((tagged_pair[0], tagged_pair[1]) for tagged_pair in results)

In [None]:
# Show the entity dictionary that the following cells use (the spaCy result
# unless the Stanford NER cell above was enabled with run_this=1).
ner_dict

# Coreference Resolution

> Note: a root permission error was fixed by following https://github.com/Lynten/stanford-corenlp/issues/26

### Generate coreferences and dependencies

In [None]:
# Run CoreNLP's `coref` annotator over the passage and parse the JSON response.
# `result['corefs']` (read by the next cell) holds the coreference chains.
nlp = StanfordCoreNLP("./stanford-corenlp-4.2.0", quiet=False)
try:
    annotated = nlp.annotate(starwars_text, properties={'annotators': 'coref', 'pipelineLanguage': 'en'})
finally:
    # The wrapper keeps a backend CoreNLP server alive until close() is called;
    # shut it down even when annotation fails so it does not linger and hold memory.
    nlp.close()
result = json.loads(annotated)

### Resolve coreferences

In [None]:
# Pull the coreference chains out of the CoreNLP response and summarise what
# the resolution step has to work with.
corefs = result['corefs']
print(f"Coreferences found:  {len(corefs)}")
print(f"Named entities:  {ner_dict.keys()}")

In [None]:
# Replacement text chosen for each coreference chain; position = chain index.
replace_coref_with = []
# 0-based sentence index -> list of (mention, chain index) pairs found in that sentence.
sentence_wise_replacements = defaultdict(list) 
sentence_wise_replacements

In [None]:
# Split the passage into sentences with NLTK. (The alternative tried here was
# spaCy's rule-based 'sentencizer' pipe on a blank English pipeline.)
sentences = nltk.sent_tokenize(starwars_text)

print('Number of sentences: ', len(sentences))
print(sentences)

> Note: Here, the NLTK sentence tokenizer is more accurate than spaCy's.

In [None]:
# For every coreference chain pick the text that should stand in for all of its
# mentions, and index each mention by the sentence it occurs in.

# Start from a clean slate so re-running this cell does not append duplicates.
replace_coref_with = []
sentence_wise_replacements = defaultdict(list)


def _is_named_entity(text):
    """Return True when `text` is a key of `ner_dict` with a real entity label.

    'O' is the tag the Stanford NER path stores for non-entity tokens, so it
    must not count as an entity.
    """
    return text is not None and ner_dict.get(text, 'O') != 'O'


for index, coreferences in enumerate(corefs.values()):
    # Fall back to the first mention of the chain when no mention is a named entity.
    replace_with = coreferences[0]
    for reference in coreferences:
        # headIndex/startIndex are token positions inside the sentence, so the
        # head word is a *token* of the mention, not a character of its text.
        # Assumes the mention text is its tokens separated by whitespace — TODO confirm.
        mention_tokens = reference["text"].split()
        head_offset = reference["headIndex"] - reference["startIndex"]
        head_token = mention_tokens[head_offset] if 0 <= head_offset < len(mention_tokens) else None

        # The last mention that is (or is headed by) a named entity wins.
        if _is_named_entity(reference["text"]) or _is_named_entity(head_token):
            replace_with = reference
        # sentNum is 1-based; the sentence list is 0-based.
        sentence_wise_replacements[reference["sentNum"] - 1].append((reference, index))
    replace_coref_with.append(replace_with["text"])

# Order the mentions of *every* sentence (not only the first) by start position.
for replacements in sentence_wise_replacements.values():
    replacements.sort(key=lambda tup: tup[0]["startIndex"])

### Pronoun Replacement with Named Entity

In [None]:
# Word tokenizer used when splicing replacements into the sentences. (The
# alternative tried here was a bare spaCy Tokenizer on a blank English vocab.)
# NOTE(review): the mention start/end indices come from CoreNLP; presumably
# nltk.word_tokenize splits this text into the same tokens — confirm.
tokenizer = nltk.word_tokenize

In [None]:
# Carry out replacement: rewrite every coreference mention with the text chosen
# for its chain, then stitch the sentences back into one string.
# NOTE: this updates `sentences` in place, so re-run the sentence-splitting
# cell before re-running this one.
for index, sent in enumerate(sentences):
    replacement_list = sentence_wise_replacements[index]
    if not replacement_list:
        continue  # nothing to rewrite; keep the sentence exactly as it is

    # Tokenize once and apply all replacements to the same token list, so that
    # several mentions in one sentence are all rewritten.
    words = tokenizer(sent)

    # Work right-to-left so earlier token positions stay valid after a splice.
    ordered = sorted(replacement_list, key=lambda tup: tup[0]["startIndex"], reverse=True)
    leftmost_replaced = len(words)  # 0-based start of the span replaced most recently
    for to_replace, chain_index in ordered:
        # startIndex is 1-based inclusive and endIndex 1-based exclusive,
        # i.e. the mention covers words[start:end] in 0-based terms.
        start = to_replace["startIndex"] - 1
        end = to_replace["endIndex"] - 1
        if end > leftmost_replaced:
            # Overlaps a mention already replaced (or runs past the tokens we
            # have); splicing it too would corrupt the sentence.
            continue
        words[start:end] = [replace_coref_with[chain_index]]
        leftmost_replaced = start

    sentences[index] = " ".join(words)

# Join with a space so untouched sentences do not run into their neighbours.
# `result` is rebound here from the CoreNLP response to the resolved text,
# which is what the OpenIE cell below reads.
result = " ".join(sentences)

### Original Text

In [None]:
# The untouched input passage, for comparison with the resolved text below.
starwars_text

### New Text

In [None]:
# The passage after coreference mentions were replaced.
result

# Relation Extraction using Stanford OpenIE

In [None]:
from openie import StanfordOpenIE

# Run OpenIE over the coreference-resolved text and keep every triple it yields.
with StanfordOpenIE() as client:
    extracted = list(client.annotate(result))

# One row per triple; preview the first 20.
triples = pd.DataFrame(extracted)
triples.head(20)

## TODO: Try out MinIE and/or Allenai OpenIE-standalone

# Named Entity and Triple Matching

In [None]:
# Unique entity surface forms, used for membership tests when matching triples.
entity_set = set(ner_dict)
entity_set

In [None]:
# Keep only triples whose subject is a known entity and annotate both ends with
# an entity type: (subject type, subject, relation, object type, object).
final_triples = []
for _, triple in triples.iterrows():
    subject = triple['subject'].strip()
    if subject not in entity_set:
        continue

    relation, object_text = triple['relation'], triple['object']
    subject_type = ner_dict[subject]

    # One output row per entity mentioned anywhere inside the object text
    # (substring match); 'O' marks an object that contains no known entity.
    object_types = [ner_dict[name] for name in entity_set if name in object_text]
    if not object_types:
        object_types = ['O']

    final_triples.extend(
        (subject_type, subject, relation, object_type, object_text)
        for object_type in object_types
    )

In [None]:
# Inspect the typed triples before they are tabulated.
final_triples

In [None]:
# Tabulate the typed triples and export them as CSV (no index column).
# NOTE(review): the label 'Type' is used for two columns, so final_df['Type']
# selects both of them. Distinct labels would be unambiguous, but renaming
# changes the exported CSV header — confirm with the CSV's consumers first.
final_df = pd.DataFrame(final_triples, columns=['Type','Entity1','Relationship','Type', 'Entity2'])
final_df.to_csv('starwars1_processed.csv', encoding='utf-8', index=False)

> starwars1_processed Graph Visualization: https://graphcommons.com/graphs/01252887-9a23-4b33-a06d-02729a330beb

> Graph, using Stanford NER: https://graphcommons.com/graphs/1f5d76f7-69ad-4575-83dd-4c90afab7059

# Triple Linking

## Similarity with BERT

In [None]:
# Load the BERT-based spaCy model package; `nlp` is rebound to it and the
# linking cell below uses its Doc.similarity scores.
import en_trf_bertbaseuncased_lg
nlp = en_trf_bertbaseuncased_lg.load()

In [None]:
# Link near-duplicate objects: whenever two 'Entity2' strings are similar
# enough, both rows are rewritten to the shorter of the two strings.

# Similarity above which two Entity2 strings are treated as the same thing.
SIMILARITY_THRESHOLD = 0.80   # 80% seems to work pretty well

# Write through positional indexing on the frame itself: assigning to the row
# objects yielded by iterrows() is not guaranteed to modify the DataFrame.
# A position is used (not the label) because the frame has duplicate labels.
entity2_pos = final_df.columns.get_loc('Entity2')

# Entity2 text -> parsed Doc, so each distinct string is parsed only once
# instead of once per pair.
doc_cache = {}

for i in range(len(final_df)):
    head = final_df.iat[i, entity2_pos]

    for j in range(len(final_df)):
        tail = final_df.iat[j, entity2_pos]
        if head == tail:
            continue

        for text in (head, tail):
            if text not in doc_cache:
                doc_cache[text] = nlp(text)
        doc1, doc2 = doc_cache[head], doc_cache[tail]
        confidence = doc1.similarity(doc2)

        if confidence > SIMILARITY_THRESHOLD:
            # Perform logic for linking: the shorter string becomes the shared value.
            new_tail = tail if len(tail)<len(head) else head

            final_df.iat[i, entity2_pos] = new_tail
            final_df.iat[j, entity2_pos] = new_tail

            print("Sentence 1:", doc1)
            print("Sentence 2:", doc2)
            print("Similarity:", confidence)
            print(new_tail)
            print()

            # Keep comparing with the value row i now holds, so later matches
            # cannot revert it to the longer string.
            head = new_tail

In [None]:
# Linking can leave identical rows behind; keep one copy of each.
final_df = final_df.drop_duplicates()
final_df

In [None]:
# Export the linked, de-duplicated triples (no index column).
final_df.to_csv('starwars1_linked_processed.csv', encoding='utf-8', index=False)

> https://graphcommons.com/graphs/aae63b3e-870d-4c2e-83e4-bbca9f297b42

# Textual Entailment with Keras_Parikh_Entailment

> https://github.com/explosion/spaCy/tree/master/examples/keras_parikh_entailment

In [None]:
# Load the en_vectors_web_lg model package and rebind `nlp` to it.
# NOTE(review): no later cell in this notebook uses this pipeline yet.
import en_vectors_web_lg
nlp = en_vectors_web_lg.load()

In [None]:
# Re-display the typed triples (the list built before linking, not final_df).
final_triples

# Post Process & Entity Linking to existing Turtle Knowledge Bases

In [None]:
# Parse the existing Star Wars knowledge base (Turtle format) into an rdflib graph.
import rdflib
graph = rdflib.Graph()
graph.parse('../data/starwars.ttl', format='turtle')

In [None]:
# Select every (subject, predicate, object) triple in the graph. The FILTER and
# LIMIT lines inside the query are SPARQL comments, i.e. currently disabled.
query_str = """
    SELECT ?s ?p ?o
    WHERE {   
        ?s ?p ?o.
        #FILTER(?s="Darth Vader")
    }
    #LIMIT 10
"""
res = graph.query(query_str)

# Uncomment to print each result row as "subject -> predicate -> object".
# for s,p,o in res:
#     print(s, '->', p, '->', o)