In [1]:
import os
from pathlib import Path

import en_core_web_lg
import pandas as pd
import spacy
from spacy import displacy

In [None]:
#! python -m spacy download en_core_web_lg

In [None]:
#! python -m spacy download en_trf_bertbaseuncased_lg

In [3]:
# Load the large English pipeline; the parsing and NER cells below call `nlp`.
nlp = en_core_web_lg.load()

# Parsing & Part-of-Speech Tags

<b>Text</b>: The original word text.<br>
<b>Lemma</b>: The base form of the word.<br>
<b>POS</b>: The simple UPOS part-of-speech tag.<br>
<b>Tag</b>: The detailed part-of-speech tag.<br>
<b>Dep</b>: Syntactic dependency, i.e. the relation between tokens.<br>
<b>Shape</b>: The word shape – capitalization, punctuation, digits.<br>
<b>is_alpha</b>: Does the token consist only of alphabetic characters?<br>
<b>is_stop</b>: Is the token part of a stop list, i.e. the most common words of the language?<br>

In [None]:
doc = nlp('Darth Vader is also known by his birth name Anakin Skywalker.')

# Collect one record per token and build the frame once. Appending row by row
# copies the whole frame on every iteration, and DataFrame.append was removed
# in pandas 2.0.
token_columns = ['Text', 'Lemma', 'POS', 'Tag', 'Dep', 'Shape', 'is_alpha', 'is_stop']
results = pd.DataFrame(
    [
        {
            'Text': token.text,
            'Lemma': token.lemma_,
            'POS': token.pos_,
            'Tag': token.tag_,
            'Dep': token.dep_,
            'Shape': token.shape_,
            'is_alpha': token.is_alpha,
            'is_stop': token.is_stop,
        }
        for token in doc
    ],
    columns=token_columns,  # keeps the column order even when the doc is empty
)
results

In [None]:
# Render the dependency parse inline in the notebook.
# NOTE(review): with jupyter=True displaCy displays the markup itself and `svg`
# is expected to be None; to save the SVG with the lines below, render with
# jupyter=False so the markup string is returned — confirm against the spaCy docs.
svg = displacy.render(doc, style="dep",jupyter=True)
#output_path = Path("dep.svg")
#output_path.open("w", encoding="utf-8").write(svg)

# Named Entities (NER)

<b>Text</b>: The original entity text.<br>
<b>Start</b>: Character offset of the start of the entity in the text.<br>
<b>End</b>: Character offset of the end of the entity in the text.<br>
<b>Label</b>: Entity label, i.e. type.<br>

In [2]:
# Requires `nlp` from the model-loading cell; run that cell first.
#doc = nlp('Darth Vader is also known by his birth name Anakin Skywalker.')
doc = nlp('Darth Vader is also known by his birth name Skywalker.')

# Collect one record per entity and build the frame once (DataFrame.append was
# removed in pandas 2.0 and copied the frame on every call).
results = pd.DataFrame(
    [
        {
            'Text': ent.text,
            'Start': ent.start_char,
            'End': ent.end_char,
            'Label': ent.label_,
        }
        for ent in doc.ents
    ],
    columns=['Text', 'Start', 'End', 'Label'],  # keeps the columns when no entity is found
)
results

NameError: name 'nlp' is not defined

In [None]:
displacy.render(doc, style="ent")

# Larger Text Example

In [4]:
article = 'Darth Vader, also known by his birth name Anakin Skywalker, is a fictional character in the Star Wars franchise.123 Vader appears in the original film trilogy as a pivotal antagonist whose actions drive the plot, while his past as Anakin Skywalker and the story of his corruption are central to the narrative of the prequel trilogy., The character was created by George Lucas and has been portrayed by numerous actors. His appearances span the first six Star Wars films, as well as Rogue One, and his character is heavily referenced in Star Wars: The Force Awakens. He is also an important character in the Star Wars expanded universe of television series, video games, novels, literature and comic books. Originally a Jedi prophesied to bring balance to the Force, he falls to the dark side of the Force and serves the evil Galactic Empire at the right hand of his Sith master, Emperor Palpatine (also known as Darth Sidious).4 He is also the father of Luke Skywalker and Princess Leia Organa, secret husband of Padmé Amidala and grandfather of Kylo Ren., Darth Vader has become one of the most iconic villains in popular culture, and has been listed among the greatest villains and fictional characters ever.56 The American Film Institute listed him as the third greatest movie villain in cinema history on 100 Years... 100 Heroes and Villains, behind Hannibal Lecter and Norman Bates.7 However, other critics consider him a tragic hero, citing his original motivations for the greater good before his fall to the dark side.'
article

'Darth Vader, also known by his birth name Anakin Skywalker, is a fictional character in the Star Wars franchise.123 Vader appears in the original film trilogy as a pivotal antagonist whose actions drive the plot, while his past as Anakin Skywalker and the story of his corruption are central to the narrative of the prequel trilogy., The character was created by George Lucas and has been portrayed by numerous actors. His appearances span the first six Star Wars films, as well as Rogue One, and his character is heavily referenced in Star Wars: The Force Awakens. He is also an important character in the Star Wars expanded universe of television series, video games, novels, literature and comic books. Originally a Jedi prophesied to bring balance to the Force, he falls to the dark side of the Force and serves the evil Galactic Empire at the right hand of his Sith master, Emperor Palpatine (also known as Darth Sidious).4 He is also the father of Luke Skywalker and Princess Leia Organa, secr

In [5]:
doc = nlp(article)

# Collect one record per token and build the frame once. Appending row by row
# copies the whole frame on every iteration, and DataFrame.append was removed
# in pandas 2.0.
token_columns = ['Text', 'Lemma', 'POS', 'Tag', 'Dep', 'Shape', 'is_alpha', 'is_stop']
results = pd.DataFrame(
    [
        {
            'Text': token.text,
            'Lemma': token.lemma_,
            'POS': token.pos_,
            'Tag': token.tag_,
            'Dep': token.dep_,
            'Shape': token.shape_,
            'is_alpha': token.is_alpha,
            'is_stop': token.is_stop,
        }
        for token in doc
    ],
    columns=token_columns,  # keeps the column order even when the doc is empty
)
results

Unnamed: 0,Text,Lemma,POS,Tag,Dep,Shape,is_alpha,is_stop
0,Darth,Darth,PROPN,NNP,compound,Xxxxx,True,False
1,Vader,Vader,PROPN,NNP,nsubj,Xxxxx,True,False
2,",",",",PUNCT,",",punct,",",False,False
3,also,also,ADV,RB,advmod,xxxx,True,True
4,known,know,VERB,VBN,acl,xxxx,True,False
...,...,...,...,...,...,...,...,...
277,to,to,ADP,IN,prep,xx,True,True
278,the,the,DET,DT,det,xxx,True,True
279,dark,dark,ADJ,JJ,amod,xxxx,True,False
280,side,side,NOUN,NN,pobj,xxxx,True,True


In [None]:
#displacy.render(doc, style="dep", jupyter=True)

# Stanford Open Information Extraction using CoreNLP

https://nlp.stanford.edu/software/openie.html

Extracting {entity1, relation, entity2} triples:

In [None]:
from openie import StanfordOpenIE

In [None]:
# Run OpenIE on a single sentence and print every extracted triple.
text = 'Darth Vader is also known by his birth name Anakin Skywalker.'
with StanfordOpenIE() as client:
    print('Text: %s.' % text)

    for triple in client.annotate(text):
        print(triple)

Trying longer document text:

In [None]:
# Collect every OpenIE triple extracted from the full article.
with StanfordOpenIE() as client:
    print('Text: %s.' % article)
    triples = list(client.annotate(article))

In [None]:
triples[:3]

In [None]:
print(f'Number of Entity-Relation triples: {len(triples)}')

# Encoding using BERT

https://explosion.ai/blog/spacy-transformers

In [None]:
import spacy
import en_trf_bertbaseuncased_lg

# NOTE: this rebinds `nlp`. From this cell on, `nlp` is the BERT pipeline, not
# the en_core_web_lg pipeline loaded near the top of the notebook.
nlp = en_trf_bertbaseuncased_lg.load()

Use word-level embeddings from BERT to add context to tokens:

In [None]:
apple1 = nlp("Apple shares rose on the news.")
apple2 = nlp("Apple sold fewer iPhones this quarter.")
apple3 = nlp("Apple pie is delicious.")
apple4 = nlp("Apple is a much better company than Microsoft.")

In [None]:
# Compare the first token ("Apple") of each sentence pair.
for left, right in ((apple1, apple2), (apple1, apple3), (apple2, apple4)):
    print(left[0].similarity(right[0]))

In [None]:
article_enc = nlp(article)
vader_example = nlp('A popular dialogue from Darth Vader is, Luke I am your father.')

In [None]:
article_enc.similarity(vader_example) # average word embedding

In [None]:
# Join the values of the first triple into one statement and score it against the article.
first_triple = triples[0]
test = ' '.join(first_triple.values())
article_enc.similarity(nlp(test))

Compute the cosine similarity between each triple and the original article, and keep only the triples that score above the threshold:

In [None]:
# Keep a triple only when its joined text scores above `threshold` against the
# encoded article; every triple is printed with its score either way.
threshold = 0.50
triples_filtered = []
for triple in triples:
    statement = ' '.join(triple.values())
    similarity = article_enc.similarity(nlp(statement))
    print(triple, similarity, end='\n\n')
    if similarity > threshold:
        triples_filtered.append(triple)

In [None]:
print(len(triples), len(triples_filtered))

# KG Creation & Exploration

In [None]:
import rdflib

In [None]:
starwars_graph = rdflib.Graph()

In [None]:
# Add each filtered triple to the graph as three xsd:string literals.
xsd_string = rdflib.namespace.XSD.string
for triple in triples_filtered:
    subject, relation, obj = (
        rdflib.Literal(triple[key], datatype=xsd_string)
        for key in ('subject', 'relation', 'object')
    )
    starwars_graph.add((subject, relation, obj))

In [None]:
# Print every stored triple as "subject -> predicate -> object".
for s, p, o in starwars_graph:
    print(s, p, o, sep=' -> ')

Sample queries:

In [None]:
# All triples whose subject is 'Darth Vader'.
query_str = """
    SELECT ?s ?p ?o
    WHERE {   
        ?s ?p ?o.
        FILTER(?s='Darth Vader')
    }
"""

res = starwars_graph.query(query_str)
for s, p, o in res:
    print(s, p, o, sep=' -> ')

In [None]:
# All triples whose subject is 'Vader'.
query_str = """
    SELECT ?s ?p ?o
    WHERE {   
        ?s ?p ?o.
        FILTER(?s='Vader')
    }
"""

res = starwars_graph.query(query_str)
for s, p, o in res:
    print(s, p, o, sep=' -> ')

In [None]:
# All triples whose subject is the pronoun 'He'.
query_str = """
    SELECT ?s ?p ?o
    WHERE {   
        ?s ?p ?o.
        FILTER(?s='He')
    }
"""
res = starwars_graph.query(query_str)

for s, p, o in res:
    print(s, p, o, sep=' -> ')

### TODO: Need to "contextualize" key entities, i.e. group together mentions such as "He" and "Darth Vader" that refer to the same entity

# starwars.ttl Exploration

In [None]:
graph = rdflib.Graph()
graph.parse('../data/starwars.ttl', format='turtle')

In [None]:
# Preview ten triples of the graph parsed from starwars.ttl.
query_str = """
    SELECT ?s ?p ?o
    WHERE {   
        ?s ?p ?o.
    }
    LIMIT 10
"""
# Query `graph` (parsed from starwars.ttl in the previous cell), not
# `starwars_graph`, which holds the OpenIE triples built earlier.
res = graph.query(query_str)

for s, p, o in res:
    print(s, '->', p, '->', o)

# Parse articles from Wikipedia and construct KG

https://stackabuse.com/getting-started-with-pythons-wikipedia-api/

In [None]:
import wikipedia

In [None]:
wikipedia.search('Millennium Falcon')

In [None]:
wikipedia.summary('Millennium Falcon')

In [None]:
wikipedia.page('Millennium Falcon').content

In [None]:
mfalcon_article = wikipedia.page('Millennium Falcon').content

In [None]:
def text_to_graph(article, similarity_threshold, nlp=None):
    """Build an RDF graph from the OpenIE triples extracted from ``article``.

    Triples are extracted with StanfordOpenIE. The values of each triple are
    joined into one statement, and the triple is kept only when the similarity
    between that statement and the encoded article is strictly greater than
    ``similarity_threshold``.

    :param article: text to extract triples from
    :param similarity_threshold: triples scoring at or below this value are dropped
    :param nlp: optional, already-loaded pipeline used to encode the article and
        the statements; when omitted, the BERT pipeline is loaded inside the call
    :return: an rdflib.Graph whose subjects, relations and objects are stored as
        xsd:string literals
    """
    # Convert to triples
    with StanfordOpenIE() as client:
        triples = list(client.annotate(article))
    print(f'Num of Triples: {len(triples)}')

    # Load BERT only when the caller did not pass a pipeline, so repeated calls
    # can reuse one loaded model instead of reloading it every time.
    if nlp is None:
        nlp = en_trf_bertbaseuncased_lg.load()
    article_enc = nlp(article)

    # Similarity Thresholding
    triples_filtered = []
    for triple in triples:
        statement = ' '.join(triple.values())
        if article_enc.similarity(nlp(statement)) > similarity_threshold:
            triples_filtered.append(triple)
    print(f'Filtered Triples: {len(triples_filtered)}')

    # TODO: add a contextualization step here

    # Create RDF graph
    xsd_string = rdflib.namespace.XSD.string
    graph = rdflib.Graph()
    for triple in triples_filtered:
        graph.add(tuple(
            rdflib.Literal(triple[key], datatype=xsd_string)
            for key in ('subject', 'relation', 'object')
        ))

    return graph
    

In [None]:
# NOTE: this rebinds `graph`, replacing the graph parsed from starwars.ttl above.
# The input is `article` (the Darth Vader text), not `mfalcon_article`.
graph = text_to_graph(article=article, similarity_threshold=0.50)

In [None]:
# Fetch every triple and index its subject, predicate and object (as plain
# strings) by the triple's position in the result.
query_str = """
    SELECT ?s ?p ?o
    WHERE {   
        ?s ?p ?o.
    }
"""
res = graph.query(query_str)
rows = [(str(s), str(p), str(o)) for s, p, o in res]

heads = {position: row[0] for position, row in enumerate(rows)}
relations = {position: row[1] for position, row in enumerate(rows)}
tails = {position: row[2] for position, row in enumerate(rows)}
    #print(s, '->', p, '->', o)

## Node, Index pairs

In [None]:
heads, tails

## Populate term pairs for LexNET3: evaluate the hypernym relationship between every pair of heads

In [None]:
# Pair every head with every head (itself included). Pairs are dict keys, so
# repeated heads collapse into a single entry.
head_terms = list(heads.values())
term_pairs = {
    (first, second): 'false'    # 'false' is only a placeholder label that satisfies the downstream code
    for first in head_terms
    for second in head_terms
}
print(len(term_pairs))

# Utilize LexNet3 for Hypernym Extraction

In [None]:
import tensorflow as tf
from LexNET3.lstm_common import vectorize_path, load_dataset, load_embeddings, get_paths
from LexNET3.knowledge_resource import KnowledgeResource
from LexNET3.paths_lstm_classifier_tf import PathLSTMClassifier

In [None]:
# Root of the local entitylink checkout. Set the ENTITYLINK_ROOT environment
# variable to run on another machine; the default keeps the original location.
ENTITYLINK_ROOT = os.environ.get('ENTITYLINK_ROOT', '/Users/rhythmsyed/Desktop/GTRI/entitylink')

# Kept as plain strings because later cells build file names with `+`.
corpus_prefix = ENTITYLINK_ROOT + '/resource/wiki'
dataset_prefix = ENTITYLINK_ROOT + '/LexNET3/datasets/KHN'
model_prefix_file = ENTITYLINK_ROOT + '/models/customKH_model_checkpt/lstm_integrated_0.00_10'

In [None]:
# Read one relation label per line and map each label to its line position.
relations_path = dataset_prefix + '/relations.txt'
with open(relations_path, 'r', encoding='utf-8') as f_in:
    relations = [line.strip() for line in f_in]
relation_index = dict(zip(relations, range(len(relations))))

In [None]:
# Load the datasets
print('Loading the dataset...')
test_set = load_dataset(dataset_prefix + '/test.tsv', relations)
y_test = [relation_index[label] for label in list(test_set.values())]
print('Done!')

In [None]:
# Load the resource (processed corpus)
print('Loading the corpus...')
corpus = KnowledgeResource(corpus_prefix)
print('Done!')

In [None]:
# Load Model
classifier, word_index, pos_index, dep_index, dir_index = PathLSTMClassifier.load_model(model_prefix_file)
print('Model Loaded!')

In [None]:
from collections import defaultdict
from itertools import count
def load_paths_and_word_vectors(corpus, dataset_keys, lemma_index):
    """
    Load the paths and the word vectors for this dataset
    :param corpus: the corpus object
    :param dataset_keys: the (x, y) term pairs in the dataset
    :param lemma_index: the index of lemmas for the word embeddings; terms that
        are missing from it are mapped to index 0
    :return: a tuple (x_y_vectors, paths) with one entry per pair: the lemma
        indices of x and y, and a dict mapping each vectorized path to its count
    """

    # Define the dictionaries: every new key receives the next free integer.
    # NOTE(review): these vocabularies are rebuilt on each call rather than
    # taken from the loaded model; confirm the resulting indices match the
    # ones the classifier was trained with.
    pos_index = defaultdict(count(0).__next__)
    dep_index = defaultdict(count(0).__next__)
    dir_index = defaultdict(count(0).__next__)

    # Look up '#UNKNOWN#' first so that it takes index 0 in every dictionary.
    for vocabulary in (pos_index, dep_index, dir_index):
        vocabulary['#UNKNOWN#']

    # Vectorize the paths (this calculates p_xy for the corpus)
    # Note: vectorize path calls vectorize edge, which computes the edge
    keys = [(corpus.get_id_by_term(x.encode()), corpus.get_id_by_term(y.encode()))
            for (x, y) in dataset_keys]

    paths = []
    for x_id, y_id in keys:
        vectorized = {
            vectorize_path(path, lemma_index, pos_index, dep_index, dir_index): path_count
            for path, path_count in get_paths(corpus, x_id, y_id).items()
        }
        # Drop the entry keyed by None, if vectorize_path produced one.
        vectorized.pop(None, None)
        paths.append(vectorized)

    empty = [pair for pair, pair_paths in zip(dataset_keys, paths) if not pair_paths]
    print('Pairs without paths:', len(empty), ', all dataset:', len(dataset_keys))

    # Get the word embeddings for x and y (get a lemma index)
    print('Getting word vectors for the terms...')
    x_y_vectors = [(lemma_index.get(x, 0), lemma_index.get(y, 0)) for (x, y) in dataset_keys]

    print('Done loading corpus data!')

    return x_y_vectors, paths

In [None]:
# Load the paths and create the feature vectors
print('Loading path files...')
x_y_vectors_test, x_test = load_paths_and_word_vectors(corpus, list(term_pairs.keys()), word_index)

In [None]:
print('Evaluation:')
pred = classifier.predict(x_test, x_y_vectors=x_y_vectors_test)
len(pred)

## Check relation classification

In [None]:
# Invert relation_index (label -> index) so predictions can be turned back into
# labels, then label each term pair with the prediction at the same position.
res = {position: label for label, position in relation_index.items()}
classified = {pair: res[pred[position]] for position, pair in enumerate(term_pairs)}

In [None]:
# Show only the pairs whose predicted label is not 'false'.
for key, value in classified.items():
    if value == 'false':
        continue
    print(key, '->', value)