# Unsupervised Text to Ontology Creation

In [1]:
import text_to_graph as ttg
import numpy as np

In [2]:
# Sample input passage (a paragraph about Darth Vader / Anakin Skywalker) that
# every later stage of this notebook operates on.
text = 'Darth Vader, also known by his birth name Anakin Skywalker, is a fictional character in the Star Wars franchise. Darth Vader appears in the original film trilogy as a pivotal antagonist whose actions drive the plot, while his past as Anakin Skywalker and the story of his corruption are central to the narrative of the prequel trilogy. The character was created by George Lucas and has been portrayed by numerous actors. His appearances span the first six Star Wars films, as well as Rogue One, and his character is heavily referenced in Star Wars: The Force Awakens. He is also an important character in the Star Wars expanded universe of television series, video games, novels, literature and comic books. Originally a Jedi who was prophesied to bring balance to the Force, he falls to the dark side of the Force and serves the evil Galactic Empire at the right hand of his Sith master, Emperor Palpatine (also known as Darth Sidious).'
text

'Darth Vader, also known by his birth name Anakin Skywalker, is a fictional character in the Star Wars franchise. Darth Vader appears in the original film trilogy as a pivotal antagonist whose actions drive the plot, while his past as Anakin Skywalker and the story of his corruption are central to the narrative of the prequel trilogy. The character was created by George Lucas and has been portrayed by numerous actors. His appearances span the first six Star Wars films, as well as Rogue One, and his character is heavily referenced in Star Wars: The Force Awakens. He is also an important character in the Star Wars expanded universe of television series, video games, novels, literature and comic books. Originally a Jedi who was prophesied to bring balance to the Force, he falls to the dark side of the Force and serves the evil Galactic Empire at the right hand of his Sith master, Emperor Palpatine (also known as Darth Sidious).'

### Named Entity Recognition

In [3]:
# Run named-entity recognition over the raw passage. As the recorded output
# shows, the result maps each entity string to a label such as 'PERSON',
# 'ORG' or 'WORK_OF_ART'.
ner_dict = ttg.process_NER(text=text)
ner_dict

{'Darth Vader': 'PERSON',
 'Anakin Skywalker': 'PERSON',
 'George Lucas': 'PERSON',
 'first': 'ORDINAL',
 'six': 'CARDINAL',
 'Star Wars': 'WORK_OF_ART',
 'Star Wars: The Force Awakens': 'WORK_OF_ART',
 'Force': 'ORG',
 'Galactic Empire': 'ORG',
 'Sith': 'WORK_OF_ART',
 'Palpatine': 'PERSON',
 'Darth Sidious': 'PERSON'}

### Generate Coreferences and Dependencies

In [4]:
# Location of the Stanford CoreNLP distribution handed to the coreference step.
corenlp_dir = './stanford-corenlp-4.2.0'

# Collect the coreference results for the raw passage and report how many
# were found.
corefs = ttg.process_corefs(text=text, corenlp_path=corenlp_dir)
coref_count = len(corefs)
print("Coreferences found: ", coref_count)

Coreferences found:  3


### Replacement with Named Entities and Dependencies

In [5]:
# Rewrite the passage using the NER dictionary and the coreference results.
# In the recorded output, mentions such as "Darth Vader", "He" and
# "The character" have been replaced by "Anakin Skywalker".
# NOTE(review): the possessive "His appearances" became
# "Anakin Skywalker appearances" (no "'s") in the recorded output — confirm
# whether that is acceptable for the downstream steps.
resolved_text = ttg.process_dependency_matching(text=text, ner_dict=ner_dict, corefs=corefs)
resolved_text

'Anakin Skywalker , is a fictional character in the Star Wars franchise. Anakin Skywalker appears in the original film trilogy as a pivotal antagonist whose actions drive the plot, while his past as Anakin Skywalker and the story of his corruption are central to the narrative of the prequel trilogy. Anakin Skywalker was created by George Lucas and has been portrayed by numerous actors. Anakin Skywalker appearances span the first six Star Wars films, as well as Rogue One, and his character is heavily referenced in Star Wars: The Force Awakens. Anakin Skywalker is also an important character in the Star Wars expanded universe of television series, video games, novels, literature and comic books. Originally a Jedi who was prophesied to bring balance to the Force, Anakin Skywalker falls to the dark side of the Force and serves the evil Galactic Empire at the right hand of his Sith master, Emperor Palpatine( also known as Darth Sidious) . '

### Add Neural Coref

In [6]:
# Second coreference pass: feed the already-rewritten text through the neural
# coreference step to resolve the pronouns that were still left (e.g. "his").
# NOTE(review): this cell rebinds `resolved_text` from its own previous value,
# so running it twice applies the pass twice; re-run the dependency-matching
# cell first if this cell needs to be repeated.
# NOTE(review): in the recorded output "the prequel trilogy" was rewritten to
# "the original film trilogy", and "his Sith master" became
# "Anakin Skywalker Sith master" — both look like resolution artefacts; verify.
resolved_text = ttg.process_neural_coref(resolved_text)
resolved_text

'Anakin Skywalker , is a fictional character in the Star Wars franchise. Anakin Skywalker appears in the original film trilogy as a pivotal antagonist whose actions drive the plot, while Anakin Skywalker past as Anakin Skywalker and the story of Anakin Skywalker corruption are central to the narrative of the original film trilogy. Anakin Skywalker was created by George Lucas and has been portrayed by numerous actors. Anakin Skywalker appearances span the first six Star Wars films, as well as Rogue One, and Anakin Skywalker character is heavily referenced in Star Wars: The Force Awakens. Anakin Skywalker is also an important character in the Star Wars expanded universe of television series, video games, novels, literature and comic books. Originally a Jedi who was prophesied to bring balance to the Force, Anakin Skywalker falls to the dark side of the Force and serves the evil Galactic Empire at the right hand of Anakin Skywalker Sith master, Emperor Palpatine( also known as Darth Sidio

### Perform NER to get Entities for Taxogen

In [7]:
# Re-run NER, this time on the coreference-resolved text. This rebinds
# `ner_dict`, replacing the dictionary that was built from the raw passage.
ner_dict = ttg.process_NER(text=resolved_text)

# Turn every entity name into a single whitespace-free token by joining its
# words with underscores.
entities = [name.replace(' ', '_') for name in ner_dict.keys()]
entities

['Anakin_Skywalker',
 'George_Lucas',
 'first',
 'six',
 'Star_Wars',
 'Star_Wars:_The_Force_Awakens',
 'Force',
 'Galactic_Empire',
 'Anakin_Skywalker_Sith',
 'Palpatine',
 'Darth_Sidious']

In [None]:
# Write the entity tokens, one per line, to both raw input files of the
# starwars dataset.
# NOTE(review): papers.txt receives exactly the same entity list as
# keywords.txt — confirm that this is the intended content for papers.txt.
raw_targets = (
    './taxogen/data/starwars/raw/keywords.txt',
    './taxogen/data/starwars/raw/papers.txt',
)
for raw_target in raw_targets:
    np.savetxt(raw_target, entities, fmt='%s')

### Generate Word Embeddings with Word2Vec

Pretrained embeddings (the next cell loads `enwiki_20180420_300d.txt`) can be downloaded from https://wikipedia2vec.github.io/wikipedia2vec/pretrained/

In [11]:
import gensim
# Text-format (binary=False) word2vec file holding the pretrained vectors.
pretrained_vectors_path = '../word-embeddings/enwiki_20180420_300d.txt'
model = gensim.models.KeyedVectors.load_word2vec_format(
    pretrained_vectors_path,
    binary=False,
)

In [19]:
# Disabled experiment: look up the pretrained vector in `model` for each
# entity token. Kept commented out; nothing below runs.
# NOTE(review): the list omits 'Anakin_Skywalker', which is present in
# `entities` — confirm whether that is intentional. Tokens missing from the
# model's vocabulary would presumably raise KeyError on lookup.
# sentence = ['George_Lucas',
#  'first',
#  'six',
#  'Star_Wars',
#  'Star_Wars:_The_Force_Awakens',
#  'Force',
#  'Galactic_Empire',
#  'Anakin_Skywalker_Sith',
#  'Palpatine',
#  'Darth_Sidious']
# vectors = [model[w] for w in sentence]
# vectors

In [None]:
# Run the bundled word2vec binary, training on raw/keywords.txt and writing
# the resulting embeddings to input/embeddings.txt.
# NOTE(review): the training file is the bare entity list saved by an earlier
# cell, not a text corpus — confirm this is the intended training input.
! ./taxogen/code/word2vec -train ./taxogen/data/starwars/raw/keywords.txt -output ./taxogen/data/starwars/input/embeddings.txt

In [None]:
# Run the cluster-preprocess.py script against the starwars data directory.
! python taxogen/code/cluster-preprocess.py ./taxogen/data/starwars

In [None]:
# Run the preprocess.py script against the starwars data directory.
! python taxogen/code/preprocess.py ./taxogen/data/starwars

In [None]:
# Copy the trained embeddings from input/ into init/.
! cp ./taxogen/data/starwars/input/embeddings.txt ./taxogen/data/starwars/init/embeddings.txt

In [None]:
# Copy input/keywords.txt into init/ under the name seed_keywords.txt.
# NOTE(review): this notebook only writes raw/keywords.txt; input/keywords.txt
# is presumably produced by one of the preprocessing scripts — verify.
! cp ./taxogen/data/starwars/input/keywords.txt ./taxogen/data/starwars/init/seed_keywords.txt

In [None]:
# Run the main.py entry point on the starwars data directory.
! python ./taxogen/code/main.py ./taxogen/data/starwars/