In [1]:
import spacy
from stanfordcorenlp import StanfordCoreNLP
import json
from collections import defaultdict
import nltk
import sys
sys.path.append('../../')
from text_to_graph import process_NER, process_corefs, process_dependency_matching
import pandas as pd
import spacy
import neuralcoref
nlp = spacy.load('en_core_web_lg')

In [2]:
neuralcoref.add_to_pipe(nlp)
nlp.add_pipe(nlp.create_pipe('sentencizer'))

In [3]:
doc = nlp('My sister has a dog. She loves him.')

print(doc._.has_coref)
print(doc._.coref_clusters)
print(doc._.coref_resolved)
doc = nlp(doc._.coref_resolved)
sentences = [sent.string.strip() for sent in doc.sents]
print(sentences)

True
[My sister: [My sister, She], a dog: [a dog, him]]
My sister has a dog. My sister loves a dog.
['My sister has a dog.', 'My sister loves a dog.']


In [4]:
doc = nlp(u'The cat and the dog sleep in the basket near the door.')
for np in doc.noun_chunks:
    print(np.text)

The cat
the dog
the basket
the door


In [5]:
raw_text = 'Hello, world. Here are two sentences.'
# nlp.add_pipe(nlp.create_pipe('sentencizer')) # updated
doc = nlp(raw_text)
sentences = [sent.string.strip() for sent in doc.sents]
len(sentences)

2

### Preprocess Unstructured Text Dataset

In [6]:
dataset = pd.read_csv('./dataset/starwars_text_dataset.txt', delimiter='\n', header=None, error_bad_lines=False)
dataset

b'Skipping line 341: expected 1 fields, saw 2\nSkipping line 1209: expected 1 fields, saw 2\nSkipping line 3309: expected 1 fields, saw 2\nSkipping line 3615: expected 1 fields, saw 2\nSkipping line 7258: expected 1 fields, saw 2\nSkipping line 8720: expected 1 fields, saw 2\nSkipping line 9514: expected 1 fields, saw 2\nSkipping line 11246: expected 1 fields, saw 2\nSkipping line 12019: expected 1 fields, saw 2\nSkipping line 13450: expected 1 fields, saw 2\nSkipping line 15793: expected 1 fields, saw 2\nSkipping line 16472: expected 1 fields, saw 2\nSkipping line 18440: expected 1 fields, saw 2\nSkipping line 20491: expected 1 fields, saw 2\nSkipping line 21737: expected 1 fields, saw 2\nSkipping line 23946: expected 1 fields, saw 2\nSkipping line 24387: expected 1 fields, saw 2\nSkipping line 24930: expected 1 fields, saw 2\nSkipping line 25723: expected 1 fields, saw 2\nSkipping line 26509: expected 1 fields, saw 2\nSkipping line 27150: expected 1 fields, saw 2\nSkipping line 27152

Unnamed: 0,0
0,Luke Skywalker is a fictional character and th...
1,"Portrayed by Mark Hamill, Luke first appeared ..."
2,"Three decades later, Hamill returned as Luke i..."
3,"The Last Jedi (2017), and The Rise of Skywalke..."
4,He reprised the role in The Mandalorian episod...
...,...
25145,References ==
25146,==
25147,External links ==
25148,Supreme Leader Snoke in the StarWars.com Databank


### Preprocessing with CoreNLP

In [None]:
text = ''
new_text = ''
coref_batch_size = 5
dataset_index = 0
new_dataset = []
keywords = {}
for index, row in dataset.iterrows():
    text += dataset.iloc[index][0]
    if index % coref_batch_size == 0:
        print('Completed Batch: {}'.format(index))
        # Perform Named Entity Recognition with spaCy
        ner_dict = process_NER(text=text)
#         print('***** Completed NER *****')

        # Generate Coreferences and Dependencies with CoreNLP
        corefs = process_corefs(text=text, corenlp_path='../../stanford-corenlp-4.2.0')
#         print("Coreferences found: ", len(corefs))

        # Perform Replacement with Named Entities and Dependencies
        resolved_text = process_dependency_matching(text=text, ner_dict=ner_dict, corefs=corefs)
#         print('***** Completed Coreference Resolution *****')
        #new_text += resolved_text
        
#         print(resolved_text)
        dataset_index = add_to_dataset(resolved_text, dataset_index)
        
        text = ''

### Neural Coref

In [None]:
text = ''
new_text = ''
coref_batch_size = 5
dataset_index = 0
new_dataset = []
keywords = {}
for index, row in dataset.iterrows():
    text += dataset.iloc[index][0]
    if index == 0:
        continue
        
    if index % coref_batch_size == 0:
        doc = nlp(text)
        resolved_text = doc._.coref_resolved
        dataset_index = add_to_dataset(resolved_text, dataset_index)
        text = ''
        print('Completed Batch: {}'.format(index))

> Grab collections of other sentences and documents for dataset

In [8]:
def add_to_dataset(resolved_text, dataset_index):
    doc = nlp(resolved_text)
    sentences = [sent.string.strip() for sent in doc.sents]
    for sentence in sentences:
        doc = nlp(sentence)
        # add to keywords
        for np in doc.noun_chunks:
            noun = text_format(np.text)
            if noun not in keywords:
                keywords[noun] = 1
            else:
                keywords[noun] += 1
            # replace keyword in sentence
            sentence = sentence.replace(np.text, noun)
        if len(sentence) < 10:
            continue
        with open("./dataset/starwars_text_dataset_cleanedaa.txt", "a") as file:
            file.write(sentence.lower()  + "\n")
#         new_dataset.append(sentence)
#         dataset.iloc[dataset_index][0] = sentence
        dataset_index += 1
#     print('Dataset Size: {}'.format(dataset_index))
    return dataset_index

In [9]:
def text_format(text):
    text = text.lower().replace(' ', '_')
    return text

In [None]:
sentences[7]

In [None]:
dataset[:10]

### Create Keywords

In [None]:
keywords = {}
for index, row in dataset.iterrows():
    doc = nlp(dataset.iloc[index][0])
    for np in doc.noun_chunks:
        text = text_format(np.text)
        if text not in keywords:
            keywords[text] = 1
        else:
            keywords[text] += 1

In [None]:
keywords = []
for index, row in dataset.iterrows():
    dataset.iloc[index][0] = dataset.iloc[index][0].lower()
    doc = nlp(dataset.iloc[index][0])
    for ent in doc.ents:
        orig_ent = str(ent)
        ent = orig_ent.replace(' ', '_')
        keywords.append(ent)
        dataset.iloc[index][0] = dataset.iloc[index][0].replace(orig_ent, ent)
    #print(dataset.iloc[index][0])

In [19]:
for key, value in keywords.items():
    if value < 100:
        continue
    with open("./dataset/starwars_keywords.txt", "a") as file:
        file.write(key  + "\n")

In [None]:
keywords

In [16]:
keywords = {k: v for k, v in sorted(keywords.items(), key=lambda item: item[1], reverse=True)}
keywords

{'who': 1526,
 'it': 1184,
 'star_wars': 1117,
 'the_film': 984,
 'lucas': 861,
 'luke': 807,
 'leia': 758,
 'the_jedi': 754,
 'the_force': 676,
 'he': 676,
 'i': 625,
 'the_character': 623,
 'palpatine': 547,
 'rey': 527,
 'the_mandalorian': 505,
 'the_sith': 504,
 'obi-wan': 489,
 'anakin': 487,
 'han': 440,
 'she': 437,
 'the_empire': 416,
 'they': 403,
 'darth_vader': 397,
 'the_clone_wars': 386,
 'return': 377,
 'revenge': 370,
 'finn': 343,
 'chewbacca': 339,
 'vader': 326,
 'ren': 309,
 'the_phantom_menace': 293,
 'grogu': 288,
 'the_dark_side': 277,
 'jabba': 276,
 'yoda': 263,
 'lucasfilm': 262,
 'what': 255,
 'luke_skywalker': 250,
 'you': 248,
 'skywalker': 244,
 'the_battle': 240,
 'the_series': 239,
 'ahsoka': 238,
 'the_first_order': 234,
 'the_rise': 232,
 'the_clones': 229,
 'the_last_jedi': 223,
 'attack': 219,
 'rose': 213,
 'george_lucas': 211,
 'him': 210,
 'c-3po': 206,
 'we': 206,
 'the_galaxy': 204,
 'the_events': 203,
 'the_resistance': 197,
 'the_role': 193,
 '