# End to end NER with an entity ruler and word vectors
1. Document cleaning and splitting the corpus into test and train sets
2. Build word vectors
3. Build training data with entity ruler and split into train and validation data
4. Add the word vectors to the model and run it

## Notebook 2
- Load training data
- Preprocess the text for word vectors (tokenize sentences, detect bigram phrases)
- Build word vectors and save word vector model

In [1]:
#import data
import json

#tokenize words
from gensim.parsing.preprocessing import split_on_space

#build n-grams
import gensim

#build word vectors
from gensim.models.word2vec import Word2Vec
from gensim.models.keyedvectors import KeyedVectors
import multiprocessing

### Load data

In [2]:
# import training data
def load_data(file):
    """Parse the UTF-8 encoded JSON file at ``file`` and return its contents."""
    with open(file, mode='r', encoding='utf-8') as handle:
        return json.loads(handle.read())

In [3]:
# Load the training sentences; the preview printed in the next cell shows
# them as a list of plain strings.
files = load_data('data/sw_train_ner.json')

In [4]:
# Preview ten of the loaded sentences (indices 5 through 14).
print(files[5:15])

['Corran asked', 'necessarily Luke realm possibility either', 'use dueling blasters ranged weapons', 'Usually Deputy Director led morning meetings', 'Wedge gave light slap arm', 'turned Leia', 'Yeah', 'finally caught', 'Kyp put shuttle Corran retrieved extra tool kit pulled shuttle left', '']


### Tokenize sentences

In [5]:
# Tokenize each sentence string into a list of words.
sentences = [split_on_space(text) for text in files]

In [6]:
# Preview the tokenized form of the same ten sentences (indices 5 through 14).
print(sentences[5:15])

[['Corran', 'asked'], ['necessarily', 'Luke', 'realm', 'possibility', 'either'], ['use', 'dueling', 'blasters', 'ranged', 'weapons'], ['Usually', 'Deputy', 'Director', 'led', 'morning', 'meetings'], ['Wedge', 'gave', 'light', 'slap', 'arm'], ['turned', 'Leia'], ['Yeah'], ['finally', 'caught'], ['Kyp', 'put', 'shuttle', 'Corran', 'retrieved', 'extra', 'tool', 'kit', 'pulled', 'shuttle', 'left'], []]


In [7]:
# Create the bigram and trigram phrase models.
# bigram_phrases is trained on the tokenized sentences; trigram_phrases is
# trained on the bigram-transformed sentences. Both use min_count=5 and
# threshold=50.
# NOTE(review): the earlier comment said the new bigrams replace an input
# called "final"; no such name is visible in this notebook.
bigram_phrases = gensim.models.Phrases(sentences, min_count=5, threshold=50)
trigram_phrases = gensim.models.Phrases(bigram_phrases[sentences], min_count=5, threshold=50)

# Wrap the trained models in Phraser objects; these are what make_bigrams and
# make_trigrams index into to transform a token list.
bigram = gensim.models.phrases.Phraser(bigram_phrases)
trigram = gensim.models.phrases.Phraser(trigram_phrases)

def make_bigrams(texts):
    """Run every token list in ``texts`` through the module-level ``bigram`` model.

    Returns a new list holding one transformed token list per input document.
    """
    merged = []
    for tokens in texts:
        merged.append(bigram[tokens])
    return merged

def make_trigrams(texts):
    """Run every token list in ``texts`` through the module-level ``trigram`` model.

    Returns a new list holding one transformed token list per input document.
    """
    merged = []
    for tokens in texts:
        merged.append(trigram[tokens])
    return merged

# Apply the phrase models: transform the tokenized sentences with the bigram
# model, then run the trigram model over that bigram output.
# NOTE(review): only data_bigrams is written to disk later in this notebook;
# data_trigrams is not used in the visible cells — confirm whether the trigram
# output was meant to be saved instead.
data_bigrams = make_bigrams(sentences)
data_trigrams = make_trigrams(data_bigrams)

In [8]:
# Preview the first four bigram-transformed sentences.
print(data_bigrams[0:4])

[['clearly', 'made', 'positive', 'impression', 'least', 'person', 'already'], ['yeah'], ['certainly', 'first'], ['asked', 'date', 'last', 'replacement', 'put', '3', 'ABY']]


In [9]:
def write_data(file, data):
    """Serialize ``data`` as JSON into ``file`` (UTF-8), overwriting existing content."""
    with open(file, mode='w', encoding='utf-8') as sink:
        sink.write(json.dumps(data))
        
# Persist the bigram-transformed sentences as JSON.
write_data('./data/training_bigrams.json', data_bigrams)

def load_data(file):
    """Read ``file`` as UTF-8 text and return the decoded JSON value.

    This repeats the loader defined near the top of the notebook.
    """
    with open(file, encoding='utf-8', mode='r') as source:
        parsed = json.loads(source.read())
    return parsed

# Reload the saved bigram data. This rebinds `sentences`, replacing the plain
# tokenized lists built earlier with the bigram-transformed ones.
sentences = load_data('./data/training_bigrams.json')

### Build word vectors

In [10]:
def training(model_name, corpus, epochs=50, **w2v_params):
    """Train a Word2Vec model on ``corpus`` and save it under ``word_vectors/``.

    Args:
        model_name: Base name for the output files.
        corpus: Tokenized sentences (each a list of string tokens). It is
            iterated once to build the vocabulary and again for training, so
            it must be re-iterable (e.g. a list), not a one-shot generator.
        epochs: Number of training passes over the corpus.
        **w2v_params: Optional overrides for the ``Word2Vec`` constructor
            keywords listed in ``settings`` below (for example ``alpha`` or
            ``window``). With no overrides the original hard-coded values
            are used.

    Side effects:
        Creates ``word_vectors/`` if needed and writes
        ``word_vectors/{model_name}.model`` (full model) and
        ``word_vectors/word2vec_{model_name}.txt`` (vectors in word2vec
        text format).
    """
    import os  # local import so the block does not depend on the import cell

    # cpu_count() - 1 evaluates to 0 on a single-core machine, which would
    # leave the model without any worker thread; always keep at least one.
    worker_count = max(1, multiprocessing.cpu_count() - 1)

    # NOTE(review): alpha=.0009 is far below gensim's default of 0.025. If
    # most_similar() returns near-identical scores for unrelated words, try
    # raising it via the keyword overrides.
    settings = {
        'min_count': 2,
        'window': 2,
        'vector_size': 500,
        'sample': 6e-5,
        'alpha': .0009,
        'min_alpha': .0007,
        'negative': 20,
        'workers': worker_count,
        'shrink_windows': True,
    }
    settings.update(w2v_params)

    # Both save calls fail if the target directory does not exist yet.
    os.makedirs('word_vectors', exist_ok=True)

    w2v_model = Word2Vec(**settings)
    w2v_model.build_vocab(corpus)
    w2v_model.train(corpus, total_examples=w2v_model.corpus_count, epochs=epochs)
    w2v_model.save(f'word_vectors/{model_name}.model')
    w2v_model.wv.save_word2vec_format(f'word_vectors/word2vec_{model_name}.txt')

In [11]:
# Train the word vectors on the reloaded bigram sentences. The output files
# are named after 'sw_word_vec_3', which is the name the hard-coded path in
# gen_similarity expects.
training('sw_word_vec_3', sentences) 

### Use word vectors to review similarity
- Adjust model parameters if necessary, and rerun previous two cells

In [12]:
def gen_similarity(word):
    """Print the entries most similar to ``word`` in the saved word vectors.

    The vectors are read from the text-format file on every call, so a
    retrained model is picked up without restarting the session.
    """
    vectors = KeyedVectors.load_word2vec_format(
        './word_vectors/word2vec_sw_word_vec_3.txt',
        binary=False,
    )
    print(vectors.most_similar(positive=[word]))

In [13]:
# Inspect the nearest neighbours of 'Luke'.
# NOTE(review): in the captured output every score is above 0.998 and generic
# words ('people', 'two', 'another') rank alongside names, so the vectors may
# be barely differentiated — consider adjusting the training parameters.
gen_similarity('Luke')

[('Brianna', 0.9993738532066345), ('Wedge', 0.999233067035675), ('people', 0.9992311000823975), ('Jedi', 0.9991311430931091), ('Tycho', 0.9990862607955933), ('Corran', 0.9990518689155579), ('two', 0.9989820718765259), ('another', 0.9989610314369202), ('Iella', 0.9988853931427002), ('Force', 0.998860239982605)]


In [14]:
# Inspect the nearest neighbours of 'Iella'.
gen_similarity('Iella')

[('Brianna', 0.9989176392555237), ('Luke', 0.9988853931427002), ('Wedge', 0.9988267421722412), ('Jedi', 0.9987472891807556), ('Tycho', 0.9986521005630493), ('people', 0.998629093170166), ('Force', 0.9985208511352539), ('two', 0.9985008239746094), ('Corran', 0.9984903335571289), ('another', 0.9984268546104431)]


In [15]:
# Inspect the nearest neighbours of 'Brianna'.
gen_similarity('Brianna')

[('Luke', 0.9993737936019897), ('Wedge', 0.9992231130599976), ('people', 0.999202311038971), ('Tycho', 0.9991704225540161), ('Jedi', 0.9991136789321899), ('Corran', 0.9990859627723694), ('two', 0.9989864826202393), ('another', 0.9989250302314758), ('Iella', 0.9989176988601685), ('small', 0.9988897442817688)]


In [16]:
# Inspect the nearest neighbours of 'Jedi'.
gen_similarity('Jedi')

[('Luke', 0.9991313219070435), ('Brianna', 0.9991136193275452), ('Wedge', 0.9989767074584961), ('people', 0.998947024345398), ('Corran', 0.9988514184951782), ('Tycho', 0.9988108277320862), ('Iella', 0.9987474083900452), ('two', 0.99871426820755), ('small', 0.9986638426780701), ('another', 0.9986019730567932)]


In [17]:
# Inspect the nearest neighbours of 'Force'.
gen_similarity('Force')

[('Luke', 0.9988601803779602), ('Wedge', 0.998847484588623), ('Brianna', 0.9987767934799194), ('people', 0.998731791973114), ('Tycho', 0.9986791610717773), ('two', 0.9985674619674683), ('Corran', 0.9985648393630981), ('Jedi', 0.9985617399215698), ('Iella', 0.9985208511352539), ('another', 0.998481273651123)]


In [18]:
# Inspect the nearest neighbours of 'lightsaber'.
gen_similarity('lightsaber')

[('Luke', 0.9984385371208191), ('Brianna', 0.9984233379364014), ('Tycho', 0.9982741475105286), ('Corran', 0.9982607364654541), ('people', 0.9981838464736938), ('Wedge', 0.9981691241264343), ('another', 0.9980785250663757), ('Jedi', 0.9980262517929077), ('Iella', 0.9979591965675354), ('training', 0.9979300498962402)]
