# Word2Vec: Training & Visualisation

## Imports

In [52]:
import re
import time
import glob
import logging
import spacy

from collections import defaultdict

from gensim.models.word2vec import Word2Vec
from gensim.models.word2vec import LineSentence

In [53]:
# Load the spaCy pipeline registered under the name 'en'; tag_word() below
# uses it to look up the entity type / part of speech of a word.
nlp = spacy.load("en")

In [101]:
# Log at INFO level with a timestamp and the level name in front of
# every message.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

## Corpus

In [27]:
# Input: the annotated corpus that is read with LineSentence below.
corpus_path = 'corpus/corpus_harrypotter/annotated.txt'

# Output: where the trained model is saved.
model_path = 'results/model_harrypotter'

In [37]:
# Read the annotated corpus from corpus_path as an iterable of token lists.
corpus = LineSentence(corpus_path)

# First pass: count how often every token occurs in the whole corpus.
frequency = defaultdict(int)
for sentence in corpus:
    for word in sentence:
        frequency[word] += 1

# Second pass: keep only tokens that occur more than once. Sentences are
# kept even if they end up empty after filtering.
filtered_corpus = []
for sentence in corpus:
    filtered_corpus.append([word for word in sentence if frequency[word] > 1])
corpus = filtered_corpus

In [102]:
# Show the first and the last sentence of the filtered corpus,
# separated by an ellipsis marker.
preview_tokens = corpus[0] + ["..."] + corpus[-1]
print(" ".join(preview_tokens))

harry|PERSON potter|PERSON and|CONJ the|DET sorcerer|NORP stone|PERSON chapter|NOUN one|CARDINAL the|DET boy|NOUN who|NOUN lived|VERB mr.|PROPN and|CONJ mrs.|PROPN dursley|PERSON of|ADP number|NOUN four|CARDINAL privet|PROPN drive|PROPN were|VERB proud|ADJ say|VERB that|ADP they|PRON were|VERB perfectly|ADV normal|ADJ thank|VERB you|PRON very|ADV much|ADV ... all|DET was|VERB well|ADV


## Training

In [98]:
# Train a Word2Vec model on the filtered corpus, save it to model_path
# and report how long that took.
#
# Hyperparameter reference:
# http://radimrehurek.com/gensim/models/word2vec.html
#
# NOTE(review): `size` and `iter` look like the pre-4.0 gensim keyword
# names -- confirm against the installed gensim version.

# Degree of parallelization. Requires cython to be installed.
workers = 2

# Size of the neural network layer, which is also the size of each
# output vector. Bigger vectors need more training data but can lead
# to more accurate results.
size = 300

# How often a token must appear in the corpus.
min_count = 1

# Context window: the maximum distance between the current and the
# predicted word within a sentence.
window = 5

# Epochs, i.e. how often the corpus is iterated over.
# For serious training, 15-20 epochs are suggested.
epochs = 30

# Training algorithm: sg=0 (the default) uses CBOW, sg=1 uses skip-gram.
skip_gram = 0

# Negative sampling: if > 0, this many "noise words" are drawn
# (usually between 5-20, default is 5). If set to 0, no negative
# sampling is used.
negative = 2

t = time.time()

# Collect the settings above under the keyword names Word2Vec expects.
hyperparameters = dict(
    workers=workers,
    size=size,
    min_count=min_count,
    window=window,
    iter=epochs,
    sg=skip_gram,
    negative=negative,
)

model = Word2Vec(sentences=corpus, **hyperparameters)
model.save(model_path)

elapsed = time.time() - t
print("Trained model in %0.2f seconds." % elapsed)

2017-01-12 13:39:48,448 : INFO : collecting all words and their counts
2017-01-12 13:39:48,450 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-01-12 13:39:48,505 : INFO : PROGRESS: at sentence #10000, processed 126257 words, keeping 8938 word types
2017-01-12 13:39:48,546 : INFO : PROGRESS: at sentence #20000, processed 250501 words, keeping 12149 word types
2017-01-12 13:39:48,592 : INFO : PROGRESS: at sentence #30000, processed 380310 words, keeping 14518 word types
2017-01-12 13:39:48,639 : INFO : PROGRESS: at sentence #40000, processed 502565 words, keeping 16269 word types
2017-01-12 13:39:48,686 : INFO : PROGRESS: at sentence #50000, processed 634029 words, keeping 17691 word types
2017-01-12 13:39:48,732 : INFO : PROGRESS: at sentence #60000, processed 764250 words, keeping 18639 word types
2017-01-12 13:39:48,781 : INFO : PROGRESS: at sentence #70000, processed 894499 words, keeping 19341 word types
2017-01-12 13:39:48,826 : INFO : PROGRESS: at s

Trained model in 40.32 seconds.


## Application

In [107]:
def tag_word(word):
    """Encode *word* as ``"<lowercased text>|<tag>"``.

    The text is run through the module-level ``nlp`` pipeline and only
    the first token of the first sentence is used. The tag is the
    token's entity type if it has one, otherwise its part-of-speech tag.

    Returns ``None`` when the pipeline yields no token at all.
    """
    first_token = next(
        (tok for sent in nlp(word).sents for tok in sent),
        None,
    )
    if first_token is None:
        return None

    label = first_token.ent_type_ or first_token.pos_
    return "{}|{}".format(first_token.text.lower(), label)

def get_sim(word, tag):
    """Yield the neighbours of *word* in the model that carry *tag*.

    *word* is encoded with ``tag_word`` and looked up in the module-level
    ``model``; its 1000 most similar vocabulary entries are scanned in
    the order ``most_similar`` returns them.

    Args:
        word: Plain word to look up, e.g. ``"Harry"``.
        tag: Tag the neighbours must have, e.g. ``"VERB"`` or ``"PERSON"``.

    Yields:
        ``(text, score)`` tuples, where ``text`` is the vocabulary entry
        without its ``|TAG`` suffix and ``score`` is the similarity
        reported by the model.
    """
    encoded = tag_word(word)
    for sim_word, sim_score in model.most_similar(positive=[encoded], topn=1000):
        # Entries have the form "text|TAG". Compare the tag part exactly:
        # a substring test on the whole entry would let e.g. "CONJ" also
        # match entries tagged "SCONJ" or "CCONJ".
        text, _, sim_tag = sim_word.rpartition('|')
        if sim_tag == tag:
            yield text, sim_score

### Similar POS

In [106]:
# Build a toy sentence per character from the closest verb, adverb and
# person in the model.
for name in ["Harry", "Hermione", "Ron", "Malfoy", "Snape", "Slughorn", "Ginny"]:
    # Only the best match per tag is needed, so take the first pair the
    # generator yields instead of materialising every match in a list.
    # next() raises StopIteration if no neighbour carries the tag.
    # The template has no adjective slot, so no ADJ lookup is made.
    print("{name} {verb} {adv} with {person}.".format(
        name=name,
        verb=next(get_sim(name, "VERB"))[0],
        adv=next(get_sim(name, "ADV"))[0],
        person=next(get_sim(name, "PERSON"))[0]))

Harry pulse merely with neville.
Hermione dreams hastily with ginny.
Ron gasp hastily with ginny.
Malfoy deranged berserk with goyle.
Snape retiring imploringly with umbridge.
Slughorn butt imploringly with umbridge.
Ginny crooning currently with wit.


### Characters Compared

In [114]:
# Rank the characters by how similar their vectors are to each attribute.
characters = ["Harry", "Hermione", "Ron", "Malfoy", "Snape"]
attributes = ["happy", "nice", "evil", "tearful"]

for attribute in attributes:
    # Similarity between every character and the current attribute.
    scores = {}
    for character in characters:
        scores[character] = model.similarity(tag_word(character),
                                             tag_word(attribute))

    print()
    print(attribute)

    # Highest similarity first; ranks are printed starting at 1.
    ranking = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    for rank, (character, score) in enumerate(ranking, start=1):
        print("{}. {} {:.2f}".format(rank, character, score))



happy
1. Snape 0.07
2. Hermione 0.05
3. Harry 0.01
4. Ron -0.02
5. Malfoy -0.04

nice
1. Hermione 0.01
2. Ron -0.06
3. Snape -0.07
4. Malfoy -0.10
5. Harry -0.18

evil
1. Snape 0.05
2. Hermione 0.04
3. Malfoy 0.03
4. Ron -0.01
5. Harry -0.05

tearful
1. Malfoy 0.05
2. Snape 0.04
3. Ron 0.02
4. Hermione -0.01
5. Harry -0.01
