# Word2Vec

In [37]:
from __future__ import print_function, unicode_literals, division
import io
import bz2
import logging
from os import path
import os
import random
from pprint import pprint
from collections import defaultdict

import plac
try:
    import ujson as json
except ImportError:
    import json
from gensim.models import Word2Vec
from preshed.counter import PreshCounter
from spacy.strings import hash_string

logger = logging.getLogger(__name__)


class Corpus(object):
    def __init__(self, directory, min_freq=10):
        self.directory = directory
        self.counts = PreshCounter()
        self.strings = {}
        self.min_freq = min_freq

    def count_doc(self, words):
        # Get counts for this document
        doc_counts = PreshCounter()
        doc_strings = {}
        for word in words:
            key = hash_string(word)
            doc_counts.inc(key, 1)
            doc_strings[key] = word
        n = 0
        for key, count in doc_counts:
            self.counts.inc(key, count)
            # TODO: Why doesn't inc return this? =/
            corpus_count = self.counts[key]
            # Remember the string when we exceed min count
            if corpus_count >= self.min_freq and (corpus_count - count) < self.min_freq:
                 self.strings[key] = doc_strings[key]
            n += count
        return n

    def __iter__(self):
        for text_loc in iter_dir(self.directory):
            with io.open(text_loc, 'r', encoding='utf8') as file_:
                sent_strs = list(file_)
                random.shuffle(sent_strs)
                for sent_str in sent_strs:
                    yield sent_str.split()


def iter_dir(loc):
    for fn in os.listdir(loc):
        if path.isdir(path.join(loc, fn)):
            for sub in os.listdir(path.join(loc, fn)):
                yield path.join(loc, fn, sub)
        else:
            yield path.join(loc, fn)

@plac.annotations(
    in_dir=("Location of input directory"),
    out_loc=("Location of output file"),
    n_workers=("Number of workers", "option", "n", int),
    size=("Dimension of the word vectors", "option", "d", int),
    window=("Context window size", "option", "w", int),
    min_count=("Min count", "option", "m", int),
    negative=("Number of negative samples", "option", "g", int),
    nr_iter=("Number of iterations", "option", "i", int),
)
def main(in_dir, out_loc, negative=5, n_workers=4, window=5, size=128, min_count=10, nr_iter=2):
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    model = Word2Vec(
        size=size,
        window=window,
        min_count=min_count,
        workers=n_workers,
        sample=1e-5,
        negative=negative
    )
    corpus = Corpus(in_dir)
    total_words = 0
    total_sents = 0
    for text_no, text_loc in enumerate(iter_dir(corpus.directory)):
        print("\tReading file: %s" % text_loc)
        with io.open(text_loc, 'r', encoding='utf8') as file_:
            text = file_.read()
        total_sents += text.count('\n')
        total_words += corpus.count_doc(text.split())  
        logger.info("PROGRESS: at batch #%i, processed %i words, keeping %i word types",
                    text_no, total_words, len(corpus.strings))
    model.corpus_count = total_sents
    model.raw_vocab = defaultdict(int)
    for key, string in corpus.strings.items():
        model.raw_vocab[string] = corpus.counts[key]
    model.scale_vocab()
    model.finalize_vocab()
    model.iter = nr_iter
    model.train(corpus, total_examples=model.corpus_count, epochs=nr_iter)

    model.save(out_loc)


**min_count**: One of them is for pruning the internal dictionary. Words that appear only once or twice in a billion-word corpus are probably uninteresting typos and garbage. In addition, there’s not enough data to make any meaningful training on those words, so it’s best to ignore them. A reasonable value for min_count is between 0-100, depending on the size of your dataset.

In [39]:
main(in_dir='../data/text/test', 
     out_loc='../embeddings/test/test_word2vec', 
     negative=5, 
     n_workers=8, 
     window=5, 
     size=128, 
     min_count=5, 
     nr_iter=2)

2017-11-25 05:25:40,838 : INFO : PROGRESS: at batch #0, processed 438 words, keeping 7 word types
2017-11-25 05:25:40,839 : INFO : PROGRESS: at batch #1, processed 712 words, keeping 9 word types
2017-11-25 05:25:40,842 : INFO : PROGRESS: at batch #2, processed 963 words, keeping 13 word types
2017-11-25 05:25:40,843 : INFO : Loading a fresh vocabulary
2017-11-25 05:25:40,843 : INFO : min_count=5 retains 13 unique words (100% of original 13, drops 0)
2017-11-25 05:25:40,844 : INFO : min_count=5 leaves 294 word corpus (100% of original 294, drops 0)
2017-11-25 05:25:40,844 : INFO : deleting the raw counts dictionary of 13 items
2017-11-25 05:25:40,845 : INFO : sample=1e-05 downsamples 13 most-common words
2017-11-25 05:25:40,845 : INFO : downsampling leaves estimated 3 word corpus (1.1% of prior 294)
2017-11-25 05:25:40,846 : INFO : estimated required memory for 13 words and 128 dimensions: 19812 bytes
2017-11-25 05:25:40,846 : INFO : resetting layer weights
2017-11-25 05:25:40,847 : IN

	Reading file: ../data/text/test/test_1.txt
	Reading file: ../data/text/test/test_2.txt
	Reading file: ../data/text/test/test_3.txt


In [45]:
new_model = gensim.models.Word2Vec.load('../embeddings/test/test_word2vec')
print("\nThere are %i vecabs" % len(new_model.wv.vocab))
pprint(new_model.wv.vocab)

print("\nSingle Vocab")
print(new_model.wv.vocab['game'])

print("\nKeras Layer")
print(new_model.wv.get_keras_embedding())

print("\nVector")
print(new_model.wv.word_vec(word='game', use_norm=False))


2017-11-25 05:29:16,556 : INFO : loading Word2Vec object from ../embeddings/test/test_word2vec
2017-11-25 05:29:16,561 : INFO : loading wv recursively from ../embeddings/test/test_word2vec.wv.* with mmap=None
2017-11-25 05:29:16,562 : INFO : setting ignored attribute syn0norm to None
2017-11-25 05:29:16,563 : INFO : setting ignored attribute cum_table to None
2017-11-25 05:29:16,566 : INFO : loaded ../embeddings/test/test_word2vec



There are 13 vecabs
{'I': <gensim.models.keyedvectors.Vocab object at 0x7f9d9467da20>,
 'It': <gensim.models.keyedvectors.Vocab object at 0x7f9d9467dba8>,
 'a': <gensim.models.keyedvectors.Vocab object at 0x7f9d9467dac8>,
 'and': <gensim.models.keyedvectors.Vocab object at 0x7f9d9467da90>,
 'game': <gensim.models.keyedvectors.Vocab object at 0x7f9d9467d7b8>,
 'good': <gensim.models.keyedvectors.Vocab object at 0x7f9d9467dc18>,
 'is': <gensim.models.keyedvectors.Vocab object at 0x7f9d9467d908>,
 'it': <gensim.models.keyedvectors.Vocab object at 0x7f9d9467d898>,
 'love': <gensim.models.keyedvectors.Vocab object at 0x7f9d9467dcf8>,
 'so': <gensim.models.keyedvectors.Vocab object at 0x7f9d9467db38>,
 'the': <gensim.models.keyedvectors.Vocab object at 0x7f9d9467d978>,
 'this': <gensim.models.keyedvectors.Vocab object at 0x7f9d9467dc88>,
 'to': <gensim.models.keyedvectors.Vocab object at 0x7f9d9467d9e8>}

Single Vocab
Vocab(count:41, index:1, sample_int:36677838)

Keras Layer
<keras.layers.

# Online training / Resuming training
Advanced users can load a model and continue training it with more sentences:

`model = gensim.models.Word2Vec.load('/tmp/mymodel')
model.train(more_sentences)`

You may need to tweak the total_words parameter to train(), depending on what learning rate decay you want to simulate.

Note that it’s not possible to resume training with models generated by the C tool, load_word2vec_format(). You can still use them for querying/similarity, but information vital for training (the vocab tree) is missing there.


In [46]:
# model = gensim.models.Word2Vec.load('/tmp/mymodel')
# model.train(more_sentences)

In [50]:
import keras
os.environ['KERAS_BACKEND']='tensorflow'
print(os.environ['KERAS_BACKEND'])

tensorflow


In [61]:
from subprocess import call, check_output
print(os.getcwd())
print(call(["pwd", "|"]))
print(check_output(['ls','-l']))


/home/jovyan/work/notebooks
0
b'total 48\n-rwxr-xr-x 1 root root 20571 Nov 12 14:54 Hierarchical Attention Network.ipynb\n-rwxr-xr-x 1 root root 10483 Nov 24 07:16 spacy-tutorial.ipynb\n-rwxr-xr-x 1 root root 11720 Nov 25 05:23 Word2Vec.ipynb\n'
