In [1]:
# -*- coding: utf-8 -*-

from gensim.models.word2vec import Word2Vec, LineSentence
import logging
import multiprocessing
import nltk
import os



In [2]:
# Corpus location and artifact names; paths are relative to the notebook.
DATA_DIR = "../data/comp_data"
# Input: flashcard file that FlashcardSentences reads line by line.
FLASHCARD_SENTS = "studystack_qa_cleaner_no_qm.txt"
# Output: file name the trained Word2Vec model is saved under.
FLASHCARD_MODEL = "studystack.bin"
# Embedding dimensionality.
EMBED_SIZE = 300  # so we can reuse code using word2vec embeddings

# Timestamped log lines; the root logger is opened up to DEBUG so that
# gensim's progress and worker messages are shown in the cell output.
logging.basicConfig(format="%(asctime)s : %(levelname)s : %(message)s")
logging.getLogger().setLevel(logging.DEBUG)
logger = logging.getLogger("flashcards-embedding")

In [3]:
class FlashcardSentences(object):
    """Restartable iterable over the tokenized sentences of a flashcard file.

    Every non-blank line of the file must hold exactly three tab-separated
    fields. The first field is ignored; the second and third are treated as
    the question and the answer. Each line yields one "sentence": the
    question tokens followed by the answer tokens.

    The file is reopened on every ``__iter__`` call, so the same instance
    can be iterated several times (e.g. once to build a vocabulary and once
    per training epoch).
    """

    def __init__(self, filename):
        """Remember the path of the flashcard file; nothing is opened yet."""
        self.filename = filename

    def __iter__(self):
        """Yield one list of word tokens per non-blank line of the file.

        Raises:
            ValueError: if a non-blank line does not contain exactly three
                tab-separated fields.
            UnicodeDecodeError: if a line is not valid UTF-8.
        """
        # The context manager closes the handle when iteration finishes,
        # fails, or the generator is closed early, instead of leaking it.
        with open(self.filename, "rb") as handle:
            for lineno, raw in enumerate(handle, start=1):
                # Strip the raw bytes first so only ASCII whitespace is removed.
                raw = raw.strip()
                if not raw:
                    # Blank lines carry no flashcard; skip them.
                    continue
                # Drop every non-ASCII character, then work on text only.
                text = raw.decode("utf8").encode("ascii", "ignore").decode("ascii")
                fields = text.split("\t")
                if len(fields) != 3:
                    raise ValueError(
                        "%s:%d: expected 3 tab-separated fields, got %d"
                        % (self.filename, lineno, len(fields))
                    )
                _, question, answer = fields
                yield nltk.word_tokenize(question) + nltk.word_tokenize(answer)

In [4]:
# Corpus iterable plus an untrained Word2Vec model. No sg/hs arguments are
# passed, so gensim's defaults apply (CBOW with negative sampling).
sentences = FlashcardSentences(os.path.join(DATA_DIR, FLASHCARD_SENTS))
model = Word2Vec(
    workers=multiprocessing.cpu_count(),  # one worker thread per CPU core
    min_count=5,
    window=5,
    size=EMBED_SIZE,
)

In [5]:
# Pass 1 over the corpus: count words and build the model's vocabulary.
model.build_vocab(sentences)

# Hand the training step a fresh iterable over the same flashcard file.
sentences = FlashcardSentences(
    os.path.join(DATA_DIR, FLASHCARD_SENTS)
)

2018-06-15 11:18:46,761 : INFO : collecting all words and their counts
2018-06-15 11:18:47,039 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-06-15 11:18:50,252 : INFO : PROGRESS: at sentence #10000, processed 153114 words, keeping 16630 word types
2018-06-15 11:18:53,386 : INFO : PROGRESS: at sentence #20000, processed 313954 words, keeping 28792 word types
2018-06-15 11:18:56,700 : INFO : PROGRESS: at sentence #30000, processed 471520 words, keeping 35570 word types
2018-06-15 11:18:59,704 : INFO : PROGRESS: at sentence #40000, processed 639145 words, keeping 41333 word types
2018-06-15 11:19:02,665 : INFO : PROGRESS: at sentence #50000, processed 788895 words, keeping 46598 word types
2018-06-15 11:19:05,627 : INFO : PROGRESS: at sentence #60000, processed 940192 words, keeping 52072 word types
2018-06-15 11:19:08,926 : INFO : PROGRESS: at sentence #70000, processed 1120292 words, keeping 58345 word types
2018-06-15 11:19:11,916 : INFO : PROGRESS: at

In [6]:
# Train on the full corpus. `model.epochs` is the supported name for the
# epoch count; `model.iter` is a deprecated alias that emits a
# DeprecationWarning in gensim 3.x.
model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs)

# Normalize the vectors in place (replace=True overwrites the raw ones).
# NOTE(review): per the gensim docs the model cannot be trained further
# after this call - confirm that only similarity queries are needed.
model.init_sims(replace=True)

# Persist the trained model next to the input data.
model.save(os.path.join(DATA_DIR, FLASHCARD_MODEL))

  if __name__ == '__main__':
2018-06-15 11:21:10,678 : INFO : training model with 4 workers on 40567 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=5 window=5
2018-06-15 11:21:11,823 : INFO : EPOCH 1 - PROGRESS: at 0.40% examples, 20459 words/s, in_qsize 0, out_qsize 0
2018-06-15 11:21:12,929 : INFO : EPOCH 1 - PROGRESS: at 1.16% examples, 26943 words/s, in_qsize 0, out_qsize 0
2018-06-15 11:21:14,035 : INFO : EPOCH 1 - PROGRESS: at 1.87% examples, 29075 words/s, in_qsize 0, out_qsize 0
2018-06-15 11:21:15,123 : INFO : EPOCH 1 - PROGRESS: at 2.58% examples, 30249 words/s, in_qsize 0, out_qsize 0
2018-06-15 11:21:16,168 : INFO : EPOCH 1 - PROGRESS: at 3.32% examples, 31175 words/s, in_qsize 0, out_qsize 0
2018-06-15 11:21:17,273 : INFO : EPOCH 1 - PROGRESS: at 4.04% examples, 32574 words/s, in_qsize 0, out_qsize 0
2018-06-15 11:21:18,380 : INFO : EPOCH 1 - PROGRESS: at 4.74% examples, 32690 words/s, in_qsize 0, out_qsize 0
2018-06-15 11:21:19,417 : INFO : EPOCH 1 - P

2018-06-15 11:22:28,962 : INFO : EPOCH 1 - PROGRESS: at 52.94% examples, 34784 words/s, in_qsize 0, out_qsize 0
2018-06-15 11:22:30,029 : INFO : EPOCH 1 - PROGRESS: at 53.74% examples, 34882 words/s, in_qsize 0, out_qsize 0
2018-06-15 11:22:31,248 : INFO : EPOCH 1 - PROGRESS: at 54.33% examples, 34721 words/s, in_qsize 0, out_qsize 0
2018-06-15 11:22:32,461 : INFO : EPOCH 1 - PROGRESS: at 55.08% examples, 34661 words/s, in_qsize 0, out_qsize 0
2018-06-15 11:22:33,543 : INFO : EPOCH 1 - PROGRESS: at 55.39% examples, 34389 words/s, in_qsize 0, out_qsize 0
2018-06-15 11:22:34,556 : INFO : EPOCH 1 - PROGRESS: at 56.13% examples, 34413 words/s, in_qsize 0, out_qsize 0
2018-06-15 11:22:35,604 : INFO : EPOCH 1 - PROGRESS: at 57.06% examples, 34505 words/s, in_qsize 0, out_qsize 0
2018-06-15 11:22:36,660 : INFO : EPOCH 1 - PROGRESS: at 57.93% examples, 34592 words/s, in_qsize 0, out_qsize 0
2018-06-15 11:22:37,723 : INFO : EPOCH 1 - PROGRESS: at 58.67% examples, 34686 words/s, in_qsize 0, out_

2018-06-15 11:23:41,563 : INFO : EPOCH 2 - PROGRESS: at 3.17% examples, 35955 words/s, in_qsize 0, out_qsize 0
2018-06-15 11:23:42,678 : INFO : EPOCH 2 - PROGRESS: at 3.91% examples, 36621 words/s, in_qsize 0, out_qsize 0
2018-06-15 11:23:43,761 : INFO : EPOCH 2 - PROGRESS: at 4.74% examples, 37305 words/s, in_qsize 0, out_qsize 0
2018-06-15 11:23:44,842 : INFO : EPOCH 2 - PROGRESS: at 5.55% examples, 37846 words/s, in_qsize 0, out_qsize 0
2018-06-15 11:23:46,025 : INFO : EPOCH 2 - PROGRESS: at 6.43% examples, 37711 words/s, in_qsize 0, out_qsize 0
2018-06-15 11:23:47,109 : INFO : EPOCH 2 - PROGRESS: at 7.17% examples, 37995 words/s, in_qsize 0, out_qsize 0
2018-06-15 11:23:48,172 : INFO : EPOCH 2 - PROGRESS: at 7.97% examples, 38301 words/s, in_qsize 0, out_qsize 0
2018-06-15 11:23:49,303 : INFO : EPOCH 2 - PROGRESS: at 8.80% examples, 38387 words/s, in_qsize 0, out_qsize 0
2018-06-15 11:23:50,368 : INFO : EPOCH 2 - PROGRESS: at 9.53% examples, 38091 words/s, in_qsize 0, out_qsize 0
2

2018-06-15 11:25:01,256 : INFO : EPOCH 2 - PROGRESS: at 67.39% examples, 41790 words/s, in_qsize 0, out_qsize 0
2018-06-15 11:25:02,294 : INFO : EPOCH 2 - PROGRESS: at 68.20% examples, 41714 words/s, in_qsize 0, out_qsize 0
2018-06-15 11:25:03,390 : INFO : EPOCH 2 - PROGRESS: at 69.11% examples, 41698 words/s, in_qsize 0, out_qsize 0
2018-06-15 11:25:04,577 : INFO : EPOCH 2 - PROGRESS: at 70.15% examples, 41724 words/s, in_qsize 0, out_qsize 0
2018-06-15 11:25:05,640 : INFO : EPOCH 2 - PROGRESS: at 71.12% examples, 41799 words/s, in_qsize 0, out_qsize 0
2018-06-15 11:25:06,727 : INFO : EPOCH 2 - PROGRESS: at 72.07% examples, 41865 words/s, in_qsize 0, out_qsize 0
2018-06-15 11:25:07,812 : INFO : EPOCH 2 - PROGRESS: at 73.02% examples, 41935 words/s, in_qsize 0, out_qsize 0
2018-06-15 11:25:08,881 : INFO : EPOCH 2 - PROGRESS: at 73.98% examples, 42007 words/s, in_qsize 0, out_qsize 0
2018-06-15 11:25:09,935 : INFO : EPOCH 2 - PROGRESS: at 74.90% examples, 42091 words/s, in_qsize 0, out_

2018-06-15 11:26:13,332 : INFO : EPOCH 3 - PROGRESS: at 31.27% examples, 44565 words/s, in_qsize 0, out_qsize 0
2018-06-15 11:26:14,382 : INFO : EPOCH 3 - PROGRESS: at 32.23% examples, 44704 words/s, in_qsize 0, out_qsize 0
2018-06-15 11:26:15,428 : INFO : EPOCH 3 - PROGRESS: at 33.15% examples, 44837 words/s, in_qsize 0, out_qsize 0
2018-06-15 11:26:16,449 : INFO : EPOCH 3 - PROGRESS: at 34.07% examples, 44803 words/s, in_qsize 0, out_qsize 0
2018-06-15 11:26:17,584 : INFO : EPOCH 3 - PROGRESS: at 35.10% examples, 44843 words/s, in_qsize 0, out_qsize 0
2018-06-15 11:26:18,723 : INFO : EPOCH 3 - PROGRESS: at 36.18% examples, 44873 words/s, in_qsize 0, out_qsize 0
2018-06-15 11:26:19,736 : INFO : EPOCH 3 - PROGRESS: at 37.07% examples, 44855 words/s, in_qsize 0, out_qsize 0
2018-06-15 11:26:20,761 : INFO : EPOCH 3 - PROGRESS: at 38.01% examples, 44837 words/s, in_qsize 0, out_qsize 0
2018-06-15 11:26:21,844 : INFO : EPOCH 3 - PROGRESS: at 39.06% examples, 44918 words/s, in_qsize 0, out_

2018-06-15 11:27:32,907 : DEBUG : job loop exiting, total 707 jobs
2018-06-15 11:27:32,911 : DEBUG : worker exiting, processed 176 jobs
2018-06-15 11:27:32,911 : DEBUG : worker exiting, processed 177 jobs
2018-06-15 11:27:32,912 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-06-15 11:27:32,918 : DEBUG : worker exiting, processed 177 jobs
2018-06-15 11:27:32,931 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-06-15 11:27:32,935 : DEBUG : worker exiting, processed 177 jobs
2018-06-15 11:27:32,937 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-06-15 11:27:32,943 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-06-15 11:27:32,944 : INFO : EPOCH - 3 : training on 7053621 raw words (5228272 effective words) took 115.6s, 45226 effective words/s
2018-06-15 11:27:34,027 : INFO : EPOCH 4 - PROGRESS: at 1.01% examples, 47645 words/s, in_qsize 0, out_qsize 0
2018-06-15 11:27:35,139 : INFO : EPOCH 4 - P

2018-06-15 11:28:46,471 : INFO : EPOCH 4 - PROGRESS: at 57.22% examples, 39940 words/s, in_qsize 0, out_qsize 0
2018-06-15 11:28:47,635 : INFO : EPOCH 4 - PROGRESS: at 57.93% examples, 39808 words/s, in_qsize 0, out_qsize 0
2018-06-15 11:28:48,751 : INFO : EPOCH 4 - PROGRESS: at 58.56% examples, 39714 words/s, in_qsize 0, out_qsize 0
2018-06-15 11:28:49,888 : INFO : EPOCH 4 - PROGRESS: at 59.19% examples, 39612 words/s, in_qsize 0, out_qsize 0
2018-06-15 11:28:50,990 : INFO : EPOCH 4 - PROGRESS: at 60.04% examples, 39621 words/s, in_qsize 0, out_qsize 0
2018-06-15 11:28:51,994 : INFO : EPOCH 4 - PROGRESS: at 60.74% examples, 39682 words/s, in_qsize 0, out_qsize 0
2018-06-15 11:28:53,167 : INFO : EPOCH 4 - PROGRESS: at 61.55% examples, 39663 words/s, in_qsize 0, out_qsize 0
2018-06-15 11:28:54,338 : INFO : EPOCH 4 - PROGRESS: at 62.24% examples, 39640 words/s, in_qsize 0, out_qsize 0
2018-06-15 11:28:55,447 : INFO : EPOCH 4 - PROGRESS: at 63.01% examples, 39643 words/s, in_qsize 0, out_

2018-06-15 11:30:00,367 : INFO : EPOCH 5 - PROGRESS: at 7.45% examples, 35821 words/s, in_qsize 0, out_qsize 0
2018-06-15 11:30:01,536 : INFO : EPOCH 5 - PROGRESS: at 8.25% examples, 35989 words/s, in_qsize 0, out_qsize 0
2018-06-15 11:30:02,600 : INFO : EPOCH 5 - PROGRESS: at 9.08% examples, 36454 words/s, in_qsize 0, out_qsize 0
2018-06-15 11:30:03,637 : INFO : EPOCH 5 - PROGRESS: at 9.86% examples, 36429 words/s, in_qsize 0, out_qsize 0
2018-06-15 11:30:04,708 : INFO : EPOCH 5 - PROGRESS: at 10.59% examples, 36307 words/s, in_qsize 0, out_qsize 0
2018-06-15 11:30:05,800 : INFO : EPOCH 5 - PROGRESS: at 11.31% examples, 36132 words/s, in_qsize 0, out_qsize 0
2018-06-15 11:30:06,874 : INFO : EPOCH 5 - PROGRESS: at 12.02% examples, 36064 words/s, in_qsize 0, out_qsize 0
2018-06-15 11:30:08,048 : INFO : EPOCH 5 - PROGRESS: at 12.87% examples, 36187 words/s, in_qsize 0, out_qsize 0
2018-06-15 11:30:09,113 : INFO : EPOCH 5 - PROGRESS: at 13.74% examples, 36488 words/s, in_qsize 0, out_qsiz

2018-06-15 11:31:23,117 : INFO : EPOCH 5 - PROGRESS: at 65.29% examples, 36256 words/s, in_qsize 0, out_qsize 0
2018-06-15 11:31:24,137 : INFO : EPOCH 5 - PROGRESS: at 66.07% examples, 36328 words/s, in_qsize 0, out_qsize 0
2018-06-15 11:31:25,196 : INFO : EPOCH 5 - PROGRESS: at 66.82% examples, 36387 words/s, in_qsize 0, out_qsize 0
2018-06-15 11:31:26,355 : INFO : EPOCH 5 - PROGRESS: at 67.71% examples, 36406 words/s, in_qsize 0, out_qsize 0
2018-06-15 11:31:27,378 : INFO : EPOCH 5 - PROGRESS: at 68.36% examples, 36328 words/s, in_qsize 0, out_qsize 0
2018-06-15 11:31:28,401 : INFO : EPOCH 5 - PROGRESS: at 69.11% examples, 36327 words/s, in_qsize 0, out_qsize 0
2018-06-15 11:31:29,548 : INFO : EPOCH 5 - PROGRESS: at 70.01% examples, 36354 words/s, in_qsize 0, out_qsize 0
2018-06-15 11:31:30,553 : INFO : EPOCH 5 - PROGRESS: at 70.72% examples, 36353 words/s, in_qsize 0, out_qsize 0
2018-06-15 11:31:31,602 : INFO : EPOCH 5 - PROGRESS: at 71.39% examples, 36336 words/s, in_qsize 0, out_

In [7]:
# test model: reload it from disk and spot-check a few similarities.
model = Word2Vec.load(os.path.join(DATA_DIR, FLASHCARD_MODEL))
# Similarity queries go through the KeyedVectors object (`model.wv`); the
# same-named methods on the model itself are deprecated in gensim 3.x.
print(model.wv.similarity("man", "woman"), model.wv.similarity("cat", "rock"))
print(model.wv.most_similar("exercise"))

2018-06-15 11:32:13,452 : INFO : loading Word2Vec object from ../data/comp_data\studystack.bin
2018-06-15 11:32:13,454 : DEBUG : {'kw': {}, 'uri': '../data/comp_data\\studystack.bin', 'mode': 'rb'}
2018-06-15 11:32:13,456 : DEBUG : encoding_wrapper: {'errors': 'strict', 'encoding': None, 'fileobj': <_io.BufferedReader name='../data/comp_data\\studystack.bin'>, 'mode': 'rb'}
2018-06-15 11:32:13,633 : INFO : loading vocabulary recursively from ../data/comp_data\studystack.bin.vocabulary.* with mmap=None
2018-06-15 11:32:13,634 : INFO : loading trainables recursively from ../data/comp_data\studystack.bin.trainables.* with mmap=None
2018-06-15 11:32:13,635 : INFO : loading syn1neg from ../data/comp_data\studystack.bin.trainables.syn1neg.npy with mmap=None
2018-06-15 11:32:13,677 : INFO : loading wv recursively from ../data/comp_data\studystack.bin.wv.* with mmap=None
2018-06-15 11:32:13,679 : INFO : loading vectors from ../data/comp_data\studystack.bin.wv.vectors.npy with mmap=None
2018-06

0.6837608749040395 -0.009654290710434115
[('sleep', 0.6886226534843445), ('cessation', 0.6727858185768127), ('smoking', 0.6664212346076965), ('appetite', 0.6603558659553528), ('monitoring', 0.6591395139694214), ('pregnancy', 0.6494423151016235), ('minimal', 0.6444201469421387), ('infusion', 0.6431697607040405), ('BP', 0.6409623622894287), ('tolerance', 0.6312829256057739)]
