In [36]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

In [2]:
import warnings; warnings.simplefilter('ignore')
import gensim as gen
import json
import codecs
import bleach
import re
import os.path
import sys
from optparse import OptionParser

import local_settings
import django
django.setup()

from sefaria.model import *



In [3]:
# Load the Hebrew stop-word list (one word per line) and compile one regex
# that matches any stop word standing alone between whitespace and/or the
# string boundaries.
with codecs.open('./hebrew_stopwords.txt', encoding='utf8') as stopwords_file:
    # splitlines() copes with \r\n endings; blank lines are dropped because an
    # empty alternative would make the regex match bare whitespace.
    stopwords = [word.strip() for word in stopwords_file.read().splitlines() if word.strip()]

# re.escape keeps a stray regex metacharacter in the word list from altering
# (or breaking) the pattern. The separator before the word is consumed by the
# match; the one after it is only a lookahead, so consecutive stop words can
# all be matched.
stopwords_regex = re.compile(
    u"(?:\\s|^)({})(?=\\s|$)".format(u"|".join(re.escape(word) for word in stopwords))
)

In [4]:
def remove_stopwords(data):
    """Strip dicta prefixes from `data`, then blank out every stop word.

    Each stop-word match (including the whitespace that precedes it, if any)
    is replaced by a single space.
    """
    without_prefixes = remove_dicta_prefix(data)
    return stopwords_regex.sub(u' ', without_prefixes)

def remove_dicta_prefix(data):
    """Delete every run of Hebrew letters that is immediately followed by '┉'.

    The run of letters (alef through tav, final forms included) and the '┉'
    marker itself are both removed; all other text is returned untouched.

    :param data: unicode text.
    :return: `data` without the marked prefixes.
    """
    # Plain u'' literal instead of ur'': the ur prefix is a SyntaxError on
    # Python 3, and this pattern has no backslashes other than the \u
    # escapes, so the compiled pattern is unchanged.
    return re.sub(u'[\u05d0-\u05ea]+┉', u'', data)

def remove_punctuation(data):
    """Reduce `data` to Hebrew words separated by spaces.

    The substitutions below run in order, and every match is replaced by a
    single space. Runs of spaces are not collapsed.

    :param data: unicode text.
    :return: text containing only spaces, Hebrew letters and word-internal
        quote marks (" ' and the Hebrew gershayim / geresh).
    """
    # The literals are plain u'' strings with doubled backslashes rather than
    # ur'' (a SyntaxError on Python 3); the regexes they compile to are the
    # same as before.
    patterns = (
        u'־',                        # maqaf (Hebrew hyphen) joining two words
        u'\\([^)]+\\)',              # parenthesised text
        u'<[^>]+>',                  # <...> markup tags
        u'\\[[^\\]]+\\]',            # bracketed text
        u'[^ \u05d0-\u05ea"\'״׳]',   # anything but space, Hebrew letters, quotes
        u'(^|\\s)["\'״׳]+',          # quote marks opening a word
        u'["\'״׳]+(\\s|$)',          # quote marks closing a word
    )
    for pattern in patterns:
        data = re.sub(pattern, u' ', data)
    return data

def get_segments(filename):
    """Read `filename` and return its cleaned text segments as a list.

    Lines that do not contain ``~~`` are skipped. For every other line, the
    field that follows the first ``~~`` is passed through `remove_stopwords`
    and then `remove_punctuation`, and the result is collected.

    The running line number is printed after every 10000th line as a progress
    indicator.

    Note: the whole result is built in memory; this is not a generator.

    :param filename: path of a UTF-8 encoded text file.
    :return: list of cleaned unicode strings, one per line containing ``~~``.
    """
    all_data = []
    # `with` closes the file deterministically, even if cleaning raises.
    with codecs.open(filename, encoding='utf8') as segments_file:
        for line_number, line in enumerate(segments_file, 1):
            data = line.strip()
            if u'~~' in data:
                data = data.split(u'~~')[1]
                data = remove_stopwords(data)
                data = remove_punctuation(data)
                all_data.append(data)
            # Report progress for skipped lines too; previously the `continue`
            # jumped over this check and some milestones were never printed.
            if line_number % 10000 == 0:
                print(line_number)
    return all_data

In [5]:
# Clean the whole export once up front; `segments` is the resulting list of strings.
SEGMENTS_PATH = './sefaria-export_prefix_refs.txt'
segments = get_segments(SEGMENTS_PATH)

10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000
310000
320000
330000
340000
350000
360000
370000
380000
390000
400000
410000
420000
430000
440000
450000
460000
470000
480000
490000
500000
510000
520000
530000
540000
550000
560000
570000
580000
590000
600000
610000
620000
630000
640000
650000
660000
670000
680000
690000
700000
710000
720000
730000
740000
750000
760000
770000
780000
790000
800000
810000
820000
830000
840000
850000
860000
870000
880000
890000
900000
910000
920000
930000
940000
950000
960000
970000
980000
990000
1000000
1010000
1020000
1030000
1040000
1050000
1060000
1080000
1090000
1100000
1110000
1120000
1130000
1140000
1150000
1160000
1170000
1180000
1200000
1210000
1220000
1230000
1240000
1250000
1260000
1270000
1280000
1290000
1300000
1310000
1320000
1330000
1340000
1350000
1360000
1370000
1380000
1390000
1400000
1410

Define a SegmentGenerator class and instantiate it to pass the tokenized segments to the model

In [None]:
class SegmentGenerator(object):
    """Iterable wrapper that yields each stored segment split on whitespace.

    A new iterator over `segments` is created on every `__iter__` call.
    """

    def __init__(self, segments):
        # Kept as given; tokenising is deferred until iteration.
        self.segments = segments

    def __iter__(self):
        return (segment.split() for segment in self.segments)

In [107]:
# Wrap the cleaned segments so they can be iterated once by build_vocab and
# again by train. (This used to call `SegmentGenerator2`, a name the notebook
# never defines, so it failed on a fresh kernel.)
segments_generator = SegmentGenerator(segments)

### Model initialization

In [130]:
# Hyper-parameters of the embedding, named so they are easy to find and tune.
EMBEDDING_SIZE = 100
CONTEXT_WINDOW = 5
model = gen.models.Word2Vec(size=EMBEDDING_SIZE, window=CONTEXT_WINDOW)

Now we have initialized the Word2Vec model. The next step is to call the **build_vocab** method for the preliminary scan of the text. Call the method and print how many words are in the vocabulary of our text.

In [131]:
# Scan the corpus once to collect the vocabulary, then report its size.
model.build_vocab(segments_generator)
vocabulary_size = len(model.wv.vocab)
print("Number of words in vocabulary: {}".format(vocabulary_size))

INFO collecting all words and their counts
INFO PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO PROGRESS: at sentence #10000, processed 580519 words, keeping 43834 word types
INFO PROGRESS: at sentence #20000, processed 1135080 words, keeping 61688 word types
INFO PROGRESS: at sentence #30000, processed 1696331 words, keeping 74536 word types
INFO PROGRESS: at sentence #40000, processed 2296109 words, keeping 85524 word types
INFO PROGRESS: at sentence #50000, processed 2858300 words, keeping 94787 word types
INFO PROGRESS: at sentence #60000, processed 3423883 words, keeping 102821 word types
INFO PROGRESS: at sentence #70000, processed 4002770 words, keeping 110462 word types
INFO PROGRESS: at sentence #80000, processed 4557728 words, keeping 116887 word types
INFO PROGRESS: at sentence #90000, processed 5144062 words, keeping 123187 word types
INFO PROGRESS: at sentence #100000, processed 5728072 words, keeping 129343 word types
INFO PROGRESS: at sentence #110

INFO PROGRESS: at sentence #940000, processed 53596639 words, keeping 348960 word types
INFO PROGRESS: at sentence #950000, processed 54166120 words, keeping 350665 word types
INFO PROGRESS: at sentence #960000, processed 54713911 words, keeping 352210 word types
INFO PROGRESS: at sentence #970000, processed 55273799 words, keeping 354012 word types
INFO PROGRESS: at sentence #980000, processed 55828377 words, keeping 355637 word types
INFO PROGRESS: at sentence #990000, processed 56387418 words, keeping 357335 word types
INFO PROGRESS: at sentence #1000000, processed 56945645 words, keeping 358946 word types
INFO PROGRESS: at sentence #1010000, processed 57510680 words, keeping 360552 word types
INFO PROGRESS: at sentence #1020000, processed 58057572 words, keeping 362136 word types
INFO PROGRESS: at sentence #1030000, processed 58630550 words, keeping 363871 word types
INFO PROGRESS: at sentence #1040000, processed 59199862 words, keeping 365599 word types
INFO PROGRESS: at sentence 

Number of words in vocabulary: 151694


In the previous step we created the vocabulary for our model; it is now time to train! Don't forget to pass the following parameters:
1. total_examples=model.corpus_count
2. epochs=model.epochs (the cell below passes an explicit `epochs=10` instead)

In [132]:
# Train for a fixed number of passes over the corpus. The value returned by
# train() is left as the last expression so it is shown as the cell output.
TRAINING_EPOCHS = 10
model.train(segments_generator, total_examples=model.corpus_count, epochs=TRAINING_EPOCHS)

INFO training model with 3 workers on 151694 vocabulary and 100 features, using sg=0 hs=0 sample=0.001 negative=5 window=5
INFO EPOCH 1 - PROGRESS: at 0.82% examples, 737275 words/s, in_qsize 6, out_qsize 0
INFO EPOCH 1 - PROGRESS: at 1.75% examples, 765303 words/s, in_qsize 5, out_qsize 0
INFO EPOCH 1 - PROGRESS: at 2.63% examples, 779533 words/s, in_qsize 6, out_qsize 0
INFO EPOCH 1 - PROGRESS: at 3.54% examples, 785190 words/s, in_qsize 6, out_qsize 0
INFO EPOCH 1 - PROGRESS: at 4.46% examples, 789723 words/s, in_qsize 6, out_qsize 0
INFO EPOCH 1 - PROGRESS: at 5.35% examples, 790797 words/s, in_qsize 6, out_qsize 0
INFO EPOCH 1 - PROGRESS: at 6.14% examples, 780231 words/s, in_qsize 5, out_qsize 0
INFO EPOCH 1 - PROGRESS: at 6.94% examples, 775213 words/s, in_qsize 5, out_qsize 0
INFO EPOCH 1 - PROGRESS: at 7.82% examples, 775775 words/s, in_qsize 6, out_qsize 1
INFO EPOCH 1 - PROGRESS: at 8.72% examples, 779011 words/s, in_qsize 5, out_qsize 0
INFO EPOCH 1 - PROGRESS: at 9.63% exa

INFO EPOCH 1 - PROGRESS: at 86.14% examples, 785452 words/s, in_qsize 6, out_qsize 1
INFO EPOCH 1 - PROGRESS: at 87.03% examples, 785658 words/s, in_qsize 4, out_qsize 0
INFO EPOCH 1 - PROGRESS: at 87.95% examples, 785775 words/s, in_qsize 5, out_qsize 0
INFO EPOCH 1 - PROGRESS: at 88.82% examples, 785787 words/s, in_qsize 6, out_qsize 0
INFO EPOCH 1 - PROGRESS: at 89.71% examples, 786047 words/s, in_qsize 6, out_qsize 0
INFO EPOCH 1 - PROGRESS: at 90.62% examples, 786138 words/s, in_qsize 5, out_qsize 0
INFO EPOCH 1 - PROGRESS: at 91.55% examples, 786408 words/s, in_qsize 4, out_qsize 0
INFO EPOCH 1 - PROGRESS: at 92.45% examples, 786540 words/s, in_qsize 4, out_qsize 0
INFO EPOCH 1 - PROGRESS: at 93.31% examples, 786652 words/s, in_qsize 5, out_qsize 0
INFO EPOCH 1 - PROGRESS: at 94.22% examples, 786745 words/s, in_qsize 6, out_qsize 0
INFO EPOCH 1 - PROGRESS: at 95.11% examples, 786898 words/s, in_qsize 5, out_qsize 0
INFO EPOCH 1 - PROGRESS: at 96.01% examples, 787111 words/s, in_q

INFO EPOCH 2 - PROGRESS: at 69.79% examples, 793181 words/s, in_qsize 5, out_qsize 1
INFO EPOCH 2 - PROGRESS: at 70.72% examples, 793445 words/s, in_qsize 5, out_qsize 0
INFO EPOCH 2 - PROGRESS: at 71.62% examples, 793500 words/s, in_qsize 5, out_qsize 0
INFO EPOCH 2 - PROGRESS: at 72.53% examples, 793486 words/s, in_qsize 5, out_qsize 0
INFO EPOCH 2 - PROGRESS: at 73.43% examples, 793479 words/s, in_qsize 5, out_qsize 0
INFO EPOCH 2 - PROGRESS: at 74.36% examples, 793711 words/s, in_qsize 5, out_qsize 1
INFO EPOCH 2 - PROGRESS: at 75.28% examples, 793691 words/s, in_qsize 6, out_qsize 0
INFO EPOCH 2 - PROGRESS: at 76.19% examples, 793749 words/s, in_qsize 5, out_qsize 0
INFO EPOCH 2 - PROGRESS: at 77.10% examples, 793859 words/s, in_qsize 5, out_qsize 0
INFO EPOCH 2 - PROGRESS: at 78.05% examples, 794035 words/s, in_qsize 5, out_qsize 0
INFO EPOCH 2 - PROGRESS: at 78.82% examples, 792577 words/s, in_qsize 6, out_qsize 0
INFO EPOCH 2 - PROGRESS: at 79.59% examples, 791343 words/s, in_q

INFO EPOCH 3 - PROGRESS: at 47.27% examples, 750975 words/s, in_qsize 6, out_qsize 0
INFO EPOCH 3 - PROGRESS: at 47.79% examples, 745677 words/s, in_qsize 5, out_qsize 0
INFO EPOCH 3 - PROGRESS: at 48.44% examples, 742644 words/s, in_qsize 6, out_qsize 0
INFO EPOCH 3 - PROGRESS: at 49.07% examples, 739560 words/s, in_qsize 6, out_qsize 0
INFO EPOCH 3 - PROGRESS: at 49.90% examples, 739196 words/s, in_qsize 6, out_qsize 0
INFO EPOCH 3 - PROGRESS: at 50.64% examples, 737913 words/s, in_qsize 6, out_qsize 0
INFO EPOCH 3 - PROGRESS: at 51.48% examples, 737953 words/s, in_qsize 6, out_qsize 0
INFO EPOCH 3 - PROGRESS: at 52.34% examples, 738916 words/s, in_qsize 6, out_qsize 1
INFO EPOCH 3 - PROGRESS: at 53.26% examples, 739909 words/s, in_qsize 6, out_qsize 0
INFO EPOCH 3 - PROGRESS: at 54.17% examples, 740728 words/s, in_qsize 5, out_qsize 0
INFO EPOCH 3 - PROGRESS: at 55.08% examples, 741571 words/s, in_qsize 6, out_qsize 0
INFO EPOCH 3 - PROGRESS: at 55.99% examples, 742498 words/s, in_q

INFO EPOCH 4 - PROGRESS: at 18.42% examples, 713897 words/s, in_qsize 6, out_qsize 0
INFO EPOCH 4 - PROGRESS: at 19.20% examples, 714040 words/s, in_qsize 6, out_qsize 0
INFO EPOCH 4 - PROGRESS: at 20.01% examples, 714175 words/s, in_qsize 5, out_qsize 0
INFO EPOCH 4 - PROGRESS: at 20.86% examples, 715968 words/s, in_qsize 6, out_qsize 0
INFO EPOCH 4 - PROGRESS: at 21.69% examples, 715907 words/s, in_qsize 6, out_qsize 0
INFO EPOCH 4 - PROGRESS: at 22.52% examples, 717242 words/s, in_qsize 5, out_qsize 1
INFO EPOCH 4 - PROGRESS: at 23.43% examples, 719572 words/s, in_qsize 5, out_qsize 0
INFO EPOCH 4 - PROGRESS: at 24.30% examples, 721129 words/s, in_qsize 6, out_qsize 0
INFO EPOCH 4 - PROGRESS: at 25.13% examples, 721494 words/s, in_qsize 5, out_qsize 0
INFO EPOCH 4 - PROGRESS: at 25.94% examples, 721647 words/s, in_qsize 6, out_qsize 0
INFO EPOCH 4 - PROGRESS: at 26.82% examples, 723111 words/s, in_qsize 5, out_qsize 0
INFO EPOCH 4 - PROGRESS: at 27.54% examples, 719576 words/s, in_q

INFO EPOCH 4 - PROGRESS: at 98.38% examples, 725657 words/s, in_qsize 6, out_qsize 0
INFO EPOCH 4 - PROGRESS: at 99.19% examples, 725777 words/s, in_qsize 6, out_qsize 0
INFO worker thread finished; awaiting finish of 2 more threads
INFO worker thread finished; awaiting finish of 1 more threads
INFO worker thread finished; awaiting finish of 0 more threads
INFO EPOCH - 4 : training on 92128950 raw words (89312140 effective words) took 123.0s, 725826 effective words/s
INFO EPOCH 5 - PROGRESS: at 0.80% examples, 713009 words/s, in_qsize 6, out_qsize 0
INFO EPOCH 5 - PROGRESS: at 1.69% examples, 741431 words/s, in_qsize 5, out_qsize 0
INFO EPOCH 5 - PROGRESS: at 2.51% examples, 745916 words/s, in_qsize 5, out_qsize 1
INFO EPOCH 5 - PROGRESS: at 3.39% examples, 752993 words/s, in_qsize 6, out_qsize 0
INFO EPOCH 5 - PROGRESS: at 4.22% examples, 750427 words/s, in_qsize 4, out_qsize 2
INFO EPOCH 5 - PROGRESS: at 5.14% examples, 760170 words/s, in_qsize 6, out_qsize 0
INFO EPOCH 5 - PROGRESS:

INFO EPOCH 5 - PROGRESS: at 79.28% examples, 762423 words/s, in_qsize 5, out_qsize 0
INFO EPOCH 5 - PROGRESS: at 80.19% examples, 762806 words/s, in_qsize 5, out_qsize 0
INFO EPOCH 5 - PROGRESS: at 81.10% examples, 763132 words/s, in_qsize 5, out_qsize 0
INFO EPOCH 5 - PROGRESS: at 81.84% examples, 762181 words/s, in_qsize 6, out_qsize 0
INFO EPOCH 5 - PROGRESS: at 82.44% examples, 759748 words/s, in_qsize 6, out_qsize 0
INFO EPOCH 5 - PROGRESS: at 83.17% examples, 758441 words/s, in_qsize 6, out_qsize 0
INFO EPOCH 5 - PROGRESS: at 83.97% examples, 758001 words/s, in_qsize 6, out_qsize 0
INFO EPOCH 5 - PROGRESS: at 84.63% examples, 756224 words/s, in_qsize 6, out_qsize 0
INFO EPOCH 5 - PROGRESS: at 85.31% examples, 754654 words/s, in_qsize 5, out_qsize 1
INFO EPOCH 5 - PROGRESS: at 85.91% examples, 752340 words/s, in_qsize 6, out_qsize 0
INFO EPOCH 5 - PROGRESS: at 86.51% examples, 750222 words/s, in_qsize 5, out_qsize 0
INFO EPOCH 5 - PROGRESS: at 86.99% examples, 747095 words/s, in_q

INFO EPOCH 6 - PROGRESS: at 46.24% examples, 639753 words/s, in_qsize 6, out_qsize 1
INFO EPOCH 6 - PROGRESS: at 47.06% examples, 641522 words/s, in_qsize 5, out_qsize 0
INFO EPOCH 6 - PROGRESS: at 47.87% examples, 642467 words/s, in_qsize 6, out_qsize 0
INFO EPOCH 6 - PROGRESS: at 48.78% examples, 644705 words/s, in_qsize 5, out_qsize 0
INFO EPOCH 6 - PROGRESS: at 49.66% examples, 646733 words/s, in_qsize 6, out_qsize 0
INFO EPOCH 6 - PROGRESS: at 50.54% examples, 648426 words/s, in_qsize 6, out_qsize 0
INFO EPOCH 6 - PROGRESS: at 51.37% examples, 649760 words/s, in_qsize 5, out_qsize 0
INFO EPOCH 6 - PROGRESS: at 52.21% examples, 651458 words/s, in_qsize 5, out_qsize 0
INFO EPOCH 6 - PROGRESS: at 53.10% examples, 653262 words/s, in_qsize 5, out_qsize 0
INFO EPOCH 6 - PROGRESS: at 53.94% examples, 654514 words/s, in_qsize 6, out_qsize 1
INFO EPOCH 6 - PROGRESS: at 54.83% examples, 655953 words/s, in_qsize 5, out_qsize 0
INFO EPOCH 6 - PROGRESS: at 55.69% examples, 657336 words/s, in_q

INFO EPOCH 7 - PROGRESS: at 26.26% examples, 754159 words/s, in_qsize 5, out_qsize 0
INFO EPOCH 7 - PROGRESS: at 27.14% examples, 754601 words/s, in_qsize 5, out_qsize 0
INFO EPOCH 7 - PROGRESS: at 28.03% examples, 755750 words/s, in_qsize 5, out_qsize 0
INFO EPOCH 7 - PROGRESS: at 28.82% examples, 754628 words/s, in_qsize 6, out_qsize 0
INFO EPOCH 7 - PROGRESS: at 29.64% examples, 754080 words/s, in_qsize 6, out_qsize 0
INFO EPOCH 7 - PROGRESS: at 30.47% examples, 753163 words/s, in_qsize 4, out_qsize 0
INFO EPOCH 7 - PROGRESS: at 31.32% examples, 752890 words/s, in_qsize 5, out_qsize 1
INFO EPOCH 7 - PROGRESS: at 32.20% examples, 753640 words/s, in_qsize 5, out_qsize 0
INFO EPOCH 7 - PROGRESS: at 33.05% examples, 753765 words/s, in_qsize 5, out_qsize 0
INFO EPOCH 7 - PROGRESS: at 33.90% examples, 753509 words/s, in_qsize 6, out_qsize 0
INFO EPOCH 7 - PROGRESS: at 34.76% examples, 753260 words/s, in_qsize 5, out_qsize 0
INFO EPOCH 7 - PROGRESS: at 35.59% examples, 752928 words/s, in_q

INFO worker thread finished; awaiting finish of 0 more threads
INFO EPOCH - 7 : training on 92128950 raw words (89312577 effective words) took 126.9s, 704066 effective words/s
INFO EPOCH 8 - PROGRESS: at 0.87% examples, 781525 words/s, in_qsize 6, out_qsize 0
INFO EPOCH 8 - PROGRESS: at 1.68% examples, 739592 words/s, in_qsize 5, out_qsize 0
INFO EPOCH 8 - PROGRESS: at 2.56% examples, 761254 words/s, in_qsize 5, out_qsize 0
INFO EPOCH 8 - PROGRESS: at 3.46% examples, 769570 words/s, in_qsize 5, out_qsize 0
INFO EPOCH 8 - PROGRESS: at 4.36% examples, 777756 words/s, in_qsize 6, out_qsize 0
INFO EPOCH 8 - PROGRESS: at 5.26% examples, 781917 words/s, in_qsize 6, out_qsize 0
INFO EPOCH 8 - PROGRESS: at 6.14% examples, 782044 words/s, in_qsize 6, out_qsize 0
INFO EPOCH 8 - PROGRESS: at 6.93% examples, 773614 words/s, in_qsize 5, out_qsize 0
INFO EPOCH 8 - PROGRESS: at 7.83% examples, 777145 words/s, in_qsize 5, out_qsize 0
INFO EPOCH 8 - PROGRESS: at 8.73% examples, 780582 words/s, in_qsize

INFO EPOCH 8 - PROGRESS: at 83.70% examples, 771598 words/s, in_qsize 6, out_qsize 0
INFO EPOCH 8 - PROGRESS: at 84.60% examples, 771817 words/s, in_qsize 5, out_qsize 0
INFO EPOCH 8 - PROGRESS: at 85.51% examples, 772055 words/s, in_qsize 6, out_qsize 1
INFO EPOCH 8 - PROGRESS: at 86.43% examples, 772382 words/s, in_qsize 6, out_qsize 0
INFO EPOCH 8 - PROGRESS: at 87.35% examples, 772693 words/s, in_qsize 6, out_qsize 0
INFO EPOCH 8 - PROGRESS: at 88.26% examples, 772988 words/s, in_qsize 6, out_qsize 0
INFO EPOCH 8 - PROGRESS: at 89.13% examples, 773241 words/s, in_qsize 6, out_qsize 0
INFO EPOCH 8 - PROGRESS: at 90.04% examples, 773493 words/s, in_qsize 6, out_qsize 1
INFO EPOCH 8 - PROGRESS: at 90.95% examples, 773862 words/s, in_qsize 5, out_qsize 0
INFO EPOCH 8 - PROGRESS: at 91.86% examples, 774058 words/s, in_qsize 5, out_qsize 0
INFO EPOCH 8 - PROGRESS: at 92.76% examples, 774362 words/s, in_qsize 5, out_qsize 0
INFO EPOCH 8 - PROGRESS: at 93.64% examples, 774639 words/s, in_q

INFO EPOCH 9 - PROGRESS: at 65.36% examples, 772558 words/s, in_qsize 6, out_qsize 0
INFO EPOCH 9 - PROGRESS: at 66.12% examples, 771518 words/s, in_qsize 6, out_qsize 2
INFO EPOCH 9 - PROGRESS: at 66.90% examples, 770250 words/s, in_qsize 5, out_qsize 0
INFO EPOCH 9 - PROGRESS: at 67.72% examples, 769487 words/s, in_qsize 6, out_qsize 0
INFO EPOCH 9 - PROGRESS: at 68.61% examples, 769575 words/s, in_qsize 5, out_qsize 0
INFO EPOCH 9 - PROGRESS: at 69.48% examples, 769668 words/s, in_qsize 6, out_qsize 0
INFO EPOCH 9 - PROGRESS: at 70.25% examples, 768678 words/s, in_qsize 5, out_qsize 0
INFO EPOCH 9 - PROGRESS: at 71.09% examples, 768369 words/s, in_qsize 6, out_qsize 1
INFO EPOCH 9 - PROGRESS: at 71.99% examples, 768718 words/s, in_qsize 5, out_qsize 0
INFO EPOCH 9 - PROGRESS: at 72.94% examples, 769112 words/s, in_qsize 6, out_qsize 1
INFO EPOCH 9 - PROGRESS: at 73.83% examples, 769479 words/s, in_qsize 5, out_qsize 0
INFO EPOCH 9 - PROGRESS: at 74.76% examples, 769854 words/s, in_q

INFO EPOCH 10 - PROGRESS: at 40.27% examples, 662406 words/s, in_qsize 6, out_qsize 0
INFO EPOCH 10 - PROGRESS: at 41.07% examples, 663511 words/s, in_qsize 6, out_qsize 0
INFO EPOCH 10 - PROGRESS: at 41.81% examples, 663192 words/s, in_qsize 6, out_qsize 1
INFO EPOCH 10 - PROGRESS: at 42.67% examples, 665198 words/s, in_qsize 6, out_qsize 0
INFO EPOCH 10 - PROGRESS: at 43.55% examples, 667219 words/s, in_qsize 5, out_qsize 0
INFO EPOCH 10 - PROGRESS: at 44.32% examples, 667467 words/s, in_qsize 6, out_qsize 0
INFO EPOCH 10 - PROGRESS: at 45.05% examples, 667139 words/s, in_qsize 5, out_qsize 0
INFO EPOCH 10 - PROGRESS: at 45.84% examples, 667659 words/s, in_qsize 5, out_qsize 0
INFO EPOCH 10 - PROGRESS: at 46.66% examples, 669180 words/s, in_qsize 5, out_qsize 1
INFO EPOCH 10 - PROGRESS: at 47.53% examples, 670440 words/s, in_qsize 6, out_qsize 1
INFO EPOCH 10 - PROGRESS: at 48.44% examples, 672317 words/s, in_qsize 6, out_qsize 0
INFO EPOCH 10 - PROGRESS: at 49.24% examples, 672658 w

(893107185, 921289500)

Test a few words

In [181]:
# Look up the 30 nearest neighbours of the probe word. The query goes through
# `model.wv`, matching the `model.wv.vocab` access used earlier, because the
# model-level `similar_by_word` shortcut is deprecated in gensim.
WORD = u'לב'
x = model.wv.similar_by_word(WORD, topn=30)

In [182]:
# Each entry of `x` is a (word, similarity) pair; print them space-separated.
# A single formatted string passed to print(...) behaves the same under the
# Python 2 print statement and the Python 3 print function.
for word, similarity in x:
    print(u'{} {}'.format(word, similarity))

לבם 0.71956217289
לבו 0.707663297653
הלב 0.697804331779
לבבם 0.692519307137
לבב 0.665616512299
לבות 0.660210371017
לבבו 0.652334213257
נדיבות 0.601741909981
לבך 0.590670347214
בבו 0.576765835285
ליבו 0.562494695187
אבירות 0.551939070225
לב" 0.551247298717
לבבות 0.548554062843
נדיבת 0.547027647495
לבנו 0.537680268288
הש"י 0.536782741547
לבינו 0.530523300171
לבותם 0.528192520142
מעייניו 0.526637136936
יתעורר 0.524576187134
נדכא 0.52455508709
שכלו 0.522397756577
משמחים 0.522246837616
שי"ת 0.520228743553
קרבם 0.520219445229
לבבינו 0.51971924305
תשוקה 0.518667161465
יועצות 0.517000079155
עצבונו 0.516375780106


In [177]:
# Analogy-style query: words close to the two `positive` words and far from
# the `negative` one. It goes through `model.wv`, matching the earlier
# `model.wv.vocab` access, because the model-level `most_similar` shortcut is
# deprecated in gensim.
c = model.wv.most_similar(positive=[u'לחם', u'תפוח'], negative=[u'המוציא'])

In [178]:
# Each entry of `c` is a (word, similarity) pair; print them space-separated.
# A single formatted string passed to print(...) behaves the same under the
# Python 2 print statement and the Python 3 print function.
for word, similarity in c:
    print(u'{} {}'.format(word, similarity))

כבשים 0.494202077389
תולעת 0.468954205513
מקריבין 0.461211115122
עשב 0.449172765017
תחשים 0.446223586798
צבאים 0.441080927849
ללבושך 0.440273523331
שליו 0.43968296051
כבש 0.435782253742
כבשי' 0.433503329754
