We will first start with the necessary imports. For this project, we will need NLTK (for nlp), Gensim (for Word2Vec), SkLearn, Pandas, and Numby (for data structures and processing).

In [1]:
import os;
import re;
import nltk.data;
import logging;
from gensim.models import word2vec;
from sklearn.cluster import KMeans;
from sklearn.neighbors import KDTree;
import pandas as pd;
import numpy as np;
import sqlite3;



From NLTK, we need to download the package "Punkt", which contains a module for obtaining sentences from a text. The package needs to be downloaded first.

In [2]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\chawlar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
sql_con = sqlite3.connect('input/reddit-comments-may-2015/database.sqlite');

In [4]:
row_limit = 10000000;

In [5]:
sql_data = pd.read_sql("SELECT body FROM May2015 LIMIT " + str(row_limit), sql_con);

In [6]:
comments_arr = [sql_data.iloc[idx]['body'] for idx in range(len(sql_data))];

In [7]:
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle');

In [8]:
def clean_text(content_arr):
    cleaned_arr = [];
    
    for pos, val in enumerate(content_arr):
        no_tabs = str(val).replace('\t', ' ').replace('\n', '');
        
        correct_apos = re.sub("\’", "\'", no_tabs);
        alphas_only = re.sub("[^a-zA-Z\'\.]", " ", correct_apos);
        multi_spaces = re.sub(" +", " ", alphas_only);
        
        clean_text = multi_spaces.strip();
        
        sentences = tokenizer.tokenize(clean_text);
        sentences = [re.sub("[\.]", "", sentence).split() for sentence in sentences];
        
        if len(clean_text) > 0 and clean_text.count(' ') > 0:
            cleaned_arr.extend(sentences);
        
    return cleaned_arr;

In [9]:
clean_comments = clean_text(comments_arr)

In [10]:
len(clean_comments)

19827235

In [11]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',\
    level=logging.INFO)

# Set values for various parameters
num_features = 100    # Word vector dimensionality                      
min_word_count = 40   # Minimum word count                        
num_workers = 8       # Number of threads to run in parallel
context = 5          # Context window size                                                                                    
downsampling = 1e-3   # Downsample setting for frequent words

# Initialize and train the model

print("Training model...");
model = word2vec.Word2Vec(clean_comments, workers=num_workers, \
            size=num_features, min_count = min_word_count, \
            window = context, sample = downsampling);

# We don't plan on training the model any further, so calling 
# init_sims will make the model more memory-efficient.
model.init_sims(replace=True);

model_name = "100features_40minwords_5context_reddit";
model.save(model_name);

2017-08-31 12:24:46,793 : INFO : collecting all words and their counts
2017-08-31 12:24:46,795 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-08-31 12:24:46,957 : INFO : PROGRESS: at sentence #10000, processed 142673 words, keeping 18815 word types


Training model...


2017-08-31 12:24:47,094 : INFO : PROGRESS: at sentence #20000, processed 287102 words, keeping 29308 word types
2017-08-31 12:24:47,231 : INFO : PROGRESS: at sentence #30000, processed 429872 words, keeping 38124 word types
2017-08-31 12:24:47,386 : INFO : PROGRESS: at sentence #40000, processed 570424 words, keeping 45614 word types
2017-08-31 12:24:47,489 : INFO : PROGRESS: at sentence #50000, processed 709642 words, keeping 52561 word types
2017-08-31 12:24:47,627 : INFO : PROGRESS: at sentence #60000, processed 848223 words, keeping 58451 word types
2017-08-31 12:24:47,760 : INFO : PROGRESS: at sentence #70000, processed 981693 words, keeping 64154 word types
2017-08-31 12:24:47,902 : INFO : PROGRESS: at sentence #80000, processed 1124103 words, keeping 69618 word types
2017-08-31 12:24:48,038 : INFO : PROGRESS: at sentence #90000, processed 1264581 words, keeping 74569 word types
2017-08-31 12:24:48,182 : INFO : PROGRESS: at sentence #100000, processed 1406636 words, keeping 79515

2017-08-31 12:24:53,950 : INFO : PROGRESS: at sentence #740000, processed 10309792 words, keeping 271923 word types
2017-08-31 12:24:54,010 : INFO : PROGRESS: at sentence #750000, processed 10455917 words, keeping 274401 word types
2017-08-31 12:24:54,076 : INFO : PROGRESS: at sentence #760000, processed 10602818 words, keeping 276921 word types
2017-08-31 12:24:54,144 : INFO : PROGRESS: at sentence #770000, processed 10746200 words, keeping 279168 word types
2017-08-31 12:24:54,208 : INFO : PROGRESS: at sentence #780000, processed 10890296 words, keeping 281770 word types
2017-08-31 12:24:54,270 : INFO : PROGRESS: at sentence #790000, processed 11030161 words, keeping 284011 word types
2017-08-31 12:24:54,334 : INFO : PROGRESS: at sentence #800000, processed 11169390 words, keeping 286347 word types
2017-08-31 12:24:54,389 : INFO : PROGRESS: at sentence #810000, processed 11311767 words, keeping 288635 word types
2017-08-31 12:24:54,461 : INFO : PROGRESS: at sentence #820000, processe

2017-08-31 12:24:58,866 : INFO : PROGRESS: at sentence #1450000, processed 21155958 words, keeping 457614 word types
2017-08-31 12:24:58,946 : INFO : PROGRESS: at sentence #1460000, processed 21304060 words, keeping 459902 word types
2017-08-31 12:24:59,022 : INFO : PROGRESS: at sentence #1470000, processed 21461439 words, keeping 462414 word types
2017-08-31 12:24:59,089 : INFO : PROGRESS: at sentence #1480000, processed 21616915 words, keeping 464914 word types
2017-08-31 12:24:59,173 : INFO : PROGRESS: at sentence #1490000, processed 21770992 words, keeping 467229 word types
2017-08-31 12:24:59,232 : INFO : PROGRESS: at sentence #1500000, processed 21922618 words, keeping 469845 word types
2017-08-31 12:24:59,306 : INFO : PROGRESS: at sentence #1510000, processed 22081322 words, keeping 472410 word types
2017-08-31 12:24:59,371 : INFO : PROGRESS: at sentence #1520000, processed 22233405 words, keeping 474622 word types
2017-08-31 12:24:59,438 : INFO : PROGRESS: at sentence #1530000,

2017-08-31 12:25:05,101 : INFO : PROGRESS: at sentence #2160000, processed 32174888 words, keeping 609215 word types
2017-08-31 12:25:05,176 : INFO : PROGRESS: at sentence #2170000, processed 32326488 words, keeping 610942 word types
2017-08-31 12:25:05,240 : INFO : PROGRESS: at sentence #2180000, processed 32482338 words, keeping 612954 word types
2017-08-31 12:25:05,311 : INFO : PROGRESS: at sentence #2190000, processed 32637339 words, keeping 614966 word types
2017-08-31 12:25:05,375 : INFO : PROGRESS: at sentence #2200000, processed 32791465 words, keeping 616928 word types
2017-08-31 12:25:05,452 : INFO : PROGRESS: at sentence #2210000, processed 32944093 words, keeping 618717 word types
2017-08-31 12:25:05,529 : INFO : PROGRESS: at sentence #2220000, processed 33096915 words, keeping 620610 word types
2017-08-31 12:25:05,597 : INFO : PROGRESS: at sentence #2230000, processed 33252630 words, keeping 622615 word types
2017-08-31 12:25:05,667 : INFO : PROGRESS: at sentence #2240000,

2017-08-31 12:25:10,996 : INFO : PROGRESS: at sentence #2870000, processed 43126123 words, keeping 742334 word types
2017-08-31 12:25:11,060 : INFO : PROGRESS: at sentence #2880000, processed 43278973 words, keeping 744231 word types
2017-08-31 12:25:11,137 : INFO : PROGRESS: at sentence #2890000, processed 43431575 words, keeping 746072 word types
2017-08-31 12:25:11,199 : INFO : PROGRESS: at sentence #2900000, processed 43582082 words, keeping 747845 word types
2017-08-31 12:25:11,268 : INFO : PROGRESS: at sentence #2910000, processed 43731617 words, keeping 749517 word types
2017-08-31 12:25:11,350 : INFO : PROGRESS: at sentence #2920000, processed 43886029 words, keeping 751320 word types
2017-08-31 12:25:11,421 : INFO : PROGRESS: at sentence #2930000, processed 44038582 words, keeping 753296 word types
2017-08-31 12:25:11,502 : INFO : PROGRESS: at sentence #2940000, processed 44192481 words, keeping 755106 word types
2017-08-31 12:25:11,576 : INFO : PROGRESS: at sentence #2950000,

2017-08-31 12:25:16,693 : INFO : PROGRESS: at sentence #3580000, processed 53843376 words, keeping 867510 word types
2017-08-31 12:25:16,764 : INFO : PROGRESS: at sentence #3590000, processed 53991419 words, keeping 869643 word types
2017-08-31 12:25:16,840 : INFO : PROGRESS: at sentence #3600000, processed 54139454 words, keeping 871367 word types
2017-08-31 12:25:16,924 : INFO : PROGRESS: at sentence #3610000, processed 54281073 words, keeping 873012 word types
2017-08-31 12:25:16,990 : INFO : PROGRESS: at sentence #3620000, processed 54426783 words, keeping 874602 word types
2017-08-31 12:25:17,058 : INFO : PROGRESS: at sentence #3630000, processed 54570539 words, keeping 876069 word types
2017-08-31 12:25:17,187 : INFO : PROGRESS: at sentence #3640000, processed 54713940 words, keeping 877775 word types
2017-08-31 12:25:17,347 : INFO : PROGRESS: at sentence #3650000, processed 54859583 words, keeping 879384 word types
2017-08-31 12:25:17,425 : INFO : PROGRESS: at sentence #3660000,

2017-08-31 12:25:21,912 : INFO : PROGRESS: at sentence #4290000, processed 63857431 words, keeping 977629 word types
2017-08-31 12:25:21,976 : INFO : PROGRESS: at sentence #4300000, processed 64004222 words, keeping 979258 word types
2017-08-31 12:25:22,050 : INFO : PROGRESS: at sentence #4310000, processed 64147810 words, keeping 980573 word types
2017-08-31 12:25:22,112 : INFO : PROGRESS: at sentence #4320000, processed 64296965 words, keeping 982248 word types
2017-08-31 12:25:22,175 : INFO : PROGRESS: at sentence #4330000, processed 64443891 words, keeping 983919 word types
2017-08-31 12:25:22,244 : INFO : PROGRESS: at sentence #4340000, processed 64590967 words, keeping 985655 word types
2017-08-31 12:25:22,313 : INFO : PROGRESS: at sentence #4350000, processed 64737966 words, keeping 987192 word types
2017-08-31 12:25:22,375 : INFO : PROGRESS: at sentence #4360000, processed 64883305 words, keeping 988793 word types
2017-08-31 12:25:22,446 : INFO : PROGRESS: at sentence #4370000,

2017-08-31 12:25:27,588 : INFO : PROGRESS: at sentence #4980000, processed 74377205 words, keeping 1101576 word types
2017-08-31 12:25:27,677 : INFO : PROGRESS: at sentence #4990000, processed 74528266 words, keeping 1103261 word types
2017-08-31 12:25:27,774 : INFO : PROGRESS: at sentence #5000000, processed 74681901 words, keeping 1104961 word types
2017-08-31 12:25:27,864 : INFO : PROGRESS: at sentence #5010000, processed 74839502 words, keeping 1106621 word types
2017-08-31 12:25:27,957 : INFO : PROGRESS: at sentence #5020000, processed 74989008 words, keeping 1108303 word types
2017-08-31 12:25:28,047 : INFO : PROGRESS: at sentence #5030000, processed 75138452 words, keeping 1109880 word types
2017-08-31 12:25:28,132 : INFO : PROGRESS: at sentence #5040000, processed 75288084 words, keeping 1111593 word types
2017-08-31 12:25:28,205 : INFO : PROGRESS: at sentence #5050000, processed 75438821 words, keeping 1113162 word types
2017-08-31 12:25:28,270 : INFO : PROGRESS: at sentence #

2017-08-31 12:25:32,906 : INFO : PROGRESS: at sentence #5680000, processed 84870246 words, keeping 1212661 word types
2017-08-31 12:25:32,988 : INFO : PROGRESS: at sentence #5690000, processed 85017979 words, keeping 1214213 word types
2017-08-31 12:25:33,060 : INFO : PROGRESS: at sentence #5700000, processed 85163474 words, keeping 1215651 word types
2017-08-31 12:25:33,141 : INFO : PROGRESS: at sentence #5710000, processed 85310509 words, keeping 1217302 word types
2017-08-31 12:25:33,319 : INFO : PROGRESS: at sentence #5720000, processed 85459209 words, keeping 1218867 word types
2017-08-31 12:25:33,409 : INFO : PROGRESS: at sentence #5730000, processed 85605407 words, keeping 1220273 word types
2017-08-31 12:25:33,499 : INFO : PROGRESS: at sentence #5740000, processed 85750253 words, keeping 1221844 word types
2017-08-31 12:25:33,595 : INFO : PROGRESS: at sentence #5750000, processed 85898802 words, keeping 1223392 word types
2017-08-31 12:25:33,681 : INFO : PROGRESS: at sentence #

2017-08-31 12:25:38,671 : INFO : PROGRESS: at sentence #6370000, processed 95115353 words, keeping 1316839 word types
2017-08-31 12:25:38,787 : INFO : PROGRESS: at sentence #6380000, processed 95264490 words, keeping 1318296 word types
2017-08-31 12:25:38,883 : INFO : PROGRESS: at sentence #6390000, processed 95412229 words, keeping 1319696 word types
2017-08-31 12:25:38,981 : INFO : PROGRESS: at sentence #6400000, processed 95558964 words, keeping 1321141 word types
2017-08-31 12:25:39,057 : INFO : PROGRESS: at sentence #6410000, processed 95705292 words, keeping 1322565 word types
2017-08-31 12:25:39,149 : INFO : PROGRESS: at sentence #6420000, processed 95851925 words, keeping 1323979 word types
2017-08-31 12:25:39,219 : INFO : PROGRESS: at sentence #6430000, processed 96001179 words, keeping 1325605 word types
2017-08-31 12:25:39,292 : INFO : PROGRESS: at sentence #6440000, processed 96155192 words, keeping 1327090 word types
2017-08-31 12:25:39,374 : INFO : PROGRESS: at sentence #

2017-08-31 12:25:44,484 : INFO : PROGRESS: at sentence #7070000, processed 105239299 words, keeping 1415401 word types
2017-08-31 12:25:44,578 : INFO : PROGRESS: at sentence #7080000, processed 105382529 words, keeping 1416726 word types
2017-08-31 12:25:44,661 : INFO : PROGRESS: at sentence #7090000, processed 105523083 words, keeping 1417913 word types
2017-08-31 12:25:44,754 : INFO : PROGRESS: at sentence #7100000, processed 105664238 words, keeping 1419170 word types
2017-08-31 12:25:44,843 : INFO : PROGRESS: at sentence #7110000, processed 105805557 words, keeping 1420542 word types
2017-08-31 12:25:44,937 : INFO : PROGRESS: at sentence #7120000, processed 105945476 words, keeping 1421817 word types
2017-08-31 12:25:45,030 : INFO : PROGRESS: at sentence #7130000, processed 106089981 words, keeping 1423164 word types
2017-08-31 12:25:45,126 : INFO : PROGRESS: at sentence #7140000, processed 106234279 words, keeping 1424470 word types
2017-08-31 12:25:45,203 : INFO : PROGRESS: at se

2017-08-31 12:25:50,055 : INFO : PROGRESS: at sentence #7760000, processed 115582937 words, keeping 1517291 word types
2017-08-31 12:25:50,133 : INFO : PROGRESS: at sentence #7770000, processed 115741066 words, keeping 1518905 word types
2017-08-31 12:25:50,210 : INFO : PROGRESS: at sentence #7780000, processed 115899623 words, keeping 1520633 word types
2017-08-31 12:25:50,287 : INFO : PROGRESS: at sentence #7790000, processed 116058807 words, keeping 1522489 word types
2017-08-31 12:25:50,365 : INFO : PROGRESS: at sentence #7800000, processed 116216359 words, keeping 1524017 word types
2017-08-31 12:25:50,448 : INFO : PROGRESS: at sentence #7810000, processed 116369188 words, keeping 1525644 word types
2017-08-31 12:25:50,526 : INFO : PROGRESS: at sentence #7820000, processed 116525729 words, keeping 1527204 word types
2017-08-31 12:25:50,604 : INFO : PROGRESS: at sentence #7830000, processed 116679408 words, keeping 1528846 word types
2017-08-31 12:25:50,681 : INFO : PROGRESS: at se

2017-08-31 12:25:56,126 : INFO : PROGRESS: at sentence #8450000, processed 126147646 words, keeping 1617488 word types
2017-08-31 12:25:56,221 : INFO : PROGRESS: at sentence #8460000, processed 126296909 words, keeping 1618866 word types
2017-08-31 12:25:56,315 : INFO : PROGRESS: at sentence #8470000, processed 126440666 words, keeping 1620209 word types
2017-08-31 12:25:56,410 : INFO : PROGRESS: at sentence #8480000, processed 126591524 words, keeping 1621574 word types
2017-08-31 12:25:56,499 : INFO : PROGRESS: at sentence #8490000, processed 126743698 words, keeping 1622998 word types
2017-08-31 12:25:56,594 : INFO : PROGRESS: at sentence #8500000, processed 126895969 words, keeping 1624344 word types
2017-08-31 12:25:56,690 : INFO : PROGRESS: at sentence #8510000, processed 127047146 words, keeping 1625639 word types
2017-08-31 12:25:56,781 : INFO : PROGRESS: at sentence #8520000, processed 127199499 words, keeping 1627017 word types
2017-08-31 12:25:56,871 : INFO : PROGRESS: at se

2017-08-31 12:26:02,040 : INFO : PROGRESS: at sentence #9140000, processed 136521075 words, keeping 1713013 word types
2017-08-31 12:26:02,118 : INFO : PROGRESS: at sentence #9150000, processed 136675556 words, keeping 1714377 word types
2017-08-31 12:26:02,193 : INFO : PROGRESS: at sentence #9160000, processed 136822512 words, keeping 1715690 word types
2017-08-31 12:26:02,271 : INFO : PROGRESS: at sentence #9170000, processed 136974462 words, keeping 1717019 word types
2017-08-31 12:26:02,349 : INFO : PROGRESS: at sentence #9180000, processed 137126194 words, keeping 1718370 word types
2017-08-31 12:26:02,425 : INFO : PROGRESS: at sentence #9190000, processed 137277105 words, keeping 1719754 word types
2017-08-31 12:26:02,504 : INFO : PROGRESS: at sentence #9200000, processed 137427643 words, keeping 1721115 word types
2017-08-31 12:26:02,587 : INFO : PROGRESS: at sentence #9210000, processed 137578456 words, keeping 1722510 word types
2017-08-31 12:26:02,666 : INFO : PROGRESS: at se

2017-08-31 12:26:07,807 : INFO : PROGRESS: at sentence #9820000, processed 146616764 words, keeping 1802635 word types
2017-08-31 12:26:07,917 : INFO : PROGRESS: at sentence #9830000, processed 146764565 words, keeping 1803882 word types
2017-08-31 12:26:08,018 : INFO : PROGRESS: at sentence #9840000, processed 146908684 words, keeping 1805104 word types
2017-08-31 12:26:08,122 : INFO : PROGRESS: at sentence #9850000, processed 147052534 words, keeping 1806253 word types
2017-08-31 12:26:08,229 : INFO : PROGRESS: at sentence #9860000, processed 147198623 words, keeping 1807506 word types
2017-08-31 12:26:08,334 : INFO : PROGRESS: at sentence #9870000, processed 147342614 words, keeping 1808754 word types
2017-08-31 12:26:08,445 : INFO : PROGRESS: at sentence #9880000, processed 147485253 words, keeping 1809961 word types
2017-08-31 12:26:08,525 : INFO : PROGRESS: at sentence #9890000, processed 147626455 words, keeping 1811252 word types
2017-08-31 12:26:08,621 : INFO : PROGRESS: at se

2017-08-31 12:26:13,819 : INFO : PROGRESS: at sentence #10510000, processed 156807608 words, keeping 1886666 word types
2017-08-31 12:26:13,900 : INFO : PROGRESS: at sentence #10520000, processed 156963318 words, keeping 1887830 word types
2017-08-31 12:26:13,979 : INFO : PROGRESS: at sentence #10530000, processed 157115133 words, keeping 1889140 word types
2017-08-31 12:26:14,059 : INFO : PROGRESS: at sentence #10540000, processed 157267953 words, keeping 1890611 word types
2017-08-31 12:26:14,144 : INFO : PROGRESS: at sentence #10550000, processed 157420004 words, keeping 1891934 word types
2017-08-31 12:26:14,225 : INFO : PROGRESS: at sentence #10560000, processed 157575199 words, keeping 1893245 word types
2017-08-31 12:26:14,304 : INFO : PROGRESS: at sentence #10570000, processed 157731394 words, keeping 1894487 word types
2017-08-31 12:26:14,375 : INFO : PROGRESS: at sentence #10580000, processed 157882119 words, keeping 1895879 word types
2017-08-31 12:26:14,469 : INFO : PROGRES

2017-08-31 12:26:20,098 : INFO : PROGRESS: at sentence #11200000, processed 167643394 words, keeping 1991243 word types
2017-08-31 12:26:20,182 : INFO : PROGRESS: at sentence #11210000, processed 167801171 words, keeping 1992627 word types
2017-08-31 12:26:20,270 : INFO : PROGRESS: at sentence #11220000, processed 167961691 words, keeping 1994157 word types
2017-08-31 12:26:20,356 : INFO : PROGRESS: at sentence #11230000, processed 168117774 words, keeping 1995500 word types
2017-08-31 12:26:20,457 : INFO : PROGRESS: at sentence #11240000, processed 168276090 words, keeping 1996969 word types
2017-08-31 12:26:20,548 : INFO : PROGRESS: at sentence #11250000, processed 168433215 words, keeping 1998231 word types
2017-08-31 12:26:20,628 : INFO : PROGRESS: at sentence #11260000, processed 168587279 words, keeping 1999561 word types
2017-08-31 12:26:20,711 : INFO : PROGRESS: at sentence #11270000, processed 168740688 words, keeping 2000768 word types
2017-08-31 12:26:20,810 : INFO : PROGRES

2017-08-31 12:26:26,209 : INFO : PROGRESS: at sentence #11890000, processed 178459415 words, keeping 2081285 word types
2017-08-31 12:26:26,296 : INFO : PROGRESS: at sentence #11900000, processed 178617944 words, keeping 2082613 word types
2017-08-31 12:26:26,382 : INFO : PROGRESS: at sentence #11910000, processed 178774877 words, keeping 2083898 word types
2017-08-31 12:26:26,462 : INFO : PROGRESS: at sentence #11920000, processed 178929323 words, keeping 2085085 word types
2017-08-31 12:26:26,539 : INFO : PROGRESS: at sentence #11930000, processed 179090236 words, keeping 2086356 word types
2017-08-31 12:26:26,630 : INFO : PROGRESS: at sentence #11940000, processed 179247778 words, keeping 2087700 word types
2017-08-31 12:26:26,718 : INFO : PROGRESS: at sentence #11950000, processed 179410175 words, keeping 2088982 word types
2017-08-31 12:26:26,808 : INFO : PROGRESS: at sentence #11960000, processed 179569452 words, keeping 2090263 word types
2017-08-31 12:26:26,894 : INFO : PROGRES

2017-08-31 12:26:32,341 : INFO : PROGRESS: at sentence #12580000, processed 189331039 words, keeping 2169754 word types
2017-08-31 12:26:32,428 : INFO : PROGRESS: at sentence #12590000, processed 189486436 words, keeping 2171006 word types
2017-08-31 12:26:32,512 : INFO : PROGRESS: at sentence #12600000, processed 189638217 words, keeping 2172211 word types
2017-08-31 12:26:32,597 : INFO : PROGRESS: at sentence #12610000, processed 189790535 words, keeping 2173492 word types
2017-08-31 12:26:32,682 : INFO : PROGRESS: at sentence #12620000, processed 189942551 words, keeping 2174698 word types
2017-08-31 12:26:32,758 : INFO : PROGRESS: at sentence #12630000, processed 190099876 words, keeping 2176051 word types
2017-08-31 12:26:32,848 : INFO : PROGRESS: at sentence #12640000, processed 190255946 words, keeping 2177307 word types
2017-08-31 12:26:32,929 : INFO : PROGRESS: at sentence #12650000, processed 190409294 words, keeping 2178681 word types
2017-08-31 12:26:33,015 : INFO : PROGRES

2017-08-31 12:26:38,866 : INFO : PROGRESS: at sentence #13270000, processed 199939975 words, keeping 2256163 word types
2017-08-31 12:26:38,951 : INFO : PROGRESS: at sentence #13280000, processed 200092991 words, keeping 2257401 word types
2017-08-31 12:26:39,041 : INFO : PROGRESS: at sentence #13290000, processed 200240579 words, keeping 2258544 word types
2017-08-31 12:26:39,148 : INFO : PROGRESS: at sentence #13300000, processed 200392611 words, keeping 2259654 word types
2017-08-31 12:26:39,242 : INFO : PROGRESS: at sentence #13310000, processed 200541971 words, keeping 2260837 word types
2017-08-31 12:26:39,324 : INFO : PROGRESS: at sentence #13320000, processed 200685872 words, keeping 2261886 word types
2017-08-31 12:26:39,408 : INFO : PROGRESS: at sentence #13330000, processed 200829651 words, keeping 2263030 word types
2017-08-31 12:26:39,488 : INFO : PROGRESS: at sentence #13340000, processed 200973329 words, keeping 2264168 word types
2017-08-31 12:26:39,573 : INFO : PROGRES

2017-08-31 12:26:44,837 : INFO : PROGRESS: at sentence #13960000, processed 209889806 words, keeping 2333331 word types
2017-08-31 12:26:44,918 : INFO : PROGRESS: at sentence #13970000, processed 210034065 words, keeping 2334488 word types
2017-08-31 12:26:45,001 : INFO : PROGRESS: at sentence #13980000, processed 210180671 words, keeping 2335574 word types
2017-08-31 12:26:45,089 : INFO : PROGRESS: at sentence #13990000, processed 210328138 words, keeping 2336785 word types
2017-08-31 12:26:45,192 : INFO : PROGRESS: at sentence #14000000, processed 210476492 words, keeping 2337888 word types
2017-08-31 12:26:45,269 : INFO : PROGRESS: at sentence #14010000, processed 210622105 words, keeping 2338966 word types
2017-08-31 12:26:45,355 : INFO : PROGRESS: at sentence #14020000, processed 210766885 words, keeping 2339991 word types
2017-08-31 12:26:45,443 : INFO : PROGRESS: at sentence #14030000, processed 210911983 words, keeping 2341044 word types
2017-08-31 12:26:45,533 : INFO : PROGRES

2017-08-31 12:26:51,821 : INFO : PROGRESS: at sentence #14650000, processed 220483046 words, keeping 2424300 word types
2017-08-31 12:26:51,928 : INFO : PROGRESS: at sentence #14660000, processed 220641033 words, keeping 2425735 word types
2017-08-31 12:26:52,028 : INFO : PROGRESS: at sentence #14670000, processed 220797882 words, keeping 2427249 word types
2017-08-31 12:26:52,143 : INFO : PROGRESS: at sentence #14680000, processed 220955476 words, keeping 2428848 word types
2017-08-31 12:26:52,265 : INFO : PROGRESS: at sentence #14690000, processed 221114093 words, keeping 2430472 word types
2017-08-31 12:26:52,387 : INFO : PROGRESS: at sentence #14700000, processed 221268607 words, keeping 2431858 word types
2017-08-31 12:26:52,503 : INFO : PROGRESS: at sentence #14710000, processed 221425996 words, keeping 2433350 word types
2017-08-31 12:26:52,615 : INFO : PROGRESS: at sentence #14720000, processed 221584860 words, keeping 2434786 word types
2017-08-31 12:26:52,722 : INFO : PROGRES

2017-08-31 12:26:58,906 : INFO : PROGRESS: at sentence #15340000, processed 231355211 words, keeping 2516454 word types
2017-08-31 12:26:58,988 : INFO : PROGRESS: at sentence #15350000, processed 231511424 words, keeping 2517726 word types
2017-08-31 12:26:59,081 : INFO : PROGRESS: at sentence #15360000, processed 231671391 words, keeping 2518959 word types
2017-08-31 12:26:59,182 : INFO : PROGRESS: at sentence #15370000, processed 231831490 words, keeping 2520192 word types
2017-08-31 12:26:59,272 : INFO : PROGRESS: at sentence #15380000, processed 231990543 words, keeping 2521344 word types
2017-08-31 12:26:59,363 : INFO : PROGRESS: at sentence #15390000, processed 232145391 words, keeping 2522536 word types
2017-08-31 12:26:59,447 : INFO : PROGRESS: at sentence #15400000, processed 232307155 words, keeping 2523627 word types
2017-08-31 12:26:59,540 : INFO : PROGRESS: at sentence #15410000, processed 232464518 words, keeping 2524890 word types
2017-08-31 12:26:59,636 : INFO : PROGRES

2017-08-31 12:27:05,597 : INFO : PROGRESS: at sentence #16030000, processed 242206230 words, keeping 2599136 word types
2017-08-31 12:27:05,689 : INFO : PROGRESS: at sentence #16040000, processed 242363210 words, keeping 2600299 word types
2017-08-31 12:27:05,796 : INFO : PROGRESS: at sentence #16050000, processed 242520932 words, keeping 2601572 word types
2017-08-31 12:27:05,898 : INFO : PROGRESS: at sentence #16060000, processed 242681801 words, keeping 2602813 word types
2017-08-31 12:27:05,990 : INFO : PROGRESS: at sentence #16070000, processed 242837356 words, keeping 2604042 word types
2017-08-31 12:27:06,077 : INFO : PROGRESS: at sentence #16080000, processed 242992184 words, keeping 2605220 word types
2017-08-31 12:27:06,189 : INFO : PROGRESS: at sentence #16090000, processed 243146392 words, keeping 2606357 word types
2017-08-31 12:27:06,280 : INFO : PROGRESS: at sentence #16100000, processed 243307392 words, keeping 2607485 word types
2017-08-31 12:27:06,384 : INFO : PROGRES

2017-08-31 12:27:12,381 : INFO : PROGRESS: at sentence #16720000, processed 252897257 words, keeping 2680301 word types
2017-08-31 12:27:12,480 : INFO : PROGRESS: at sentence #16730000, processed 253051165 words, keeping 2681460 word types
2017-08-31 12:27:12,578 : INFO : PROGRESS: at sentence #16740000, processed 253206436 words, keeping 2682607 word types
2017-08-31 12:27:12,680 : INFO : PROGRESS: at sentence #16750000, processed 253360583 words, keeping 2683940 word types
2017-08-31 12:27:12,776 : INFO : PROGRESS: at sentence #16760000, processed 253518819 words, keeping 2685058 word types
2017-08-31 12:27:12,874 : INFO : PROGRESS: at sentence #16770000, processed 253672937 words, keeping 2686213 word types
2017-08-31 12:27:12,976 : INFO : PROGRESS: at sentence #16780000, processed 253828276 words, keeping 2687511 word types
2017-08-31 12:27:13,080 : INFO : PROGRESS: at sentence #16790000, processed 253982303 words, keeping 2688687 word types
2017-08-31 12:27:13,172 : INFO : PROGRES

2017-08-31 12:27:18,822 : INFO : PROGRESS: at sentence #17410000, processed 263260358 words, keeping 2758711 word types
2017-08-31 12:27:18,914 : INFO : PROGRESS: at sentence #17420000, processed 263400389 words, keeping 2759647 word types
2017-08-31 12:27:19,007 : INFO : PROGRESS: at sentence #17430000, processed 263547062 words, keeping 2760743 word types
2017-08-31 12:27:19,099 : INFO : PROGRESS: at sentence #17440000, processed 263683209 words, keeping 2761727 word types
2017-08-31 12:27:19,192 : INFO : PROGRESS: at sentence #17450000, processed 263826340 words, keeping 2762653 word types
2017-08-31 12:27:19,283 : INFO : PROGRESS: at sentence #17460000, processed 263966626 words, keeping 2763649 word types
2017-08-31 12:27:19,374 : INFO : PROGRESS: at sentence #17470000, processed 264111008 words, keeping 2764654 word types
2017-08-31 12:27:19,472 : INFO : PROGRESS: at sentence #17480000, processed 264254283 words, keeping 2765782 word types
2017-08-31 12:27:19,565 : INFO : PROGRES

2017-08-31 12:27:24,721 : INFO : PROGRESS: at sentence #18100000, processed 273583755 words, keeping 2830899 word types
2017-08-31 12:27:24,795 : INFO : PROGRESS: at sentence #18110000, processed 273739617 words, keeping 2832212 word types
2017-08-31 12:27:24,863 : INFO : PROGRESS: at sentence #18120000, processed 273890745 words, keeping 2833564 word types
2017-08-31 12:27:24,939 : INFO : PROGRESS: at sentence #18130000, processed 274043877 words, keeping 2834718 word types
2017-08-31 12:27:25,010 : INFO : PROGRESS: at sentence #18140000, processed 274194755 words, keeping 2835864 word types
2017-08-31 12:27:25,091 : INFO : PROGRESS: at sentence #18150000, processed 274346451 words, keeping 2837137 word types
2017-08-31 12:27:25,175 : INFO : PROGRESS: at sentence #18160000, processed 274504437 words, keeping 2838187 word types
2017-08-31 12:27:25,254 : INFO : PROGRESS: at sentence #18170000, processed 274659590 words, keeping 2839436 word types
2017-08-31 12:27:25,341 : INFO : PROGRES

2017-08-31 12:27:30,541 : INFO : PROGRESS: at sentence #18790000, processed 284465117 words, keeping 2924266 word types
2017-08-31 12:27:30,628 : INFO : PROGRESS: at sentence #18800000, processed 284624309 words, keeping 2925475 word types
2017-08-31 12:27:30,710 : INFO : PROGRESS: at sentence #18810000, processed 284780161 words, keeping 2926603 word types
2017-08-31 12:27:30,790 : INFO : PROGRESS: at sentence #18820000, processed 284934736 words, keeping 2927901 word types
2017-08-31 12:27:30,867 : INFO : PROGRESS: at sentence #18830000, processed 285088046 words, keeping 2929046 word types
2017-08-31 12:27:30,965 : INFO : PROGRESS: at sentence #18840000, processed 285243827 words, keeping 2930117 word types
2017-08-31 12:27:31,045 : INFO : PROGRESS: at sentence #18850000, processed 285401523 words, keeping 2931359 word types
2017-08-31 12:27:31,135 : INFO : PROGRESS: at sentence #18860000, processed 285556328 words, keeping 2932555 word types
2017-08-31 12:27:31,208 : INFO : PROGRES

2017-08-31 12:27:36,306 : INFO : PROGRESS: at sentence #19480000, processed 295207063 words, keeping 3002346 word types
2017-08-31 12:27:36,384 : INFO : PROGRESS: at sentence #19490000, processed 295358926 words, keeping 3003487 word types
2017-08-31 12:27:36,474 : INFO : PROGRESS: at sentence #19500000, processed 295513269 words, keeping 3004602 word types
2017-08-31 12:27:36,553 : INFO : PROGRESS: at sentence #19510000, processed 295669982 words, keeping 3005768 word types
2017-08-31 12:27:36,634 : INFO : PROGRESS: at sentence #19520000, processed 295825955 words, keeping 3006975 word types
2017-08-31 12:27:36,711 : INFO : PROGRESS: at sentence #19530000, processed 295978610 words, keeping 3008051 word types
2017-08-31 12:27:36,800 : INFO : PROGRESS: at sentence #19540000, processed 296136601 words, keeping 3009154 word types
2017-08-31 12:27:36,879 : INFO : PROGRESS: at sentence #19550000, processed 296290406 words, keeping 3010261 word types
2017-08-31 12:27:36,973 : INFO : PROGRES

2017-08-31 12:28:13,560 : INFO : PROGRESS: at 3.29% examples, 1273173 words/s, in_qsize 15, out_qsize 0
2017-08-31 12:28:14,572 : INFO : PROGRESS: at 3.39% examples, 1271482 words/s, in_qsize 15, out_qsize 0
2017-08-31 12:28:15,586 : INFO : PROGRESS: at 3.50% examples, 1270802 words/s, in_qsize 12, out_qsize 3
2017-08-31 12:28:16,595 : INFO : PROGRESS: at 3.61% examples, 1267270 words/s, in_qsize 13, out_qsize 2
2017-08-31 12:28:17,608 : INFO : PROGRESS: at 3.71% examples, 1265214 words/s, in_qsize 15, out_qsize 0
2017-08-31 12:28:18,627 : INFO : PROGRESS: at 3.83% examples, 1264240 words/s, in_qsize 16, out_qsize 3
2017-08-31 12:28:19,647 : INFO : PROGRESS: at 3.96% examples, 1267020 words/s, in_qsize 14, out_qsize 1
2017-08-31 12:28:20,661 : INFO : PROGRESS: at 4.08% examples, 1269789 words/s, in_qsize 12, out_qsize 3
2017-08-31 12:28:21,674 : INFO : PROGRESS: at 4.21% examples, 1270604 words/s, in_qsize 14, out_qsize 1
2017-08-31 12:28:22,675 : INFO : PROGRESS: at 4.33% examples, 12

2017-08-31 12:29:33,387 : INFO : PROGRESS: at 10.93% examples, 1154240 words/s, in_qsize 12, out_qsize 3
2017-08-31 12:29:34,395 : INFO : PROGRESS: at 11.00% examples, 1150673 words/s, in_qsize 15, out_qsize 0
2017-08-31 12:29:35,418 : INFO : PROGRESS: at 11.06% examples, 1147638 words/s, in_qsize 15, out_qsize 0
2017-08-31 12:29:36,419 : INFO : PROGRESS: at 11.11% examples, 1142051 words/s, in_qsize 13, out_qsize 2
2017-08-31 12:29:37,419 : INFO : PROGRESS: at 11.18% examples, 1139235 words/s, in_qsize 16, out_qsize 1
2017-08-31 12:29:38,453 : INFO : PROGRESS: at 11.22% examples, 1133952 words/s, in_qsize 16, out_qsize 2
2017-08-31 12:29:39,462 : INFO : PROGRESS: at 11.26% examples, 1128403 words/s, in_qsize 15, out_qsize 4
2017-08-31 12:29:40,506 : INFO : PROGRESS: at 11.34% examples, 1125871 words/s, in_qsize 16, out_qsize 3
2017-08-31 12:29:41,512 : INFO : PROGRESS: at 11.42% examples, 1125471 words/s, in_qsize 12, out_qsize 3
2017-08-31 12:29:42,579 : INFO : PROGRESS: at 11.47% ex

2017-08-31 12:30:53,344 : INFO : PROGRESS: at 17.87% examples, 1104915 words/s, in_qsize 15, out_qsize 0
2017-08-31 12:30:54,344 : INFO : PROGRESS: at 17.98% examples, 1105438 words/s, in_qsize 15, out_qsize 0
2017-08-31 12:30:55,350 : INFO : PROGRESS: at 18.08% examples, 1105887 words/s, in_qsize 15, out_qsize 0
2017-08-31 12:30:56,361 : INFO : PROGRESS: at 18.18% examples, 1105942 words/s, in_qsize 13, out_qsize 2
2017-08-31 12:30:57,374 : INFO : PROGRESS: at 18.28% examples, 1106218 words/s, in_qsize 15, out_qsize 0
2017-08-31 12:30:58,401 : INFO : PROGRESS: at 18.38% examples, 1106685 words/s, in_qsize 14, out_qsize 3
2017-08-31 12:30:59,412 : INFO : PROGRESS: at 18.47% examples, 1106634 words/s, in_qsize 13, out_qsize 2
2017-08-31 12:31:00,422 : INFO : PROGRESS: at 18.58% examples, 1108057 words/s, in_qsize 15, out_qsize 0
2017-08-31 12:31:01,435 : INFO : PROGRESS: at 18.69% examples, 1109413 words/s, in_qsize 14, out_qsize 1
2017-08-31 12:31:02,448 : INFO : PROGRESS: at 18.81% ex

2017-08-31 12:32:13,000 : INFO : PROGRESS: at 27.13% examples, 1179413 words/s, in_qsize 13, out_qsize 2
2017-08-31 12:32:14,007 : INFO : PROGRESS: at 27.26% examples, 1180301 words/s, in_qsize 14, out_qsize 1
2017-08-31 12:32:15,013 : INFO : PROGRESS: at 27.39% examples, 1181154 words/s, in_qsize 13, out_qsize 2
2017-08-31 12:32:16,020 : INFO : PROGRESS: at 27.51% examples, 1182112 words/s, in_qsize 14, out_qsize 1
2017-08-31 12:32:17,021 : INFO : PROGRESS: at 27.63% examples, 1182792 words/s, in_qsize 15, out_qsize 0
2017-08-31 12:32:18,026 : INFO : PROGRESS: at 27.73% examples, 1183280 words/s, in_qsize 14, out_qsize 1
2017-08-31 12:32:19,031 : INFO : PROGRESS: at 27.85% examples, 1183986 words/s, in_qsize 16, out_qsize 1
2017-08-31 12:32:20,051 : INFO : PROGRESS: at 27.96% examples, 1184542 words/s, in_qsize 13, out_qsize 2
2017-08-31 12:32:21,058 : INFO : PROGRESS: at 28.08% examples, 1185235 words/s, in_qsize 16, out_qsize 1
2017-08-31 12:32:22,060 : INFO : PROGRESS: at 28.20% ex

2017-08-31 12:33:32,697 : INFO : PROGRESS: at 35.76% examples, 1203049 words/s, in_qsize 14, out_qsize 1
2017-08-31 12:33:33,701 : INFO : PROGRESS: at 35.87% examples, 1203614 words/s, in_qsize 13, out_qsize 2
2017-08-31 12:33:34,718 : INFO : PROGRESS: at 35.99% examples, 1204155 words/s, in_qsize 14, out_qsize 1
2017-08-31 12:33:35,721 : INFO : PROGRESS: at 36.10% examples, 1204515 words/s, in_qsize 15, out_qsize 0
2017-08-31 12:33:36,723 : INFO : PROGRESS: at 36.20% examples, 1204526 words/s, in_qsize 13, out_qsize 2
2017-08-31 12:33:37,732 : INFO : PROGRESS: at 36.30% examples, 1204718 words/s, in_qsize 16, out_qsize 1
2017-08-31 12:33:38,748 : INFO : PROGRESS: at 36.41% examples, 1204901 words/s, in_qsize 15, out_qsize 0
2017-08-31 12:33:39,749 : INFO : PROGRESS: at 36.53% examples, 1205488 words/s, in_qsize 15, out_qsize 0
2017-08-31 12:33:40,761 : INFO : PROGRESS: at 36.65% examples, 1206031 words/s, in_qsize 13, out_qsize 2
2017-08-31 12:33:41,769 : INFO : PROGRESS: at 36.77% ex

2017-08-31 12:34:52,374 : INFO : PROGRESS: at 44.79% examples, 1227338 words/s, in_qsize 12, out_qsize 3
2017-08-31 12:34:53,377 : INFO : PROGRESS: at 44.91% examples, 1227680 words/s, in_qsize 15, out_qsize 0
2017-08-31 12:34:54,386 : INFO : PROGRESS: at 45.03% examples, 1228060 words/s, in_qsize 15, out_qsize 0
2017-08-31 12:34:55,399 : INFO : PROGRESS: at 45.14% examples, 1228301 words/s, in_qsize 12, out_qsize 3
2017-08-31 12:34:56,409 : INFO : PROGRESS: at 45.26% examples, 1228744 words/s, in_qsize 15, out_qsize 0
2017-08-31 12:34:57,417 : INFO : PROGRESS: at 45.39% examples, 1229228 words/s, in_qsize 15, out_qsize 0
2017-08-31 12:34:58,419 : INFO : PROGRESS: at 45.51% examples, 1229587 words/s, in_qsize 14, out_qsize 1
2017-08-31 12:34:59,422 : INFO : PROGRESS: at 45.62% examples, 1229853 words/s, in_qsize 14, out_qsize 1
2017-08-31 12:35:00,441 : INFO : PROGRESS: at 45.74% examples, 1230196 words/s, in_qsize 16, out_qsize 2
2017-08-31 12:35:01,441 : INFO : PROGRESS: at 45.87% ex

2017-08-31 12:36:12,008 : INFO : PROGRESS: at 54.52% examples, 1259423 words/s, in_qsize 11, out_qsize 4
2017-08-31 12:36:13,022 : INFO : PROGRESS: at 54.64% examples, 1259799 words/s, in_qsize 12, out_qsize 3
2017-08-31 12:36:14,022 : INFO : PROGRESS: at 54.76% examples, 1260239 words/s, in_qsize 13, out_qsize 2
2017-08-31 12:36:15,038 : INFO : PROGRESS: at 54.88% examples, 1260610 words/s, in_qsize 14, out_qsize 1
2017-08-31 12:36:16,048 : INFO : PROGRESS: at 55.00% examples, 1260961 words/s, in_qsize 13, out_qsize 2
2017-08-31 12:36:17,050 : INFO : PROGRESS: at 55.12% examples, 1261283 words/s, in_qsize 15, out_qsize 0
2017-08-31 12:36:18,050 : INFO : PROGRESS: at 55.24% examples, 1261701 words/s, in_qsize 15, out_qsize 0
2017-08-31 12:36:19,058 : INFO : PROGRESS: at 55.36% examples, 1262084 words/s, in_qsize 15, out_qsize 0
2017-08-31 12:36:20,060 : INFO : PROGRESS: at 55.48% examples, 1262452 words/s, in_qsize 15, out_qsize 0
2017-08-31 12:36:21,061 : INFO : PROGRESS: at 55.59% ex

2017-08-31 12:37:31,507 : INFO : PROGRESS: at 64.32% examples, 1286206 words/s, in_qsize 14, out_qsize 1
2017-08-31 12:37:32,518 : INFO : PROGRESS: at 64.45% examples, 1286580 words/s, in_qsize 15, out_qsize 0
2017-08-31 12:37:33,519 : INFO : PROGRESS: at 64.58% examples, 1286979 words/s, in_qsize 14, out_qsize 1
2017-08-31 12:37:34,527 : INFO : PROGRESS: at 64.71% examples, 1287276 words/s, in_qsize 13, out_qsize 2
2017-08-31 12:37:35,537 : INFO : PROGRESS: at 64.82% examples, 1287389 words/s, in_qsize 12, out_qsize 3
2017-08-31 12:37:36,548 : INFO : PROGRESS: at 64.94% examples, 1287534 words/s, in_qsize 14, out_qsize 1
2017-08-31 12:37:37,560 : INFO : PROGRESS: at 65.06% examples, 1287833 words/s, in_qsize 14, out_qsize 1
2017-08-31 12:37:38,563 : INFO : PROGRESS: at 65.18% examples, 1288072 words/s, in_qsize 13, out_qsize 2
2017-08-31 12:37:39,572 : INFO : PROGRESS: at 65.31% examples, 1288361 words/s, in_qsize 14, out_qsize 1
2017-08-31 12:37:40,584 : INFO : PROGRESS: at 65.44% ex

2017-08-31 12:38:51,077 : INFO : PROGRESS: at 74.06% examples, 1304007 words/s, in_qsize 14, out_qsize 1
2017-08-31 12:38:52,080 : INFO : PROGRESS: at 74.18% examples, 1304183 words/s, in_qsize 14, out_qsize 1
2017-08-31 12:38:53,083 : INFO : PROGRESS: at 74.31% examples, 1304425 words/s, in_qsize 15, out_qsize 0
2017-08-31 12:38:54,100 : INFO : PROGRESS: at 74.43% examples, 1304569 words/s, in_qsize 12, out_qsize 3
2017-08-31 12:38:55,111 : INFO : PROGRESS: at 74.55% examples, 1304794 words/s, in_qsize 15, out_qsize 0
2017-08-31 12:38:56,112 : INFO : PROGRESS: at 74.67% examples, 1305034 words/s, in_qsize 13, out_qsize 2
2017-08-31 12:38:57,125 : INFO : PROGRESS: at 74.79% examples, 1305333 words/s, in_qsize 13, out_qsize 2
2017-08-31 12:38:58,126 : INFO : PROGRESS: at 74.92% examples, 1305588 words/s, in_qsize 16, out_qsize 2
2017-08-31 12:38:59,126 : INFO : PROGRESS: at 75.04% examples, 1305837 words/s, in_qsize 15, out_qsize 0
2017-08-31 12:39:00,130 : INFO : PROGRESS: at 75.15% ex

2017-08-31 12:40:10,663 : INFO : PROGRESS: at 83.85% examples, 1320497 words/s, in_qsize 15, out_qsize 0
2017-08-31 12:40:11,668 : INFO : PROGRESS: at 83.98% examples, 1320666 words/s, in_qsize 15, out_qsize 0
2017-08-31 12:40:12,670 : INFO : PROGRESS: at 84.11% examples, 1320783 words/s, in_qsize 13, out_qsize 2
2017-08-31 12:40:13,690 : INFO : PROGRESS: at 84.24% examples, 1320849 words/s, in_qsize 16, out_qsize 1
2017-08-31 12:40:14,693 : INFO : PROGRESS: at 84.37% examples, 1321077 words/s, in_qsize 13, out_qsize 2
2017-08-31 12:40:15,694 : INFO : PROGRESS: at 84.50% examples, 1321206 words/s, in_qsize 15, out_qsize 0
2017-08-31 12:40:16,696 : INFO : PROGRESS: at 84.62% examples, 1321438 words/s, in_qsize 15, out_qsize 0
2017-08-31 12:40:17,711 : INFO : PROGRESS: at 84.75% examples, 1321685 words/s, in_qsize 13, out_qsize 2
2017-08-31 12:40:18,720 : INFO : PROGRESS: at 84.87% examples, 1321895 words/s, in_qsize 16, out_qsize 1
2017-08-31 12:40:19,722 : INFO : PROGRESS: at 85.00% ex

2017-08-31 12:41:30,174 : INFO : PROGRESS: at 93.78% examples, 1334092 words/s, in_qsize 14, out_qsize 1
2017-08-31 12:41:31,179 : INFO : PROGRESS: at 93.91% examples, 1334271 words/s, in_qsize 16, out_qsize 0
2017-08-31 12:41:32,187 : INFO : PROGRESS: at 94.04% examples, 1334452 words/s, in_qsize 15, out_qsize 0
2017-08-31 12:41:33,202 : INFO : PROGRESS: at 94.17% examples, 1334614 words/s, in_qsize 13, out_qsize 2
2017-08-31 12:41:34,211 : INFO : PROGRESS: at 94.29% examples, 1334538 words/s, in_qsize 12, out_qsize 3
2017-08-31 12:41:35,229 : INFO : PROGRESS: at 94.39% examples, 1334408 words/s, in_qsize 14, out_qsize 1
2017-08-31 12:41:36,230 : INFO : PROGRESS: at 94.49% examples, 1334240 words/s, in_qsize 13, out_qsize 2
2017-08-31 12:41:37,237 : INFO : PROGRESS: at 94.61% examples, 1334306 words/s, in_qsize 15, out_qsize 0
2017-08-31 12:41:38,254 : INFO : PROGRESS: at 94.72% examples, 1334440 words/s, in_qsize 13, out_qsize 2
2017-08-31 12:41:39,265 : INFO : PROGRESS: at 94.85% ex

In [12]:
Z = model.wv.syn0;

In [13]:
def clustering_on_wordvecs(word_vectors, num_clusters):
    # Initalize a k-means object and use it to extract centroids
    kmeans_clustering = KMeans(n_clusters = num_clusters);
    idx = kmeans_clustering.fit_predict(word_vectors);
    
    return kmeans_clustering.cluster_centers_, idx;

In [14]:
centers, clusters = clustering_on_wordvecs(Z, 10);

In [15]:
centroid_map = dict(zip(model.wv.index2word, clusters));

In [16]:
'''
Returns a DataFrame of the Closest words to each Cluster center. The closest neighbors are obtained using a \
KDTree, which is trained on the provided Word vectors. The KDTree is then queried for the top 20 words\
which are then used as indices in the Word index representation. The results are added to a dictionary,\
which is used to create and return a DataFrame.

'''
def get_top20_words(index2word, centers, wordvecs):
    tree = KDTree(wordvecs);

    #Closest points for each Cluster center is used to query the closest 20 points to it.
    closest_points = [tree.query(np.reshape(x, (1, -1)), k=20) for x in centers];
    closest_words_idxs = [x[1] for x in closest_points];

    #Word Index is queried for each position in the above array, and added to a Dictionary.
    closest_words = {};
    for i in range(0, len(closest_words_idxs)):
        closest_words['Cluster #' + str(i)] = [index2word[j] for j in closest_words_idxs[i][0]]

    #A DataFrame is generated from the dictionary.
    df = pd.DataFrame(closest_words);
    df.index = df.index+1

    return df;

In [17]:
model.wv.most_similar_cosmul(positive=['woman', 'king'], negative=['man'])

[('regent', 0.9203996062278748),
 ('queen', 0.8904438018798828),
 ('emperor', 0.8728556632995605),
 ('pope', 0.8679628968238831),
 ('Targaryen', 0.8676916360855103),
 ('duchess', 0.8671002984046936),
 ('Queen', 0.8581911325454712),
 ('monarch', 0.8581573963165283),
 ('heir', 0.853079617023468),
 ('royal', 0.8514959812164307)]

In [18]:
model.wv.doesnt_match("breakfast cereal dinner lunch".split())

'cereal'

In [19]:
model.wv.similar_by_word('queen')

[('princess', 0.8127622008323669),
 ('king', 0.7135525345802307),
 ('lion', 0.7047721147537231),
 ('prince', 0.7032004594802856),
 ('vampire', 0.6717629432678223),
 ('dwarf', 0.6700363159179688),
 ('wolf', 0.6470633149147034),
 ('conqueror', 0.6467578411102295),
 ('pearl', 0.6412856578826904),
 ('sorceress', 0.6389750838279724)]

In [20]:
model.wv.word_vec('amazon')

array([-0.07923435,  0.05483854,  0.01097845,  0.04422838, -0.07037681,
        0.10734435,  0.16134924,  0.12160344,  0.02969441,  0.03430137,
       -0.11568732, -0.12524167, -0.07121836,  0.04380697,  0.1013496 ,
        0.12472779,  0.04589591,  0.13760653,  0.04698094,  0.03496061,
        0.20014355,  0.00040228,  0.13380508,  0.19375399,  0.21526089,
        0.09709407,  0.09275255,  0.04370517,  0.22054465,  0.05111407,
        0.15190397,  0.01703347,  0.11764386,  0.03764349, -0.24294493,
        0.09849081,  0.17770456, -0.03950332, -0.04528545, -0.08315247,
       -0.12004509, -0.02640626, -0.02080552,  0.07661081, -0.0272052 ,
        0.08929471, -0.020763  , -0.05887021,  0.04686536, -0.17885886,
        0.16854522, -0.00171533,  0.02966417,  0.11292598,  0.16883048,
       -0.11760147, -0.0163983 ,  0.05589611, -0.02088387,  0.10412354,
        0.05327627,  0.01598939, -0.17672792, -0.0488485 , -0.00899526,
        0.01144994, -0.04392258, -0.14943314, -0.11008292, -0.04