In [2]:
## Import modules
import pandas as pd
import numpy as np
import scipy.sparse as sps
import matplotlib.pyplot as plt
import os
#import progressbar as pp
import re
from tqdm import tqdm
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import random
import pickle
import gensim

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
logger = logging.getLogger(__name__)

  return f(*args, **kwds)


In [7]:
logger.info("## load data - 1 million in total")
#job_description_df = pd.read_csv("/mnt/c/Users/ruihao/Google Drive/Data/job_description_all_1m.csv")
#job_description_df = pd.read_csv("/Users/qiuruihao/Google Drive (ruihqiu@gmail.com)/Data/job_description_all_1m.csv")

2018-10-16 10:10:21,947 : INFO : ## load data - 1 million in total


In [3]:
## pre-analysis
logger.info("# Extract english job descriptions - 400k in total")

# filter out short job description (<100 words)
job_description_en_df = pd.DataFrame(job_description
                                     _df[job_description_df.language_id==1]["job_description"].dropna())
job_description_en_df = job_description_en_df[job_description_en_df["job_description"].apply(lambda x: len(x.split())>100)]

In [8]:
def load_stopwords(stopfile):
    return set(line.strip() for line in open(stopfile))

logger.info("Load stopwords")
stopwords = load_stopwords("stopwords_en.txt")

2018-10-16 10:12:08,510 : INFO : Load stopwords


In [86]:
## An example
n = random.randint(0, len(job_description_en_df))
example = job_description_en_df.iloc[n].values[0]
print(example)

* Salary: Competitive
  * Location: Singapore 
  * Job Type: Permanent, Full time
  * Company: Citibank NA
  * Updated on: 2018-08-24

## Head of Core Warehousing and Big Data Architecture Global Consumer
Technology

Head of Core Warehousing and Big Data Architecture Global Consumer Technology

  * **Primary Location:** Singapore,Singapore,Singapore
  * **Education:** Bachelor's Degree
  * **Job Function:** Technology
  * **Schedule:** Full-time
  * **Shift:** Day Job
  * **Employee Status:** Regular
  * **Travel Time:** Yes, 10 % of the Time
  * **Job ID:** 18047959

  
  
**Description**  
  
**Position Summary:**  
  

  * The Chief Technology Officer (CTO) function is responsible for strategic and operational management of the technology environment supporting Citi's Global Consumer Technology (GCT) function. Key responsibilities of the CTO include: enterprise architecture governance; technology innovation, strategy and roadmap development; software development technology, process 

In [87]:
def paragraph_segment(text):
    text = re.sub(r"[\(\>]http.+?(\s|\n.+?)*[\)\>]", " ", text, re.MULTILINE) ## remove url
    text = re.split("\n[\s]*\n", text) ## segment with double newlines
    text = [re.sub(r"[^\w\&,.;!:()-]", " ", x) for x in text] ## remove special characters
    text = [re.sub(r"\s+", " ", x.strip()) for x in text] ## remove spaces
    text = list(filter(lambda x: len(x.split())>3, text)) ## remove short sentences
    return text

paragraph_segment(example)

['Salary: Competitive Location: Singapore Job Type: Permanent, Full time Company: Citibank NA Updated on: 2018-08-24',
 'Head of Core Warehousing and Big Data Architecture Global Consumer Technology',
 'Head of Core Warehousing and Big Data Architecture Global Consumer Technology',
 'Primary Location: Singapore,Singapore,Singapore Education: Bachelor s Degree Job Function: Technology Schedule: Full-time Shift: Day Job Employee Status: Regular Travel Time: Yes, 10 of the Time Job ID: 18047959',
 'The Chief Technology Officer (CTO) function is responsible for strategic and operational management of the technology environment supporting Citi s Global Consumer Technology (GCT) function. Key responsibilities of the CTO include: enterprise architecture governance; technology innovation, strategy and roadmap development; software development technology, process and design standards; performance engineering and testing; development operations (devops) strategy and execution; operational manage

In [91]:
test_text = "For our client, a dynamically growing organization from the financial sector, we are looking for a person for the position"
stopword_regex = re.compile(r"\b(" + "|".join(stopwords) + r")\b", re.IGNORECASE)

def char_splitter(text):
    char_ = re.compile("[.,;!:()_-]")
    sentences = re.split(char_, text)
    return [x.strip() for x in sentences if x.strip()]

def stopword_splitter(stopword_regex, text):
    sentence = re.sub(stopword_regex, ";", text)
    phrases = re.split(r";+", sentence)
    phrases_output = [x.strip() for x in phrases if x.strip()]
    return phrases_output

def splitter(paragraph_list):
    output_list = []
    long_sentences = []
    for p in paragraph_list:
        long_sentences += char_splitter(p)
    for sent in long_sentences:
        phrases = stopword_splitter(stopword_regex, sent)
        for phrase in phrases:
            if len(phrase.split()) > 3:
                output_list += phrase.split()
            else:
                output_list.append(phrase)
                
    return output_list
  
def remove_numbers(text_list):
    return [x for x in text_list if not x.isdigit() and len(x)>1 or x in ["C", "R"]]

def get_lower_case(text_list):
    case_sensitive_words = ["R", "C", "IT", "MS"]
    return [x.lower() for x in text_list if not x in case_sensitive_words]

word_list = splitter(paragraph_segment(example))
print(get_lower_case(remove_numbers(word_list)))

['salary', 'competitive location', 'singapore job type', 'permanent', 'time company', 'citibank na updated', 'head', 'core warehousing', 'big', 'data', 'architecture', 'global', 'consumer', 'technology', 'head', 'core warehousing', 'big', 'data', 'architecture', 'global', 'consumer', 'technology', 'primary location', 'singapore', 'singapore', 'singapore education', 'bachelor', 'degree', 'job', 'function', 'technology schedule', 'time shift', 'day', 'job', 'employee', 'status', 'regular travel time', 'yes', 'time job id', 'chief technology officer', 'cto', 'function', 'responsible', 'strategic', 'operational management', 'technology', 'environment', 'supporting', 'citi', 'global', 'consumer', 'technology', 'gct', 'function', 'key responsibilities', 'cto include', 'enterprise architecture governance', 'technology innovation', 'strategy', 'roadmap development', 'software development technology', 'process', 'design standards', 'performance engineering', 'testing', 'development operations',

In [92]:
logger.info("## save tokenized text")

tokenized_text = []
for n in tqdm(range(len(job_description_en_df))):
    job = job_description_en_df.iloc[n].values[0]
    tokenized_text.append((get_lower_case(remove_numbers(splitter(paragraph_segment(job))))))

2018-10-16 12:31:46,724 : INFO : ## save tokenized text
100%|██████████| 395434/395434 [43:36<00:00, 151.15it/s]


In [94]:
import pickle

with open('tokenized_text.pkl', 'wb') as f:
    pickle.dump(tokenized_text, f)

In [95]:
model = gensim.models.Word2Vec(tokenized_text, size=500, window=10, min_count=10)
model.save("word2vec_ngram.model")

2018-10-16 14:19:34,601 : INFO : collecting all words and their counts
2018-10-16 14:19:34,616 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-10-16 14:19:36,488 : INFO : PROGRESS: at sentence #10000, processed 1919148 words, keeping 172388 word types
2018-10-16 14:19:38,339 : INFO : PROGRESS: at sentence #20000, processed 4876321 words, keeping 297883 word types
2018-10-16 14:19:40,125 : INFO : PROGRESS: at sentence #30000, processed 7064595 words, keeping 475281 word types
2018-10-16 14:19:41,592 : INFO : PROGRESS: at sentence #40000, processed 9009862 words, keeping 565150 word types
2018-10-16 14:19:43,442 : INFO : PROGRESS: at sentence #50000, processed 11115154 words, keeping 619148 word types
2018-10-16 14:19:44,877 : INFO : PROGRESS: at sentence #60000, processed 13285199 words, keeping 679274 word types
2018-10-16 14:19:47,063 : INFO : PROGRESS: at sentence #70000, processed 15746285 words, keeping 737473 word types
2018-10-16 14:19:48,735 : INF

2018-10-16 14:27:04,147 : INFO : EPOCH 1 - PROGRESS: at 5.65% examples, 133882 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:27:05,234 : INFO : EPOCH 1 - PROGRESS: at 5.89% examples, 135393 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:27:06,339 : INFO : EPOCH 1 - PROGRESS: at 6.10% examples, 136111 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:27:07,410 : INFO : EPOCH 1 - PROGRESS: at 6.37% examples, 138203 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:27:08,462 : INFO : EPOCH 1 - PROGRESS: at 6.79% examples, 142642 words/s, in_qsize 5, out_qsize 0
2018-10-16 14:27:09,476 : INFO : EPOCH 1 - PROGRESS: at 7.15% examples, 147151 words/s, in_qsize 5, out_qsize 0
2018-10-16 14:27:10,490 : INFO : EPOCH 1 - PROGRESS: at 7.56% examples, 151694 words/s, in_qsize 5, out_qsize 0
2018-10-16 14:27:11,502 : INFO : EPOCH 1 - PROGRESS: at 8.00% examples, 156096 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:27:12,567 : INFO : EPOCH 1 - PROGRESS: at 8.36% examples, 158717 words/s, in_qsize 5, out_

2018-10-16 14:28:23,182 : INFO : EPOCH 1 - PROGRESS: at 33.79% examples, 232803 words/s, in_qsize 5, out_qsize 0
2018-10-16 14:28:24,199 : INFO : EPOCH 1 - PROGRESS: at 34.18% examples, 233167 words/s, in_qsize 5, out_qsize 0
2018-10-16 14:28:25,226 : INFO : EPOCH 1 - PROGRESS: at 34.61% examples, 233894 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:28:26,283 : INFO : EPOCH 1 - PROGRESS: at 34.97% examples, 234359 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:28:27,302 : INFO : EPOCH 1 - PROGRESS: at 35.31% examples, 234736 words/s, in_qsize 5, out_qsize 0
2018-10-16 14:28:28,302 : INFO : EPOCH 1 - PROGRESS: at 35.67% examples, 235405 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:28:29,408 : INFO : EPOCH 1 - PROGRESS: at 36.07% examples, 235621 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:28:30,426 : INFO : EPOCH 1 - PROGRESS: at 36.38% examples, 235382 words/s, in_qsize 3, out_qsize 0
2018-10-16 14:28:31,448 : INFO : EPOCH 1 - PROGRESS: at 36.77% examples, 235636 words/s, in_qsiz

2018-10-16 14:29:40,806 : INFO : EPOCH 1 - PROGRESS: at 61.14% examples, 248659 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:29:41,838 : INFO : EPOCH 1 - PROGRESS: at 61.57% examples, 249010 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:29:42,873 : INFO : EPOCH 1 - PROGRESS: at 62.03% examples, 249378 words/s, in_qsize 5, out_qsize 0
2018-10-16 14:29:43,878 : INFO : EPOCH 1 - PROGRESS: at 62.43% examples, 249690 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:29:44,901 : INFO : EPOCH 1 - PROGRESS: at 62.78% examples, 249997 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:29:45,924 : INFO : EPOCH 1 - PROGRESS: at 63.22% examples, 250323 words/s, in_qsize 5, out_qsize 0
2018-10-16 14:29:46,939 : INFO : EPOCH 1 - PROGRESS: at 63.63% examples, 250612 words/s, in_qsize 5, out_qsize 0
2018-10-16 14:29:47,947 : INFO : EPOCH 1 - PROGRESS: at 63.97% examples, 250672 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:29:48,949 : INFO : EPOCH 1 - PROGRESS: at 64.24% examples, 250663 words/s, in_qsiz

2018-10-16 14:30:55,852 : INFO : EPOCH 1 - PROGRESS: at 87.62% examples, 259879 words/s, in_qsize 5, out_qsize 0
2018-10-16 14:30:56,858 : INFO : EPOCH 1 - PROGRESS: at 87.99% examples, 259970 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:30:57,870 : INFO : EPOCH 1 - PROGRESS: at 88.20% examples, 259621 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:30:58,919 : INFO : EPOCH 1 - PROGRESS: at 88.64% examples, 259845 words/s, in_qsize 6, out_qsize 1
2018-10-16 14:30:59,931 : INFO : EPOCH 1 - PROGRESS: at 89.07% examples, 259996 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:31:00,948 : INFO : EPOCH 1 - PROGRESS: at 89.48% examples, 260113 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:31:01,969 : INFO : EPOCH 1 - PROGRESS: at 89.86% examples, 260260 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:31:02,974 : INFO : EPOCH 1 - PROGRESS: at 90.26% examples, 260394 words/s, in_qsize 5, out_qsize 0
2018-10-16 14:31:04,004 : INFO : EPOCH 1 - PROGRESS: at 90.67% examples, 260559 words/s, in_qsiz

2018-10-16 14:32:09,794 : INFO : EPOCH 2 - PROGRESS: at 16.24% examples, 291547 words/s, in_qsize 5, out_qsize 0
2018-10-16 14:32:10,822 : INFO : EPOCH 2 - PROGRESS: at 16.56% examples, 289949 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:32:11,850 : INFO : EPOCH 2 - PROGRESS: at 17.02% examples, 290742 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:32:15,126 : INFO : EPOCH 2 - PROGRESS: at 17.23% examples, 273455 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:32:16,419 : INFO : EPOCH 2 - PROGRESS: at 17.66% examples, 272746 words/s, in_qsize 5, out_qsize 0
2018-10-16 14:32:17,471 : INFO : EPOCH 2 - PROGRESS: at 17.67% examples, 267094 words/s, in_qsize 6, out_qsize 2
2018-10-16 14:32:18,546 : INFO : EPOCH 2 - PROGRESS: at 18.13% examples, 267995 words/s, in_qsize 5, out_qsize 2
2018-10-16 14:32:20,478 : INFO : EPOCH 2 - PROGRESS: at 18.34% examples, 260991 words/s, in_qsize 6, out_qsize 2
2018-10-16 14:32:21,505 : INFO : EPOCH 2 - PROGRESS: at 18.62% examples, 259713 words/s, in_qsiz

2018-10-16 14:33:28,353 : INFO : EPOCH 2 - PROGRESS: at 44.13% examples, 278258 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:33:29,371 : INFO : EPOCH 2 - PROGRESS: at 44.49% examples, 278690 words/s, in_qsize 5, out_qsize 0
2018-10-16 14:33:30,392 : INFO : EPOCH 2 - PROGRESS: at 44.88% examples, 279206 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:33:31,393 : INFO : EPOCH 2 - PROGRESS: at 45.22% examples, 279796 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:33:32,423 : INFO : EPOCH 2 - PROGRESS: at 45.56% examples, 279946 words/s, in_qsize 5, out_qsize 0
2018-10-16 14:33:33,425 : INFO : EPOCH 2 - PROGRESS: at 45.79% examples, 280076 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:33:34,445 : INFO : EPOCH 2 - PROGRESS: at 46.18% examples, 280252 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:33:35,455 : INFO : EPOCH 2 - PROGRESS: at 46.55% examples, 280556 words/s, in_qsize 5, out_qsize 0
2018-10-16 14:33:36,458 : INFO : EPOCH 2 - PROGRESS: at 46.99% examples, 280841 words/s, in_qsiz

2018-10-16 14:34:45,778 : INFO : EPOCH 2 - PROGRESS: at 70.74% examples, 275994 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:34:46,785 : INFO : EPOCH 2 - PROGRESS: at 71.06% examples, 275782 words/s, in_qsize 5, out_qsize 0
2018-10-16 14:34:47,787 : INFO : EPOCH 2 - PROGRESS: at 71.35% examples, 275592 words/s, in_qsize 5, out_qsize 0
2018-10-16 14:34:48,807 : INFO : EPOCH 2 - PROGRESS: at 71.63% examples, 275400 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:34:49,821 : INFO : EPOCH 2 - PROGRESS: at 72.01% examples, 275498 words/s, in_qsize 6, out_qsize 1
2018-10-16 14:34:50,846 : INFO : EPOCH 2 - PROGRESS: at 72.36% examples, 275658 words/s, in_qsize 5, out_qsize 0
2018-10-16 14:34:51,850 : INFO : EPOCH 2 - PROGRESS: at 72.73% examples, 275852 words/s, in_qsize 5, out_qsize 0
2018-10-16 14:34:52,851 : INFO : EPOCH 2 - PROGRESS: at 73.06% examples, 276027 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:34:53,881 : INFO : EPOCH 2 - PROGRESS: at 73.40% examples, 276200 words/s, in_qsiz

2018-10-16 14:36:04,067 : INFO : EPOCH 2 - PROGRESS: at 95.94% examples, 269404 words/s, in_qsize 5, out_qsize 0
2018-10-16 14:36:05,088 : INFO : EPOCH 2 - PROGRESS: at 96.30% examples, 269485 words/s, in_qsize 5, out_qsize 0
2018-10-16 14:36:06,127 : INFO : EPOCH 2 - PROGRESS: at 96.72% examples, 269580 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:36:07,133 : INFO : EPOCH 2 - PROGRESS: at 97.15% examples, 269661 words/s, in_qsize 5, out_qsize 0
2018-10-16 14:36:08,147 : INFO : EPOCH 2 - PROGRESS: at 97.53% examples, 269703 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:36:09,199 : INFO : EPOCH 2 - PROGRESS: at 97.90% examples, 269628 words/s, in_qsize 5, out_qsize 0
2018-10-16 14:36:10,263 : INFO : EPOCH 2 - PROGRESS: at 98.28% examples, 269682 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:36:11,308 : INFO : EPOCH 2 - PROGRESS: at 98.62% examples, 269593 words/s, in_qsize 4, out_qsize 2
2018-10-16 14:36:12,316 : INFO : EPOCH 2 - PROGRESS: at 98.93% examples, 269475 words/s, in_qsiz

2018-10-16 14:37:16,708 : INFO : EPOCH 3 - PROGRESS: at 24.09% examples, 290185 words/s, in_qsize 5, out_qsize 0
2018-10-16 14:37:17,757 : INFO : EPOCH 3 - PROGRESS: at 24.48% examples, 289713 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:37:18,800 : INFO : EPOCH 3 - PROGRESS: at 24.73% examples, 288511 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:37:19,809 : INFO : EPOCH 3 - PROGRESS: at 24.90% examples, 286544 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:37:20,841 : INFO : EPOCH 3 - PROGRESS: at 25.30% examples, 287019 words/s, in_qsize 5, out_qsize 0
2018-10-16 14:37:21,870 : INFO : EPOCH 3 - PROGRESS: at 25.74% examples, 287902 words/s, in_qsize 5, out_qsize 0
2018-10-16 14:37:22,878 : INFO : EPOCH 3 - PROGRESS: at 26.04% examples, 288011 words/s, in_qsize 5, out_qsize 0
2018-10-16 14:37:23,880 : INFO : EPOCH 3 - PROGRESS: at 26.26% examples, 287015 words/s, in_qsize 5, out_qsize 0
2018-10-16 14:37:25,055 : INFO : EPOCH 3 - PROGRESS: at 26.51% examples, 285076 words/s, in_qsiz

2018-10-16 14:38:34,127 : INFO : EPOCH 3 - PROGRESS: at 52.49% examples, 288890 words/s, in_qsize 5, out_qsize 0
2018-10-16 14:38:35,133 : INFO : EPOCH 3 - PROGRESS: at 52.73% examples, 288560 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:38:36,177 : INFO : EPOCH 3 - PROGRESS: at 53.11% examples, 288224 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:38:37,221 : INFO : EPOCH 3 - PROGRESS: at 53.35% examples, 287680 words/s, in_qsize 5, out_qsize 0
2018-10-16 14:38:38,231 : INFO : EPOCH 3 - PROGRESS: at 53.58% examples, 287045 words/s, in_qsize 5, out_qsize 0
2018-10-16 14:38:39,280 : INFO : EPOCH 3 - PROGRESS: at 53.95% examples, 286959 words/s, in_qsize 5, out_qsize 0
2018-10-16 14:38:40,282 : INFO : EPOCH 3 - PROGRESS: at 54.32% examples, 286966 words/s, in_qsize 5, out_qsize 0
2018-10-16 14:38:41,699 : INFO : EPOCH 3 - PROGRESS: at 54.58% examples, 285500 words/s, in_qsize 5, out_qsize 0
2018-10-16 14:38:42,711 : INFO : EPOCH 3 - PROGRESS: at 55.05% examples, 285652 words/s, in_qsiz

2018-10-16 14:39:48,836 : INFO : EPOCH 3 - PROGRESS: at 81.55% examples, 294275 words/s, in_qsize 5, out_qsize 0
2018-10-16 14:39:49,856 : INFO : EPOCH 3 - PROGRESS: at 81.96% examples, 294458 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:39:50,880 : INFO : EPOCH 3 - PROGRESS: at 82.24% examples, 294478 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:39:51,903 : INFO : EPOCH 3 - PROGRESS: at 82.70% examples, 294732 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:39:52,929 : INFO : EPOCH 3 - PROGRESS: at 83.10% examples, 294948 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:39:53,936 : INFO : EPOCH 3 - PROGRESS: at 83.34% examples, 294995 words/s, in_qsize 5, out_qsize 0
2018-10-16 14:39:55,205 : INFO : EPOCH 3 - PROGRESS: at 83.46% examples, 293839 words/s, in_qsize 5, out_qsize 0
2018-10-16 14:39:56,207 : INFO : EPOCH 3 - PROGRESS: at 83.70% examples, 293760 words/s, in_qsize 4, out_qsize 0
2018-10-16 14:39:57,215 : INFO : EPOCH 3 - PROGRESS: at 84.05% examples, 293588 words/s, in_qsiz

2018-10-16 14:41:06,211 : INFO : EPOCH 4 - PROGRESS: at 9.00% examples, 308613 words/s, in_qsize 5, out_qsize 0
2018-10-16 14:41:07,215 : INFO : EPOCH 4 - PROGRESS: at 9.47% examples, 309240 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:41:08,246 : INFO : EPOCH 4 - PROGRESS: at 9.93% examples, 309604 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:41:09,256 : INFO : EPOCH 4 - PROGRESS: at 10.29% examples, 308338 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:41:10,285 : INFO : EPOCH 4 - PROGRESS: at 10.76% examples, 309239 words/s, in_qsize 5, out_qsize 0
2018-10-16 14:41:11,299 : INFO : EPOCH 4 - PROGRESS: at 11.14% examples, 309582 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:41:12,310 : INFO : EPOCH 4 - PROGRESS: at 11.47% examples, 307330 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:41:13,311 : INFO : EPOCH 4 - PROGRESS: at 11.82% examples, 305278 words/s, in_qsize 5, out_qsize 0
2018-10-16 14:41:14,314 : INFO : EPOCH 4 - PROGRESS: at 12.00% examples, 299451 words/s, in_qsize 5

2018-10-16 14:42:20,611 : INFO : EPOCH 4 - PROGRESS: at 37.82% examples, 297501 words/s, in_qsize 5, out_qsize 1
2018-10-16 14:42:21,611 : INFO : EPOCH 4 - PROGRESS: at 38.23% examples, 297936 words/s, in_qsize 5, out_qsize 0
2018-10-16 14:42:22,626 : INFO : EPOCH 4 - PROGRESS: at 38.58% examples, 297854 words/s, in_qsize 5, out_qsize 0
2018-10-16 14:42:23,632 : INFO : EPOCH 4 - PROGRESS: at 39.01% examples, 298317 words/s, in_qsize 5, out_qsize 0
2018-10-16 14:42:24,674 : INFO : EPOCH 4 - PROGRESS: at 39.43% examples, 298745 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:42:25,704 : INFO : EPOCH 4 - PROGRESS: at 39.88% examples, 299221 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:42:26,727 : INFO : EPOCH 4 - PROGRESS: at 40.32% examples, 299734 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:42:27,742 : INFO : EPOCH 4 - PROGRESS: at 40.74% examples, 300150 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:42:28,745 : INFO : EPOCH 4 - PROGRESS: at 41.20% examples, 300473 words/s, in_qsiz

2018-10-16 14:43:39,174 : INFO : EPOCH 4 - PROGRESS: at 67.69% examples, 297285 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:43:40,338 : INFO : EPOCH 4 - PROGRESS: at 67.97% examples, 296692 words/s, in_qsize 5, out_qsize 1
2018-10-16 14:43:41,477 : INFO : EPOCH 4 - PROGRESS: at 68.07% examples, 295226 words/s, in_qsize 4, out_qsize 0
2018-10-16 14:43:42,508 : INFO : EPOCH 4 - PROGRESS: at 68.44% examples, 295263 words/s, in_qsize 5, out_qsize 0
2018-10-16 14:43:43,527 : INFO : EPOCH 4 - PROGRESS: at 68.83% examples, 295318 words/s, in_qsize 5, out_qsize 0
2018-10-16 14:43:44,575 : INFO : EPOCH 4 - PROGRESS: at 69.23% examples, 295447 words/s, in_qsize 5, out_qsize 0
2018-10-16 14:43:45,602 : INFO : EPOCH 4 - PROGRESS: at 69.60% examples, 295503 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:43:46,628 : INFO : EPOCH 4 - PROGRESS: at 69.97% examples, 295686 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:43:47,654 : INFO : EPOCH 4 - PROGRESS: at 70.31% examples, 295600 words/s, in_qsiz

2018-10-16 14:44:53,958 : INFO : EPOCH 4 - PROGRESS: at 96.48% examples, 299757 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:44:54,978 : INFO : EPOCH 4 - PROGRESS: at 96.94% examples, 299789 words/s, in_qsize 4, out_qsize 1
2018-10-16 14:44:55,991 : INFO : EPOCH 4 - PROGRESS: at 97.38% examples, 299851 words/s, in_qsize 5, out_qsize 0
2018-10-16 14:44:57,006 : INFO : EPOCH 4 - PROGRESS: at 97.85% examples, 299957 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:44:58,055 : INFO : EPOCH 4 - PROGRESS: at 98.21% examples, 299872 words/s, in_qsize 5, out_qsize 1
2018-10-16 14:44:59,061 : INFO : EPOCH 4 - PROGRESS: at 98.64% examples, 299975 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:45:00,069 : INFO : EPOCH 4 - PROGRESS: at 99.02% examples, 299928 words/s, in_qsize 5, out_qsize 0
2018-10-16 14:45:01,099 : INFO : EPOCH 4 - PROGRESS: at 99.26% examples, 299336 words/s, in_qsize 5, out_qsize 0
2018-10-16 14:45:02,131 : INFO : EPOCH 4 - PROGRESS: at 99.73% examples, 299450 words/s, in_qsiz

2018-10-16 14:46:11,330 : INFO : EPOCH 5 - PROGRESS: at 23.73% examples, 257561 words/s, in_qsize 5, out_qsize 0
2018-10-16 14:46:12,350 : INFO : EPOCH 5 - PROGRESS: at 24.19% examples, 258800 words/s, in_qsize 5, out_qsize 0
2018-10-16 14:46:13,353 : INFO : EPOCH 5 - PROGRESS: at 24.63% examples, 259889 words/s, in_qsize 5, out_qsize 0
2018-10-16 14:46:14,355 : INFO : EPOCH 5 - PROGRESS: at 25.02% examples, 261316 words/s, in_qsize 5, out_qsize 0
2018-10-16 14:46:15,383 : INFO : EPOCH 5 - PROGRESS: at 25.45% examples, 262381 words/s, in_qsize 5, out_qsize 0
2018-10-16 14:46:16,390 : INFO : EPOCH 5 - PROGRESS: at 25.83% examples, 262991 words/s, in_qsize 5, out_qsize 0
2018-10-16 14:46:17,390 : INFO : EPOCH 5 - PROGRESS: at 26.15% examples, 263826 words/s, in_qsize 5, out_qsize 0
2018-10-16 14:46:18,399 : INFO : EPOCH 5 - PROGRESS: at 26.48% examples, 264300 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:46:19,415 : INFO : EPOCH 5 - PROGRESS: at 26.88% examples, 264995 words/s, in_qsiz

2018-10-16 14:47:25,642 : INFO : EPOCH 5 - PROGRESS: at 53.04% examples, 284623 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:47:26,657 : INFO : EPOCH 5 - PROGRESS: at 53.39% examples, 284824 words/s, in_qsize 5, out_qsize 0
2018-10-16 14:47:27,687 : INFO : EPOCH 5 - PROGRESS: at 53.80% examples, 285086 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:47:28,696 : INFO : EPOCH 5 - PROGRESS: at 54.22% examples, 285400 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:47:29,715 : INFO : EPOCH 5 - PROGRESS: at 54.59% examples, 285314 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:47:30,717 : INFO : EPOCH 5 - PROGRESS: at 55.04% examples, 285370 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:47:31,728 : INFO : EPOCH 5 - PROGRESS: at 55.29% examples, 284847 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:47:32,737 : INFO : EPOCH 5 - PROGRESS: at 55.71% examples, 285328 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:47:33,743 : INFO : EPOCH 5 - PROGRESS: at 56.14% examples, 285730 words/s, in_qsiz

2018-10-16 14:48:43,850 : INFO : EPOCH 5 - PROGRESS: at 81.06% examples, 283036 words/s, in_qsize 5, out_qsize 0
2018-10-16 14:48:44,866 : INFO : EPOCH 5 - PROGRESS: at 81.49% examples, 283299 words/s, in_qsize 5, out_qsize 0
2018-10-16 14:48:45,868 : INFO : EPOCH 5 - PROGRESS: at 81.91% examples, 283589 words/s, in_qsize 5, out_qsize 0
2018-10-16 14:48:46,904 : INFO : EPOCH 5 - PROGRESS: at 82.27% examples, 283923 words/s, in_qsize 5, out_qsize 0
2018-10-16 14:48:47,919 : INFO : EPOCH 5 - PROGRESS: at 82.72% examples, 284184 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:48:48,923 : INFO : EPOCH 5 - PROGRESS: at 83.13% examples, 284511 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:48:49,926 : INFO : EPOCH 5 - PROGRESS: at 83.45% examples, 284968 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:48:50,954 : INFO : EPOCH 5 - PROGRESS: at 83.81% examples, 285297 words/s, in_qsize 5, out_qsize 0
2018-10-16 14:48:51,960 : INFO : EPOCH 5 - PROGRESS: at 84.30% examples, 285574 words/s, in_qsiz

In [96]:
model = gensim.models.Word2Vec.load("word2vec_ngram.model")
model.train(tokenized_text, total_examples=len(tokenized_text), epochs=10)

2018-10-16 14:49:49,729 : INFO : loading Word2Vec object from word2vec_ngram.model
2018-10-16 14:53:31,621 : INFO : loading wv recursively from word2vec_ngram.model.wv.* with mmap=None
2018-10-16 14:53:31,656 : INFO : loading vectors from word2vec_ngram.model.wv.vectors.npy with mmap=None
2018-10-16 14:53:33,850 : INFO : setting ignored attribute vectors_norm to None
2018-10-16 14:53:33,855 : INFO : loading vocabulary recursively from word2vec_ngram.model.vocabulary.* with mmap=None
2018-10-16 14:53:33,858 : INFO : loading trainables recursively from word2vec_ngram.model.trainables.* with mmap=None
2018-10-16 14:53:33,861 : INFO : loading syn1neg from word2vec_ngram.model.trainables.syn1neg.npy with mmap=None
2018-10-16 14:53:35,716 : INFO : setting ignored attribute cum_table to None
2018-10-16 14:53:35,724 : INFO : loaded word2vec_ngram.model
2018-10-16 14:53:37,779 : INFO : training model with 3 workers on 354145 vocabulary and 500 features, using sg=0 hs=0 sample=0.001 negative=5 w

2018-10-16 14:54:52,124 : INFO : EPOCH 1 - PROGRESS: at 25.74% examples, 259903 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:54:53,128 : INFO : EPOCH 1 - PROGRESS: at 26.10% examples, 261126 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:54:54,128 : INFO : EPOCH 1 - PROGRESS: at 26.47% examples, 262136 words/s, in_qsize 5, out_qsize 0
2018-10-16 14:54:55,165 : INFO : EPOCH 1 - PROGRESS: at 26.88% examples, 262907 words/s, in_qsize 5, out_qsize 0
2018-10-16 14:54:56,167 : INFO : EPOCH 1 - PROGRESS: at 27.35% examples, 263795 words/s, in_qsize 5, out_qsize 0
2018-10-16 14:54:57,186 : INFO : EPOCH 1 - PROGRESS: at 27.82% examples, 264897 words/s, in_qsize 6, out_qsize 1
2018-10-16 14:54:58,190 : INFO : EPOCH 1 - PROGRESS: at 28.26% examples, 265711 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:54:59,228 : INFO : EPOCH 1 - PROGRESS: at 28.69% examples, 266016 words/s, in_qsize 4, out_qsize 1
2018-10-16 14:55:00,267 : INFO : EPOCH 1 - PROGRESS: at 29.11% examples, 266374 words/s, in_qsiz

2018-10-16 14:56:10,292 : INFO : EPOCH 1 - PROGRESS: at 55.93% examples, 282060 words/s, in_qsize 5, out_qsize 0
2018-10-16 14:56:11,316 : INFO : EPOCH 1 - PROGRESS: at 56.41% examples, 282381 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:56:12,328 : INFO : EPOCH 1 - PROGRESS: at 56.97% examples, 282900 words/s, in_qsize 5, out_qsize 0
2018-10-16 14:56:13,332 : INFO : EPOCH 1 - PROGRESS: at 57.55% examples, 283326 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:56:14,335 : INFO : EPOCH 1 - PROGRESS: at 58.10% examples, 283816 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:56:15,358 : INFO : EPOCH 1 - PROGRESS: at 58.61% examples, 284228 words/s, in_qsize 5, out_qsize 0
2018-10-16 14:56:16,377 : INFO : EPOCH 1 - PROGRESS: at 59.12% examples, 284621 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:56:17,401 : INFO : EPOCH 1 - PROGRESS: at 59.58% examples, 284894 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:56:18,456 : INFO : EPOCH 1 - PROGRESS: at 60.01% examples, 284950 words/s, in_qsiz

2018-10-16 14:57:33,217 : INFO : EPOCH 1 - PROGRESS: at 84.14% examples, 277664 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:57:34,265 : INFO : EPOCH 1 - PROGRESS: at 84.58% examples, 277913 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:57:35,286 : INFO : EPOCH 1 - PROGRESS: at 84.99% examples, 278099 words/s, in_qsize 5, out_qsize 0
2018-10-16 14:57:36,289 : INFO : EPOCH 1 - PROGRESS: at 85.46% examples, 278320 words/s, in_qsize 5, out_qsize 0
2018-10-16 14:57:37,321 : INFO : EPOCH 1 - PROGRESS: at 85.81% examples, 278165 words/s, in_qsize 5, out_qsize 0
2018-10-16 14:57:38,354 : INFO : EPOCH 1 - PROGRESS: at 86.00% examples, 277676 words/s, in_qsize 5, out_qsize 1
2018-10-16 14:57:42,691 : INFO : EPOCH 1 - PROGRESS: at 86.04% examples, 272906 words/s, in_qsize 5, out_qsize 0
2018-10-16 14:57:43,710 : INFO : EPOCH 1 - PROGRESS: at 86.33% examples, 272661 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:57:44,716 : INFO : EPOCH 1 - PROGRESS: at 86.81% examples, 273074 words/s, in_qsiz

2018-10-16 14:58:54,920 : INFO : EPOCH 2 - PROGRESS: at 11.95% examples, 220932 words/s, in_qsize 6, out_qsize 2
2018-10-16 14:58:55,924 : INFO : EPOCH 2 - PROGRESS: at 12.43% examples, 224786 words/s, in_qsize 5, out_qsize 0
2018-10-16 14:58:56,933 : INFO : EPOCH 2 - PROGRESS: at 12.89% examples, 227874 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:58:57,968 : INFO : EPOCH 2 - PROGRESS: at 13.38% examples, 231665 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:58:58,989 : INFO : EPOCH 2 - PROGRESS: at 13.83% examples, 235197 words/s, in_qsize 5, out_qsize 0
2018-10-16 14:59:00,040 : INFO : EPOCH 2 - PROGRESS: at 14.27% examples, 238232 words/s, in_qsize 5, out_qsize 0
2018-10-16 14:59:01,081 : INFO : EPOCH 2 - PROGRESS: at 14.77% examples, 240699 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:59:02,095 : INFO : EPOCH 2 - PROGRESS: at 15.28% examples, 242936 words/s, in_qsize 6, out_qsize 0
2018-10-16 14:59:03,116 : INFO : EPOCH 2 - PROGRESS: at 15.80% examples, 244573 words/s, in_qsiz

2018-10-16 15:00:09,491 : INFO : EPOCH 2 - PROGRESS: at 43.15% examples, 285547 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:00:10,495 : INFO : EPOCH 2 - PROGRESS: at 43.59% examples, 285710 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:00:11,602 : INFO : EPOCH 2 - PROGRESS: at 43.98% examples, 285280 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:00:12,613 : INFO : EPOCH 2 - PROGRESS: at 44.41% examples, 285861 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:00:13,638 : INFO : EPOCH 2 - PROGRESS: at 44.81% examples, 286543 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:00:14,654 : INFO : EPOCH 2 - PROGRESS: at 45.19% examples, 287352 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:00:15,663 : INFO : EPOCH 2 - PROGRESS: at 45.56% examples, 287720 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:00:16,672 : INFO : EPOCH 2 - PROGRESS: at 45.83% examples, 288075 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:00:17,739 : INFO : EPOCH 2 - PROGRESS: at 46.23% examples, 288171 words/s, in_qsiz

2018-10-16 15:01:30,888 : INFO : EPOCH 2 - PROGRESS: at 70.68% examples, 277814 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:01:31,908 : INFO : EPOCH 2 - PROGRESS: at 71.12% examples, 278112 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:01:32,911 : INFO : EPOCH 2 - PROGRESS: at 71.47% examples, 278172 words/s, in_qsize 5, out_qsize 1
2018-10-16 15:01:33,920 : INFO : EPOCH 2 - PROGRESS: at 71.75% examples, 277892 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:01:34,930 : INFO : EPOCH 2 - PROGRESS: at 72.12% examples, 277974 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:01:35,938 : INFO : EPOCH 2 - PROGRESS: at 72.49% examples, 278159 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:01:36,954 : INFO : EPOCH 2 - PROGRESS: at 72.80% examples, 278284 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:01:37,961 : INFO : EPOCH 2 - PROGRESS: at 73.08% examples, 278133 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:01:38,973 : INFO : EPOCH 2 - PROGRESS: at 73.37% examples, 278057 words/s, in_qsiz

2018-10-16 15:02:52,906 : INFO : EPOCH 2 - PROGRESS: at 99.34% examples, 276298 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:02:53,915 : INFO : EPOCH 2 - PROGRESS: at 99.75% examples, 276389 words/s, in_qsize 5, out_qsize 1
2018-10-16 15:02:54,426 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-10-16 15:02:54,459 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-10-16 15:02:54,482 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-10-16 15:02:54,515 : INFO : EPOCH - 2 : training on 89073054 raw words (77273268 effective words) took 279.6s, 276366 effective words/s
2018-10-16 15:02:55,923 : INFO : EPOCH 3 - PROGRESS: at 0.40% examples, 336061 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:02:56,939 : INFO : EPOCH 3 - PROGRESS: at 0.77% examples, 288281 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:02:57,945 : INFO : EPOCH 3 - PROGRESS: at 1.29% examples, 310192 words/s, in_qsize 5, out_qsize 1
2018-10-16 15:02:58,947 

2018-10-16 15:04:05,454 : INFO : EPOCH 3 - PROGRESS: at 27.92% examples, 299021 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:04:06,481 : INFO : EPOCH 3 - PROGRESS: at 28.42% examples, 299879 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:04:07,499 : INFO : EPOCH 3 - PROGRESS: at 28.92% examples, 300558 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:04:08,525 : INFO : EPOCH 3 - PROGRESS: at 29.40% examples, 301227 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:04:09,526 : INFO : EPOCH 3 - PROGRESS: at 29.79% examples, 301941 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:04:10,536 : INFO : EPOCH 3 - PROGRESS: at 30.16% examples, 302368 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:04:11,554 : INFO : EPOCH 3 - PROGRESS: at 30.61% examples, 302523 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:04:12,579 : INFO : EPOCH 3 - PROGRESS: at 31.06% examples, 302890 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:04:13,593 : INFO : EPOCH 3 - PROGRESS: at 31.55% examples, 303273 words/s, in_qsiz

2018-10-16 15:05:25,419 : INFO : EPOCH 3 - PROGRESS: at 59.06% examples, 299553 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:05:26,425 : INFO : EPOCH 3 - PROGRESS: at 59.55% examples, 299891 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:05:27,458 : INFO : EPOCH 3 - PROGRESS: at 59.99% examples, 299951 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:05:28,480 : INFO : EPOCH 3 - PROGRESS: at 60.54% examples, 300446 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:05:29,498 : INFO : EPOCH 3 - PROGRESS: at 61.07% examples, 300844 words/s, in_qsize 4, out_qsize 1
2018-10-16 15:05:30,523 : INFO : EPOCH 3 - PROGRESS: at 61.49% examples, 300876 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:05:31,540 : INFO : EPOCH 3 - PROGRESS: at 61.78% examples, 300224 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:05:32,548 : INFO : EPOCH 3 - PROGRESS: at 62.26% examples, 300554 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:05:33,589 : INFO : EPOCH 3 - PROGRESS: at 62.67% examples, 300846 words/s, in_qsiz

2018-10-16 15:06:39,987 : INFO : EPOCH 3 - PROGRESS: at 88.77% examples, 306782 words/s, in_qsize 5, out_qsize 1
2018-10-16 15:06:40,998 : INFO : EPOCH 3 - PROGRESS: at 89.04% examples, 306241 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:06:42,027 : INFO : EPOCH 3 - PROGRESS: at 89.51% examples, 306345 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:06:43,030 : INFO : EPOCH 3 - PROGRESS: at 89.91% examples, 306375 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:06:44,037 : INFO : EPOCH 3 - PROGRESS: at 90.33% examples, 306407 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:06:45,053 : INFO : EPOCH 3 - PROGRESS: at 90.78% examples, 306531 words/s, in_qsize 5, out_qsize 1
2018-10-16 15:06:46,066 : INFO : EPOCH 3 - PROGRESS: at 91.26% examples, 306602 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:06:47,111 : INFO : EPOCH 3 - PROGRESS: at 91.60% examples, 306249 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:06:48,119 : INFO : EPOCH 3 - PROGRESS: at 91.91% examples, 305942 words/s, in_qsiz

2018-10-16 15:07:53,641 : INFO : EPOCH 4 - PROGRESS: at 16.25% examples, 275689 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:07:54,643 : INFO : EPOCH 4 - PROGRESS: at 16.74% examples, 277310 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:07:55,676 : INFO : EPOCH 4 - PROGRESS: at 17.20% examples, 278103 words/s, in_qsize 5, out_qsize 1
2018-10-16 15:07:56,697 : INFO : EPOCH 4 - PROGRESS: at 17.66% examples, 279431 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:07:57,706 : INFO : EPOCH 4 - PROGRESS: at 18.15% examples, 281063 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:07:58,716 : INFO : EPOCH 4 - PROGRESS: at 18.65% examples, 282468 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:07:59,742 : INFO : EPOCH 4 - PROGRESS: at 19.07% examples, 282513 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:08:00,762 : INFO : EPOCH 4 - PROGRESS: at 19.55% examples, 283475 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:08:01,802 : INFO : EPOCH 4 - PROGRESS: at 20.01% examples, 284489 words/s, in_qsiz

2018-10-16 15:09:08,356 : INFO : EPOCH 4 - PROGRESS: at 46.31% examples, 298976 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:09:09,365 : INFO : EPOCH 4 - PROGRESS: at 46.66% examples, 298812 words/s, in_qsize 4, out_qsize 0
2018-10-16 15:09:10,365 : INFO : EPOCH 4 - PROGRESS: at 47.01% examples, 298500 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:09:11,406 : INFO : EPOCH 4 - PROGRESS: at 47.44% examples, 298465 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:09:12,419 : INFO : EPOCH 4 - PROGRESS: at 47.86% examples, 298420 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:09:13,432 : INFO : EPOCH 4 - PROGRESS: at 48.27% examples, 298398 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:09:14,433 : INFO : EPOCH 4 - PROGRESS: at 48.65% examples, 298245 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:09:15,446 : INFO : EPOCH 4 - PROGRESS: at 49.09% examples, 298295 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:09:16,457 : INFO : EPOCH 4 - PROGRESS: at 49.48% examples, 298340 words/s, in_qsiz

2018-10-16 15:10:23,510 : INFO : EPOCH 4 - PROGRESS: at 76.95% examples, 307326 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:10:24,577 : INFO : EPOCH 4 - PROGRESS: at 77.36% examples, 307373 words/s, in_qsize 4, out_qsize 1
2018-10-16 15:10:25,595 : INFO : EPOCH 4 - PROGRESS: at 77.75% examples, 307613 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:10:26,607 : INFO : EPOCH 4 - PROGRESS: at 78.14% examples, 307475 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:10:27,621 : INFO : EPOCH 4 - PROGRESS: at 78.64% examples, 307599 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:10:28,654 : INFO : EPOCH 4 - PROGRESS: at 79.20% examples, 307722 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:10:29,710 : INFO : EPOCH 4 - PROGRESS: at 79.69% examples, 307748 words/s, in_qsize 6, out_qsize 1
2018-10-16 15:10:30,721 : INFO : EPOCH 4 - PROGRESS: at 80.15% examples, 307897 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:10:31,730 : INFO : EPOCH 4 - PROGRESS: at 80.61% examples, 308126 words/s, in_qsiz

2018-10-16 15:11:35,043 : INFO : EPOCH 5 - PROGRESS: at 6.04% examples, 304447 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:11:36,050 : INFO : EPOCH 5 - PROGRESS: at 6.47% examples, 306897 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:11:37,056 : INFO : EPOCH 5 - PROGRESS: at 6.92% examples, 308126 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:11:38,083 : INFO : EPOCH 5 - PROGRESS: at 7.30% examples, 308670 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:11:39,110 : INFO : EPOCH 5 - PROGRESS: at 7.78% examples, 309779 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:11:40,149 : INFO : EPOCH 5 - PROGRESS: at 8.17% examples, 307353 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:11:41,164 : INFO : EPOCH 5 - PROGRESS: at 8.61% examples, 309060 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:11:42,169 : INFO : EPOCH 5 - PROGRESS: at 9.05% examples, 308813 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:11:43,177 : INFO : EPOCH 5 - PROGRESS: at 9.58% examples, 311035 words/s, in_qsize 6, out_

2018-10-16 15:12:49,565 : INFO : EPOCH 5 - PROGRESS: at 37.84% examples, 320632 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:12:50,581 : INFO : EPOCH 5 - PROGRESS: at 38.27% examples, 320999 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:12:51,595 : INFO : EPOCH 5 - PROGRESS: at 38.67% examples, 321181 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:12:52,646 : INFO : EPOCH 5 - PROGRESS: at 39.11% examples, 321248 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:12:53,665 : INFO : EPOCH 5 - PROGRESS: at 39.49% examples, 321228 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:12:54,671 : INFO : EPOCH 5 - PROGRESS: at 39.94% examples, 321575 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:12:55,681 : INFO : EPOCH 5 - PROGRESS: at 40.39% examples, 322041 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:12:56,683 : INFO : EPOCH 5 - PROGRESS: at 40.82% examples, 322406 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:12:57,693 : INFO : EPOCH 5 - PROGRESS: at 41.33% examples, 322593 words/s, in_qsiz

2018-10-16 15:14:06,682 : INFO : EPOCH 5 - PROGRESS: at 68.95% examples, 318584 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:14:07,713 : INFO : EPOCH 5 - PROGRESS: at 69.36% examples, 318606 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:14:08,724 : INFO : EPOCH 5 - PROGRESS: at 69.69% examples, 318428 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:14:09,726 : INFO : EPOCH 5 - PROGRESS: at 69.93% examples, 317805 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:14:10,782 : INFO : EPOCH 5 - PROGRESS: at 70.37% examples, 317947 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:14:11,794 : INFO : EPOCH 5 - PROGRESS: at 70.64% examples, 317674 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:14:12,802 : INFO : EPOCH 5 - PROGRESS: at 71.07% examples, 317750 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:14:13,808 : INFO : EPOCH 5 - PROGRESS: at 71.47% examples, 317840 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:14:14,818 : INFO : EPOCH 5 - PROGRESS: at 71.75% examples, 317289 words/s, in_qsiz

2018-10-16 15:15:26,080 : INFO : EPOCH 5 - PROGRESS: at 97.44% examples, 307058 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:15:27,103 : INFO : EPOCH 5 - PROGRESS: at 97.88% examples, 307061 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:15:28,119 : INFO : EPOCH 5 - PROGRESS: at 98.28% examples, 307091 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:15:29,120 : INFO : EPOCH 5 - PROGRESS: at 98.71% examples, 307140 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:15:30,124 : INFO : EPOCH 5 - PROGRESS: at 99.16% examples, 307272 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:15:31,130 : INFO : EPOCH 5 - PROGRESS: at 99.65% examples, 307392 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:15:31,809 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-10-16 15:15:31,833 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-10-16 15:15:31,841 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-10-16 15:15:31,843 : INFO : EPOCH - 5 : trai

2018-10-16 15:16:38,319 : INFO : EPOCH 6 - PROGRESS: at 26.64% examples, 303106 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:16:39,335 : INFO : EPOCH 6 - PROGRESS: at 26.98% examples, 302491 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:16:40,340 : INFO : EPOCH 6 - PROGRESS: at 27.28% examples, 301077 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:16:41,343 : INFO : EPOCH 6 - PROGRESS: at 27.67% examples, 301255 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:16:42,345 : INFO : EPOCH 6 - PROGRESS: at 28.17% examples, 302216 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:16:43,379 : INFO : EPOCH 6 - PROGRESS: at 28.66% examples, 302712 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:16:44,391 : INFO : EPOCH 6 - PROGRESS: at 29.16% examples, 303489 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:16:45,393 : INFO : EPOCH 6 - PROGRESS: at 29.58% examples, 303876 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:16:46,410 : INFO : EPOCH 6 - PROGRESS: at 29.96% examples, 304585 words/s, in_qsiz

2018-10-16 15:17:56,250 : INFO : EPOCH 6 - PROGRESS: at 58.10% examples, 307733 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:17:57,290 : INFO : EPOCH 6 - PROGRESS: at 58.64% examples, 308103 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:17:58,411 : INFO : EPOCH 6 - PROGRESS: at 59.14% examples, 308139 words/s, in_qsize 4, out_qsize 2
2018-10-16 15:17:59,423 : INFO : EPOCH 6 - PROGRESS: at 59.63% examples, 308422 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:18:00,440 : INFO : EPOCH 6 - PROGRESS: at 60.13% examples, 308777 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:18:01,450 : INFO : EPOCH 6 - PROGRESS: at 60.67% examples, 309147 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:18:02,458 : INFO : EPOCH 6 - PROGRESS: at 61.20% examples, 309594 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:18:03,460 : INFO : EPOCH 6 - PROGRESS: at 61.61% examples, 309559 words/s, in_qsize 5, out_qsize 1
2018-10-16 15:18:04,632 : INFO : EPOCH 6 - PROGRESS: at 62.01% examples, 309032 words/s, in_qsiz

2018-10-16 15:19:10,479 : INFO : EPOCH 6 - PROGRESS: at 87.32% examples, 310331 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:19:11,500 : INFO : EPOCH 6 - PROGRESS: at 87.61% examples, 310394 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:19:12,532 : INFO : EPOCH 6 - PROGRESS: at 88.02% examples, 310397 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:19:13,582 : INFO : EPOCH 6 - PROGRESS: at 88.44% examples, 310359 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:19:14,599 : INFO : EPOCH 6 - PROGRESS: at 88.82% examples, 310253 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:19:15,623 : INFO : EPOCH 6 - PROGRESS: at 89.29% examples, 310279 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:19:16,635 : INFO : EPOCH 6 - PROGRESS: at 89.67% examples, 310210 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:19:17,655 : INFO : EPOCH 6 - PROGRESS: at 90.04% examples, 310080 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:19:18,657 : INFO : EPOCH 6 - PROGRESS: at 90.44% examples, 310032 words/s, in_qsiz

2018-10-16 15:20:26,270 : INFO : EPOCH 7 - PROGRESS: at 17.62% examples, 288734 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:20:27,286 : INFO : EPOCH 7 - PROGRESS: at 18.05% examples, 289135 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:20:28,302 : INFO : EPOCH 7 - PROGRESS: at 18.55% examples, 290593 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:20:29,500 : INFO : EPOCH 7 - PROGRESS: at 18.97% examples, 289262 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:20:30,536 : INFO : EPOCH 7 - PROGRESS: at 19.42% examples, 289803 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:20:31,551 : INFO : EPOCH 7 - PROGRESS: at 19.92% examples, 291220 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:20:32,588 : INFO : EPOCH 7 - PROGRESS: at 20.38% examples, 292583 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:20:33,604 : INFO : EPOCH 7 - PROGRESS: at 20.86% examples, 293560 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:20:34,624 : INFO : EPOCH 7 - PROGRESS: at 21.31% examples, 294653 words/s, in_qsiz

2018-10-16 15:21:40,758 : INFO : EPOCH 7 - PROGRESS: at 48.88% examples, 311708 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:21:41,782 : INFO : EPOCH 7 - PROGRESS: at 49.38% examples, 312086 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:21:42,808 : INFO : EPOCH 7 - PROGRESS: at 49.85% examples, 312459 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:21:43,833 : INFO : EPOCH 7 - PROGRESS: at 50.26% examples, 312778 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:21:44,862 : INFO : EPOCH 7 - PROGRESS: at 50.67% examples, 312946 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:21:45,876 : INFO : EPOCH 7 - PROGRESS: at 51.11% examples, 313290 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:21:46,900 : INFO : EPOCH 7 - PROGRESS: at 51.52% examples, 313288 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:21:47,927 : INFO : EPOCH 7 - PROGRESS: at 52.03% examples, 313574 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:21:48,957 : INFO : EPOCH 7 - PROGRESS: at 52.44% examples, 314047 words/s, in_qsiz

2018-10-16 15:22:55,360 : INFO : EPOCH 7 - PROGRESS: at 80.18% examples, 318345 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:22:56,380 : INFO : EPOCH 7 - PROGRESS: at 80.65% examples, 318555 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:22:57,409 : INFO : EPOCH 7 - PROGRESS: at 81.08% examples, 318681 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:22:58,412 : INFO : EPOCH 7 - PROGRESS: at 81.53% examples, 318859 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:22:59,442 : INFO : EPOCH 7 - PROGRESS: at 81.97% examples, 319048 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:23:00,444 : INFO : EPOCH 7 - PROGRESS: at 82.33% examples, 319353 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:23:01,457 : INFO : EPOCH 7 - PROGRESS: at 82.77% examples, 319427 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:23:02,472 : INFO : EPOCH 7 - PROGRESS: at 83.14% examples, 319415 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:23:03,485 : INFO : EPOCH 7 - PROGRESS: at 83.38% examples, 319377 words/s, in_qsiz

2018-10-16 15:24:14,358 : INFO : EPOCH 8 - PROGRESS: at 10.29% examples, 235756 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:24:15,362 : INFO : EPOCH 8 - PROGRESS: at 10.76% examples, 238892 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:24:16,395 : INFO : EPOCH 8 - PROGRESS: at 11.17% examples, 241921 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:24:17,406 : INFO : EPOCH 8 - PROGRESS: at 11.61% examples, 244242 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:24:18,439 : INFO : EPOCH 8 - PROGRESS: at 12.09% examples, 246982 words/s, in_qsize 6, out_qsize 1
2018-10-16 15:24:19,474 : INFO : EPOCH 8 - PROGRESS: at 12.53% examples, 249800 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:24:20,491 : INFO : EPOCH 8 - PROGRESS: at 13.01% examples, 252534 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:24:21,492 : INFO : EPOCH 8 - PROGRESS: at 13.43% examples, 255080 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:24:22,623 : INFO : EPOCH 8 - PROGRESS: at 13.72% examples, 254518 words/s, in_qsiz

2018-10-16 15:25:29,075 : INFO : EPOCH 8 - PROGRESS: at 40.49% examples, 287563 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:25:30,091 : INFO : EPOCH 8 - PROGRESS: at 40.91% examples, 288090 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:25:31,105 : INFO : EPOCH 8 - PROGRESS: at 41.49% examples, 288565 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:25:32,120 : INFO : EPOCH 8 - PROGRESS: at 41.91% examples, 288926 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:25:33,133 : INFO : EPOCH 8 - PROGRESS: at 42.27% examples, 289075 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:25:34,135 : INFO : EPOCH 8 - PROGRESS: at 42.63% examples, 289051 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:25:35,139 : INFO : EPOCH 8 - PROGRESS: at 43.13% examples, 289632 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:25:36,144 : INFO : EPOCH 8 - PROGRESS: at 43.64% examples, 290094 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:25:37,157 : INFO : EPOCH 8 - PROGRESS: at 44.14% examples, 290641 words/s, in_qsiz

2018-10-16 15:26:43,597 : INFO : EPOCH 8 - PROGRESS: at 71.22% examples, 302475 words/s, in_qsize 4, out_qsize 0
2018-10-16 15:26:46,635 : INFO : EPOCH 8 - PROGRESS: at 71.23% examples, 297540 words/s, in_qsize 2, out_qsize 1
2018-10-16 15:26:47,651 : INFO : EPOCH 8 - PROGRESS: at 71.27% examples, 296101 words/s, in_qsize 6, out_qsize 1
2018-10-16 15:26:48,671 : INFO : EPOCH 8 - PROGRESS: at 71.70% examples, 296394 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:26:49,698 : INFO : EPOCH 8 - PROGRESS: at 72.17% examples, 296731 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:26:50,722 : INFO : EPOCH 8 - PROGRESS: at 72.59% examples, 297080 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:26:51,723 : INFO : EPOCH 8 - PROGRESS: at 72.95% examples, 297326 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:26:52,736 : INFO : EPOCH 8 - PROGRESS: at 73.34% examples, 297613 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:26:53,745 : INFO : EPOCH 8 - PROGRESS: at 73.69% examples, 297621 words/s, in_qsiz

2018-10-16 15:27:59,095 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-10-16 15:27:59,103 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-10-16 15:27:59,105 : INFO : EPOCH - 8 : training on 89073054 raw words (77270973 effective words) took 256.9s, 300790 effective words/s
2018-10-16 15:28:00,190 : INFO : EPOCH 9 - PROGRESS: at 0.34% examples, 302380 words/s, in_qsize 4, out_qsize 1
2018-10-16 15:28:01,195 : INFO : EPOCH 9 - PROGRESS: at 0.87% examples, 328944 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:28:02,214 : INFO : EPOCH 9 - PROGRESS: at 1.38% examples, 329928 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:28:03,235 : INFO : EPOCH 9 - PROGRESS: at 1.82% examples, 324871 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:28:04,237 : INFO : EPOCH 9 - PROGRESS: at 2.24% examples, 321596 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:28:05,270 : INFO : EPOCH 9 - PROGRESS: at 2.56% examples, 305457 words/s, in_qsize 6, out_qsize 0
2018-

2018-10-16 15:29:11,504 : INFO : EPOCH 9 - PROGRESS: at 30.81% examples, 322728 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:29:12,520 : INFO : EPOCH 9 - PROGRESS: at 31.15% examples, 321751 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:29:13,533 : INFO : EPOCH 9 - PROGRESS: at 31.60% examples, 321436 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:29:14,542 : INFO : EPOCH 9 - PROGRESS: at 32.03% examples, 321602 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:29:15,552 : INFO : EPOCH 9 - PROGRESS: at 32.48% examples, 321641 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:29:16,567 : INFO : EPOCH 9 - PROGRESS: at 32.98% examples, 321846 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:29:17,582 : INFO : EPOCH 9 - PROGRESS: at 33.38% examples, 321595 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:29:18,606 : INFO : EPOCH 9 - PROGRESS: at 33.66% examples, 320049 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:29:19,631 : INFO : EPOCH 9 - PROGRESS: at 34.01% examples, 318997 words/s, in_qsiz

2018-10-16 15:30:32,263 : INFO : EPOCH 9 - PROGRESS: at 61.42% examples, 305586 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:30:33,275 : INFO : EPOCH 9 - PROGRESS: at 61.93% examples, 305884 words/s, in_qsize 4, out_qsize 2
2018-10-16 15:30:34,281 : INFO : EPOCH 9 - PROGRESS: at 62.42% examples, 306353 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:30:35,286 : INFO : EPOCH 9 - PROGRESS: at 62.81% examples, 306563 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:30:36,300 : INFO : EPOCH 9 - PROGRESS: at 63.29% examples, 306838 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:30:37,342 : INFO : EPOCH 9 - PROGRESS: at 63.77% examples, 307164 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:30:38,353 : INFO : EPOCH 9 - PROGRESS: at 64.19% examples, 307490 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:30:39,365 : INFO : EPOCH 9 - PROGRESS: at 64.66% examples, 307779 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:30:40,389 : INFO : EPOCH 9 - PROGRESS: at 65.14% examples, 308112 words/s, in_qsiz

2018-10-16 15:31:46,674 : INFO : EPOCH 9 - PROGRESS: at 91.81% examples, 313265 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:31:47,699 : INFO : EPOCH 9 - PROGRESS: at 92.26% examples, 313357 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:31:48,709 : INFO : EPOCH 9 - PROGRESS: at 92.72% examples, 313485 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:31:49,716 : INFO : EPOCH 9 - PROGRESS: at 93.21% examples, 313601 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:31:50,748 : INFO : EPOCH 9 - PROGRESS: at 93.74% examples, 313807 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:31:51,848 : INFO : EPOCH 9 - PROGRESS: at 94.13% examples, 313579 words/s, in_qsize 4, out_qsize 1
2018-10-16 15:31:52,876 : INFO : EPOCH 9 - PROGRESS: at 94.48% examples, 313282 words/s, in_qsize 6, out_qsize 1
2018-10-16 15:31:53,890 : INFO : EPOCH 9 - PROGRESS: at 94.99% examples, 313466 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:31:54,899 : INFO : EPOCH 9 - PROGRESS: at 95.47% examples, 313666 words/s, in_qsiz

2018-10-16 15:32:57,268 : INFO : EPOCH 10 - PROGRESS: at 22.36% examples, 319671 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:32:58,284 : INFO : EPOCH 10 - PROGRESS: at 22.78% examples, 319919 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:32:59,296 : INFO : EPOCH 10 - PROGRESS: at 23.23% examples, 320385 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:33:00,320 : INFO : EPOCH 10 - PROGRESS: at 23.64% examples, 320540 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:33:01,323 : INFO : EPOCH 10 - PROGRESS: at 24.09% examples, 321069 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:33:02,366 : INFO : EPOCH 10 - PROGRESS: at 24.56% examples, 321118 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:33:03,374 : INFO : EPOCH 10 - PROGRESS: at 24.93% examples, 321624 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:33:04,394 : INFO : EPOCH 10 - PROGRESS: at 25.34% examples, 321756 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:33:05,395 : INFO : EPOCH 10 - PROGRESS: at 25.73% examples, 321548 words/s

2018-10-16 15:34:15,587 : INFO : EPOCH 10 - PROGRESS: at 51.65% examples, 303749 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:34:16,592 : INFO : EPOCH 10 - PROGRESS: at 52.03% examples, 303462 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:34:17,604 : INFO : EPOCH 10 - PROGRESS: at 52.43% examples, 303977 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:34:18,649 : INFO : EPOCH 10 - PROGRESS: at 52.76% examples, 304107 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:34:19,656 : INFO : EPOCH 10 - PROGRESS: at 53.24% examples, 304337 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:34:20,667 : INFO : EPOCH 10 - PROGRESS: at 53.66% examples, 304691 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:34:21,687 : INFO : EPOCH 10 - PROGRESS: at 54.12% examples, 305110 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:34:22,717 : INFO : EPOCH 10 - PROGRESS: at 54.59% examples, 305422 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:34:23,749 : INFO : EPOCH 10 - PROGRESS: at 55.08% examples, 305520 words/s

2018-10-16 15:35:29,274 : INFO : EPOCH 10 - PROGRESS: at 82.39% examples, 312702 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:35:30,293 : INFO : EPOCH 10 - PROGRESS: at 82.81% examples, 312797 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:35:31,299 : INFO : EPOCH 10 - PROGRESS: at 83.17% examples, 312835 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:35:32,302 : INFO : EPOCH 10 - PROGRESS: at 83.49% examples, 313153 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:35:33,342 : INFO : EPOCH 10 - PROGRESS: at 83.77% examples, 313000 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:35:34,367 : INFO : EPOCH 10 - PROGRESS: at 84.21% examples, 312937 words/s, in_qsize 6, out_qsize 0
2018-10-16 15:35:35,382 : INFO : EPOCH 10 - PROGRESS: at 84.66% examples, 313139 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:35:36,420 : INFO : EPOCH 10 - PROGRESS: at 85.11% examples, 313253 words/s, in_qsize 5, out_qsize 0
2018-10-16 15:35:37,439 : INFO : EPOCH 10 - PROGRESS: at 85.57% examples, 313350 words/s

(772711823, 890730540)

In [124]:
## testing

word_vectors = model.wv

test_list_1gram = ["python", "java", "engineer", "excel", "git", "automobile", "finance", "english", "german", "vehicle"]
test_list_2gram = ["machine learning", "project management", "risk management", "data science", "statistical modelling", "big data", "automobile engineering", "human resource"]
test_list_3gram = ["natural language processing", "quality assurance engineering", "ms office excel"]

test_list = test_list_1gram + test_list_2gram + test_list_3gram

for word in test_list:
    print(word, ":\n", word_vectors.most_similar(word, topn=10))

python :
 [('java', 0.5661136507987976), ('programming', 0.5485414266586304), ('sql', 0.5414139032363892), ('scripting languages', 0.5005762577056885), ('scripting', 0.4916435480117798), ('c c', 0.48072144389152527), ('linux', 0.4796450138092041), ('perl', 0.4746547341346741), ('modern programming language', 0.4569898247718811), ('scala', 0.44549480080604553)]
java :
 [('java j2ee', 0.567360520362854), ('python', 0.5661136507987976), ('sql', 0.5426121950149536), ('programming', 0.5244420766830444), ('linux', 0.5222653150558472), ('html5', 0.5066940188407898), ('spring', 0.49310192465782166), ('angular', 0.49159738421440125), ('web', 0.48923367261886597), ('php', 0.4854477047920227)]
engineer :
 [('developer', 0.5396875739097595), ('engineering', 0.4639178514480591), ('architect', 0.4519370198249817), ('engineers', 0.43471527099609375), ('software engineer', 0.3961600065231323), ('senior engineer', 0.37202188372612), ('specialist', 0.3717525601387024), ('r&d software engineering', 0.365

In [133]:
logger.info("## save word vectors")

from gensim.models import KeyedVectors
from gensim.test.utils import get_tmpfile

fname = get_tmpfile("vectors.kv")
word_vectors.save(fname)
# word_vectors = KeyedVectors.load(fname, mmap='r')

2018-10-16 17:00:25,366 : INFO : ## save word vectors
2018-10-16 17:00:25,371 : INFO : saving Word2VecKeyedVectors object under /tmp/vectors.kv, separately None
2018-10-16 17:00:25,376 : INFO : storing np array 'vectors' to /tmp/vectors.kv.vectors.npy
2018-10-16 17:00:30,336 : INFO : not storing attribute vectors_norm
2018-10-16 17:00:32,662 : INFO : saved /tmp/vectors.kv


In [6]:
from gensim.models import KeyedVectors
from gensim.test.utils import get_tmpfile
fname = get_tmpfile("vectors.kv")
old_word_vectors = KeyedVectors.load(fname, mmap='r')

#for word in test_list:
print("man", ":\n", old_word_vectors.most_similar("man", topn=10))

2018-10-18 15:59:25,834 : INFO : loading Word2VecKeyedVectors object from /tmp/vectors.kv
2018-10-18 15:59:26,854 : INFO : loading vectors from /tmp/vectors.kv.vectors.npy with mmap=r
2018-10-18 15:59:26,867 : INFO : setting ignored attribute vectors_norm to None
2018-10-18 15:59:26,868 : INFO : loaded /tmp/vectors.kv
2018-10-18 15:59:27,011 : INFO : precomputing L2-norms of word weight vectors


man :
 [('flysquare solution basically', 0.31119444966316223), ('jobs matching energy', 0.30456823110580444), ('managing leaves', 0.2859411835670471), ('gas quantitative strategist', 0.28062254190444946), ('fmcg recruitment', 0.2798054814338684), ('shopping malls', 0.2780197858810425), ('leading names', 0.2614626884460449), ('fabrication technology', 0.2609795928001404), ('jobs matching power', 0.25975075364112854), ('statutory requirements managing', 0.25757238268852234)]


In [11]:
def generate_candidate_phrases(text, stopwords):
    coarse_candidates = char_splitter.split(text.lower())
    candidate_phrases = []
    
    for coarse_phrase in coarse_candidates[:1]:
        words = re.split("\s+", coarse_phrase)
        previous_stop = False
        for w in words:
            if w in stopwords and not previous_stop:
                candidate_phrases.append(";")
                previous_stop = True
            elif w not in stopwords:
                candidate_phrases.append(w)
                previous_stop = False
        candidate_phrases.append(";")

    phrases = re.split(";+", ' '.join(candidate_phrases))
    phrases_output = [x.strip() for x in phrases if x.strip()]
    return phrases_output

In [12]:
for text in paragraph_segment(example):
    print(generate_candidate_phrases(text, stopwords))

['client']
['seo technical team']
['miejsce pracy']
['working', 'client']
['working', 'tech seo managers', 'specialists', 'krakow']
['experiance', 'seo', 'technical audits', 'app store optimisation', 'essential']
['opportunity', 'grow', 'international organization']
['osoby zainteresowane prosimy o przesyłanie aplikacji klikając w przycisk aplikowania']
[]
['zastrzegamy sobie możliwość odpowiedzi tylko na wybrane aplikacje']
['agencja wpisana', 'krajowego rejestru agencji zatrudnienia pod numerem 10504']
['interested', 'add', 'cv', 'agreement']
['consent cpl jobs sp']
