In [49]:
#%%time
import nltk
import os
import codecs
import argparse
import numpy as np
import gensim
import itertools
import regex
from six.moves import cPickle as pickle
from os.path import isfile
from collections import Counter
from tqdm import tqdm


class TextCorpus(object):
    """Re-iterable corpus yielding one token list per non-blank line of a text file.

    Each call to ``__iter__`` reopens the file, so the same instance can be
    iterated more than once (a plain generator could only be consumed once).
    """

    def __init__(self, filename):
        """Store the corpus path and pre-compute its line count.

        Args:
            filename: Path to a UTF-8 encoded text file.
        """
        self.filename = filename
        # Counted once up front and reused as the progress-bar total on every pass.
        self.line_count = self.get_line_count()

    def __iter__(self):
        """Yield the whitespace-separated tokens of every non-blank line."""
        with codecs.open(self.filename, 'r', 'utf-8') as fin:
            for line in tqdm(fin, total=self.line_count):
                words = line.split()
                if not words:
                    continue
                yield words

    def get_line_count(self):
        """Return the number of lines in ``self.filename``.

        The file is read in fixed-size blocks so it never has to fit in memory.
        A final line that is not terminated by a newline is counted too, so the
        result matches the number of lines produced when iterating the file.

        Returns:
            int: Number of lines; 0 for an empty file.
        """
        newline_count = 0
        # Start as "terminated" so that an empty file reports 0 lines.
        last_char = u"\n"
        with codecs.open(self.filename, 'r', 'utf-8') as f:
            while True:
                block = f.read(65536)
                if not block:
                    break
                newline_count += block.count(u"\n")
                last_char = block[-1]
        if last_char != u"\n":
            # The file ends with a partial line that has no trailing newline.
            return newline_count + 1
        return newline_count

# auxiliary function for the progress bar ;)
def get_line_count(filename):
    """Return the number of lines in a UTF-8 text file.

    The file is read in fixed-size blocks so it never has to fit in memory.
    A final line that is not terminated by a newline is counted too, so the
    result can be used directly as a progress-bar total when iterating the file.

    Args:
        filename: Path to a UTF-8 encoded text file.

    Returns:
        int: Number of lines; 0 for an empty file.
    """
    newline_count = 0
    # Start as "terminated" so that an empty file reports 0 lines.
    last_char = u"\n"
    with codecs.open(filename, 'r', 'utf-8') as f:
        while True:
            block = f.read(65536)
            if not block:
                break
            newline_count += block.count(u"\n")
            last_char = block[-1]
    if last_char != u"\n":
        # The file ends with a partial line that has no trailing newline.
        return newline_count + 1
    return newline_count

# Parsing 4.2GB in Wall time: 15min 9s
def get_word_count(txt_file, overwrite=False):
    """Return whitespace-token frequencies for ``txt_file``, cached on disk.

    The counts are stored in ``<basename>-wf.pickle`` where ``<basename>`` is
    ``txt_file`` without its extension and without the
    ``-pages-articles-multistream`` marker. A later call reuses that file
    instead of re-reading the corpus.

    Args:
        txt_file: Path to a UTF-8 encoded text corpus.
        overwrite: If true, recount and rewrite the cache even when it exists.

    Returns:
        collections.Counter: Mapping from token to number of occurrences.
    """
    # splitext() copes with any extension length (or none), unlike a fixed
    # four-character slice.
    basename = os.path.splitext(txt_file)[0].replace("-pages-articles-multistream", "")
    pickle_file = "{}-wf.pickle".format(basename)

    if isfile(pickle_file) and not overwrite:
        # NOTE: unpickling can execute arbitrary code; only reuse cache files
        # that were produced by this function on a trusted machine.
        with open(pickle_file, 'rb') as f:
            return pickle.load(f)

    line_count = get_line_count(txt_file)
    # Memory efficient: update the counter line by line instead of
    # materialising every token of the corpus at once.
    wordcounts = Counter()
    with codecs.open(txt_file, 'r', 'utf-8') as fin:
        for line in tqdm(fin, total=line_count):
            wordcounts.update(line.split())
    with open(pickle_file, 'wb') as f:
        pickle.dump(wordcounts, f, pickle.HIGHEST_PROTOCOL)
    return wordcounts

def text_file_generator(txt_file):
    """Lazily yield the token list of each non-blank line in ``txt_file``.

    The file is decoded as UTF-8 and every line is split on whitespace; lines
    that produce no tokens are skipped. Progress is reported through tqdm,
    using the file's line count as the total. Being a generator, the result
    can only be consumed once.
    """
    with codecs.open(txt_file, 'r', 'utf-8') as corpus:
        total_lines = get_line_count(txt_file)
        for raw_line in tqdm(corpus, total=total_lines):
            tokens = raw_line.split()
            if tokens:
                yield tokens

def make_wordvectors(txt_file, vector_size=300, window_size=5, vocab_size=50000, num_negative=5, skip_gram=1, save_tsv=False, workers=10):
    """Train a gensim Word2Vec model on ``txt_file`` and save it beside the corpus.

    The model is written to ``<dir>/w2v-<tokens>-<vocab_size>-<vector_size>-
    <window_size>-<num_negative>-<name>.bin`` where ``<dir>`` and ``<name>``
    come from ``txt_file`` with its extension and the
    ``-pages-articles-multistream`` marker removed, and ``<tokens>`` is the
    total number of tokens counted in the corpus.

    Args:
        txt_file: Path to a UTF-8 corpus with one whitespace-tokenised sentence per line.
        vector_size: Dimensionality of the word vectors (gensim ``size``).
        window_size: Context window size (gensim ``window``).
        vocab_size: Target vocabulary size. It is converted to a frequency
            cut-off: the count of the ``vocab_size``-th most frequent word is
            passed as ``min_count``.
        num_negative: Value passed as gensim ``negative``.
        skip_gram: Value passed as gensim ``sg``.
        save_tsv: If true, also write ``<same basename>.tsv`` with one
            ``index<TAB>word<TAB>vector`` row per vocabulary word.
        workers: Number of training worker threads (gensim ``workers``).

    Raises:
        ValueError: If no word qualifies for the vocabulary (empty corpus or
            ``vocab_size`` < 1).
    """
    word_counts = get_word_count(txt_file)
    top_words = word_counts.most_common(vocab_size)
    if not top_words:
        raise ValueError(
            "Cannot build a vocabulary from {!r}: the corpus has no words or "
            "vocab_size is < 1".format(txt_file))
    # Frequency of the vocab_size-th most frequent word, used as the cut-off.
    min_count = top_words[-1][1]
    word_count = sum(word_counts.values())

    sentences = TextCorpus(txt_file)

    model = gensim.models.Word2Vec(sentences, size=vector_size, min_count=min_count,
                                   negative=num_negative,
                                   window=window_size,
                                   sg=skip_gram,
                                   workers=workers)

    # Prefix only the file name, so nested or absolute directories stay intact
    # and a corpus in the current directory still gets the prefix.
    stem = os.path.splitext(txt_file)[0].replace("-pages-articles-multistream", "")
    directory, name = os.path.split(stem)
    prefix = "w2v-{}-{}-{}-{}-{}-".format(word_count, vocab_size, vector_size, window_size, num_negative)
    basename = os.path.join(directory, prefix + name)
    model_file = "{}.bin".format(basename)
    model.save(model_file)

    if save_tsv:
        # Save to tsv file
        with codecs.open("{}.tsv".format(basename), 'w', 'utf-8') as fout:
            for i, word in enumerate(model.wv.index2word):
                # Disable line wrapping so each vector stays on a single TSV row.
                # NOTE(review): numpy still abbreviates very large arrays with
                # "..."; confirm the print threshold if vector_size is raised a lot.
                vector_text = np.array_str(model.wv[word], max_line_width=np.inf)
                fout.write(u"{}\t{}\t{}\n".format(i, word, vector_text))



# Optional sanity checks: build (or load from cache) the word-frequency table
# of a corpus and look at the frequency of its millionth most common word.
# Uncomment one `fd=` line together with the print below to run it.
# fd=get_word_count("data/OpenSubtitles2016.txt")
# fd=get_word_count("data/plwikibooks-20170820-pages-articles-multistream.txt", overwrite=True)
# fd=get_word_count("data/plwiktionary-20170820-pages-articles-multistream.txt")
# fd=get_word_count("data/plwiki-20170820-pages-articles-multistream.txt")
# print(fd.most_common(1000000)[-1])

# Train and save word vectors for the plwiktionary corpus using the default
# hyper-parameters of make_wordvectors.
make_wordvectors("data/plwiktionary-20170820-pages-articles-multistream.txt")
# Alternative corpus (disabled):
#make_wordvectors("data/OpenSubtitles2016.txt")


print ("Done")


100%|██████████| 136078/136078 [00:01<00:00, 84708.70it/s]
100%|██████████| 136078/136078 [00:07<00:00, 18209.55it/s]
100%|██████████| 136078/136078 [00:07<00:00, 17157.83it/s]
100%|██████████| 136078/136078 [00:07<00:00, 17087.01it/s]
100%|██████████| 136078/136078 [00:07<00:00, 17403.33it/s]
100%|██████████| 136078/136078 [00:07<00:00, 17076.52it/s]


Done


775384107

In [25]:
%%time
print (get_line_count("data/OpenSubtitles2016.txt"))

143094531
CPU times: user 9.08 s, sys: 716 ms, total: 9.8 s
Wall time: 9.79 s
