In [None]:
from nir.tokenizers import Regex
import os
import pandas as pd

# Root folder holding the MS MARCO artefacts used by this notebook.
cache_folder = "/backup/MS-MARCO/"
# Shared base name of the collection archive and of the saved tokenizer.
index_name = "ms-marco-docs"
# Full path of the tar.gz archive that contains the document collection.
zipped_collection_file = "".join(["/backup/MS-MARCO/", index_name, ".tar.gz"])

# Tokenizer class; its name is also embedded in the saved embeddings file name.
_class = Regex
# Restore the previously saved tokenizer from <cache_folder>/tokenizers.
tk = _class.load_from_json(
    cache_folder=os.path.join(cache_folder, "tokenizers"),
    prefix_name=index_name,
)

In [None]:
# Minimum word frequency handed to the tokenizer; the value is also embedded
# in the file name of the saved embeddings.
min_freq=5

# NOTE(review): presumably prunes vocabulary entries seen fewer than
# `min_freq` times -- confirm against nir.tokenizers.
tk.update_min_word_frequency(min_freq)

# Last expression of the cell, so the notebook displays whatever
# `vocabulary_size()` returns after the update above.
tk.vocabulary_size()

In [3]:
from gensim.models.callbacks import CallbackAny2Vec

from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec

# Embedding dimensionality: passed to Word2Vec as `size` and embedded in the
# saved file name.
emb_size = 200
# Number of training epochs: passed to Word2Vec as `iter` and embedded in the
# saved file name.
_iter=15
class EpochLogger(CallbackAny2Vec):
    """Training callback that logs epoch boundaries and checkpoints the vectors.

    After every epoch the current word vectors (``model.wv``) are written to
    disk. The save that follows the fifth epoch goes to a file labelled
    ``iter_5``; every other save overwrites the file labelled with the
    configured epoch count (``_iter``).

    Relies on the notebook-level names ``_iter``, ``min_freq``, ``emb_size``
    and ``_class``.
    """

    def __init__(self):
        # Number of epochs completed so far.
        self.epoch = 0

    def on_epoch_begin(self, model):
        """Announce the start of the current epoch."""
        print("Epoch #{} start".format(self.epoch))

    def on_epoch_end(self, model):
        """Announce the end of the epoch, then save the current word vectors."""
        print("Epoch #{} end".format(self.epoch))
        self.epoch += 1

        # Only the snapshot taken right after the fifth epoch gets its own file.
        iter_label = 5 if self.epoch == 5 else _iter
        target = "".join([
            "/backup/MS-MARCO/word2vec/msmarco2020_gensim_iter_",
            str(iter_label),
            "_freq",
            str(min_freq),
            "_",
            str(emb_size),
            "_",
            _class.__name__,
            "_word2vec.bin",
        ])
        model.wv.save(fname_or_handle=target)

In [6]:
# create a collection
import gc
import json
import tarfile
import codecs
import sys 
import time

def collection_iterator(file_name, f_map=None):
    """Return a ready-to-consume generator over the batches stored in `file_name`.

    Convenience wrapper: builds the generator function with
    `collection_iterator_fn` and calls it once.
    """
    make_generator = collection_iterator_fn(file_name=file_name, f_map=f_map)
    return make_generator()

def collection_iterator_fn(file_name, f_map=None):
    """Build a re-callable generator function over the JSON members of a tar archive.

    The archive is opened and its member list is read immediately, so a missing
    or corrupt archive fails here rather than on first iteration. Each call of
    the returned function walks the members again and yields one batch per
    member: the JSON content of that member, decoded as ASCII.

    Parameters
    ----------
    file_name : str
        Path of the tar archive (any compression `tarfile.open` can detect).
    f_map : callable, optional
        If given, it is applied to every element of a member's decoded JSON
        and the batch is yielded as a list of the mapped values.

    Returns
    -------
    callable
        Zero-argument function returning a generator of batches.

    Notes
    -----
    The archive handle is shared by every generator created from the returned
    function and is therefore left open for as long as that function is alive.
    """
    reader = codecs.getreader("ascii")
    tar = tarfile.open(file_name)

    print("[CORPORA] Openning tar file", file_name)

    members = tar.getmembers()

    def generator():
        for m in members:
            f = tar.extractfile(m)
            if f is None:
                # extractfile() returns None for members that carry no data
                # (directories, devices, ...): there is nothing to parse.
                continue
            print("[CORPORA] Openning tar file {}".format(m.name))
            try:
                articles = json.load(reader(f))
            finally:
                # The batch is fully in memory once parsed, so release the
                # handle before yielding; this also covers parse errors and
                # consumers that stop iterating early.
                f.close()
            del f
            if f_map is not None:
                articles = list(map(f_map, articles))
            yield articles
            # Drop our reference so the finished batch can be reclaimed before
            # the next member is loaded.
            del articles
            gc.collect()
    return generator

def load_queries(file, base_dir="/backup/MS-MARCO/"):
    """Load a two-column, tab-separated query file into a list of dicts.

    Parameters
    ----------
    file : str
        File name, resolved relative to `base_dir`.
    base_dir : str, optional
        Directory that contains the query files. Defaults to the location
        used throughout this notebook.

    Returns
    -------
    list of dict
        One ``{"query": <str>, "id": <id>}`` entry per data row, in file order.
        The query text is always coerced to ``str``.

    Notes
    -----
    NOTE(review): `read_csv` is called with its default header handling, so
    the first line of the file is consumed as a header and never returned as
    a query. If the query files have no header row, pass
    ``header=None, names=["id", "query"]`` instead -- confirm against the data.
    """
    df = pd.read_csv(os.path.join(base_dir, file), sep="\t")
    df.columns = ["id", "query"]

    # Walk the two columns directly; much cheaper than building a Series per
    # row with iterrows().
    return [{"query": str(text), "id": query_id}
            for query_id, text in zip(df["id"], df["query"])]


def sentences_generator():
    """Yield every training sentence as a list of word strings.

    Queries from the four MS MARCO query files come first, followed by the
    documents of the zipped collection, batch by batch. Each text is run
    through the tokenizer `tk` and the resulting indices are mapped back to
    words via `tk.index_word`.
    """
    doc_batches = collection_iterator_fn(zipped_collection_file, lambda x:x["text"])

    print("MSMARCO queries")
    queries = []
    for query_file in (
        "msmarco-doctrain-queries.tsv",
        "msmarco-docdev-queries.tsv",
        "msmarco-test2019-queries.tsv",
        "msmarco-test2020-queries.tsv",
    ):
        queries.extend(entry["query"] for entry in load_queries(query_file))

    queries = tk.texts_to_sequences(queries)
    for token_ids in queries:
        yield [tk.index_word[token_id] for token_id in token_ids]

    print("MSMARCO Docs")
    for batch in doc_batches():
        started = time.time()
        # Rebind so the raw texts can be released once they are tokenized.
        batch = tk.texts_to_sequences(batch)
        print("tokenizer time", time.time() - started)
        for token_ids in batch:
            yield [tk.index_word[token_id] for token_id in token_ids]
    

In [7]:
# Materialise every tokenized sentence in memory so that training can make
# several passes over the same data.
corpus = list(sentences_generator())

[CORPORA] Openning tar file /backup/MS-MARCO/ms-marco-docs.tar.gz
MSMARCO queries
MSMARCO Docs
[CORPORA] Openning tar file tmp/tmpvmkbfob_/ms-marco-docs_0000000_to_0500000
tokenizer time 807.1106107234955
[CORPORA] Openning tar file tmp/tmpvmkbfob_/ms-marco-docs_0500000_to_1000000
tokenizer time 829.1263709068298
[CORPORA] Openning tar file tmp/tmpvmkbfob_/ms-marco-docs_1000000_to_1500000
tokenizer time 806.5863394737244
[CORPORA] Openning tar file tmp/tmpvmkbfob_/ms-marco-docs_1500000_to_2000000
tokenizer time 812.6393640041351
[CORPORA] Openning tar file tmp/tmpvmkbfob_/ms-marco-docs_2000000_to_2500000
tokenizer time 820.1576969623566
[CORPORA] Openning tar file tmp/tmpvmkbfob_/ms-marco-docs_2500000_to_3000000
tokenizer time 814.6535995006561
[CORPORA] Openning tar file tmp/tmpvmkbfob_/ms-marco-docs_3000000_to_3213834
tokenizer time 358.30491185188293


In [None]:
# Train the embeddings on the in-memory corpus; EpochLogger prints progress
# and saves the word vectors after every epoch.
model = Word2Vec(
    corpus,
    size=emb_size,
    iter=_iter,
    window=5,
    min_count=0,
    workers=40,
    sg=1,
    negative=5,
    callbacks=[EpochLogger()],
)

Epoch #0 start
