In [1]:
import codecs
import gc
import json
import sys
import tarfile

import pandas as pd

index_name = "ms-marco-docs"
zipped_collection_file = "/backup/MS-MARCO/"+index_name+".tar.gz"

def collection_iterator(file_name, f_map=None):
    return collection_iterator_fn(file_name=file_name, f_map=f_map)()

def collection_iterator_fn(file_name, f_map=None):
    
    reader = codecs.getreader("ascii")
    tar = tarfile.open(file_name)

    print("[CORPORA] Openning tar file", file_name)

    members = tar.getmembers()
    
    def generator():
        for m in members:
            print("[CORPORA] Openning tar file {}".format(m.name))
            f = tar.extractfile(m)
            articles = json.load(reader(f))
            if f_map is not None:
                articles = list(map(f_map, articles))
            yield articles
            f.close()
            del f
            gc.collect()
    return generator

In [None]:
from nir.tokenizers import Regex, BioCleanTokenizer, BioCleanTokenizer2
import os

tk = Regex(cache_folder=os.path.join(cache_folder, "tokenizers"),
            prefix_name=index_name,
            min_word_frequency=5,
            n_process=12)

def load_queres(file):
    df = pd.read_csv("/backup/MS-MARCO/"+file, sep="\t")
    df.columns = ["id", "query"]

    queries = []
    for _,l in df.iterrows():
        queries.append({"query":str(l["query"]),
                        "id":l["id"]})
    
    return queries

def text_to_tokenize_generator():
    print("MSMARCO queries")
    queries = []
    queries.extend(map(lambda x:x["query"],load_queries("msmarco-doctrain-queries.tsv")))
    queries.extend(map(lambda x:x["query"],load_queries("msmarco-docdev-queries.tsv")))
    queries.extend(map(lambda x:x["query"],load_queries("msmarco-test2019-queries.tsv")))
    queries.extend(map(lambda x:x["query"],load_queries("msmarco-test2020-queries.tsv")))
    yield queries
    
    print("MSMARCO collection")
    yield collection_iterator(zipped_collection_file, lambda x:x["text"])


tk.fit_tokenizer_multiprocess(text_to_tokenize_generator())

tk.save_to_json()

In [5]:
queries

[{'query': '_________ justice is designed to repair the harm to victim, the community and the offender caused by the offender criminal act. question 19 options:',
  'id': 1185868},
 {'query': 'elegxo meaning', 'id': 1183785},
 {'query': 'what does physical medicine do', 'id': 645590},
 {'query': 'feeding rice cereal how many times per day', 'id': 186154},
 {'query': 'most dependable affordable cars', 'id': 457407},
 {'query': 'lithophile definition', 'id': 441383},
 {'query': 'what is a flail chest', 'id': 683408},
 {'query': 'put yourself on child support in texas', 'id': 484187},
 {'query': 'what happens in a wrist sprain', 'id': 666321},
 {'query': 'what are rhetorical topics', 'id': 564233},
 {'query': 'what is considered early fall', 'id': 733739},
 {'query': 'what causes elevated nitrate levels in aquariums', 'id': 1164798},
 {'query': 'lyme disease symptoms mood', 'id': 443797},
 {'query': 'what forms the epineurium', 'id': 662502},
 {'query': 'an alpha helix is an example of wh