In [55]:
import pandas as pd
import gc
import json
import tarfile
import codecs
import sys 
import time

from collections import defaultdict
from mmnrm.text import TREC_goldstandard_transform, TREC_queries_transform

def load_TREC_queries(file):
    """Read a tab-separated query file and hand it to `TREC_queries_transform`.

    The file is read as two columns, query id then query text, with NO header
    row: the previous implementation let pandas consume the first line as a
    header and then overwrote the column names, silently dropping the first
    query.

    Values are kept as raw strings:
      * `dtype=str` keeps ids exactly as written in the file.
      * `keep_default_na=False` stops queries such as "null" or "NA" from
        being turned into NaN (and then into the string "nan").
      * `quoting=csv.QUOTE_NONE` treats quote characters inside a query as
        ordinary text instead of field delimiters.

    Parameters
    ----------
    file : str or file-like
        Path (or handle) of the TSV file to read.

    Returns
    -------
    Whatever `TREC_queries_transform` returns for the list of
    `{"query": ..., "id": ...}` dicts built from the file.
    """
    import csv  # local import: only needed for the QUOTE_NONE constant

    df = pd.read_csv(
        file,
        sep="\t",
        header=None,
        names=["id", "query"],
        usecols=[0, 1],
        dtype=str,
        keep_default_na=False,
        quoting=csv.QUOTE_NONE,
    )

    topics = [
        {"query": str(query), "id": str(query_id)}
        for query_id, query in zip(df["id"], df["query"])
    ]

    return TREC_queries_transform(topics, number_parameter="id", fn=lambda x:x["query"])

def load_TREC_qrels(q_rels_file):
    """Read a whitespace-delimited qrels file and pass it to `TREC_goldstandard_transform`.

    For every line with at least four fields, the pair (field 2, field 3) is
    appended to the list stored under field 0. Presumably these are
    (document id, relevance) grouped by topic id — confirm against the file
    format used.

    Lines are split on any run of whitespace, so both space- and
    tab-delimited files are accepted. Blank lines are ignored. Lines with
    fewer than four fields are printed and skipped (best-effort parsing, as
    before), but unrelated exceptions are no longer swallowed.

    Parameters
    ----------
    q_rels_file : str
        Path of the qrels file.

    Returns
    -------
    The result of `TREC_goldstandard_transform` applied to the collected
    `defaultdict(list)`.
    """
    goldstandard = defaultdict(list)

    with open(q_rels_file) as f:
        for raw_line in f:
            fields = raw_line.split()
            if not fields:
                # Blank line (e.g. trailing newline at end of file).
                continue
            if len(fields) < 4:
                # Malformed record: report it and carry on.
                print(fields)
                continue
            goldstandard[fields[0]].append((fields[2], fields[3]))

    return TREC_goldstandard_transform(goldstandard)

def load_prerank(file_name, collection, top_k=100):
    """Load a whitespace-delimited ranking file into per-topic document lists.

    Each non-blank line must have at least five fields; field 0 is used as
    the topic id, field 2 as the document id, field 3 as the rank and
    field 4 as the score. A document is kept when its id is a key of
    `collection` and the topic has fewer than `top_k` documents so far.
    Every other line is printed, and the smallest rank among those skipped
    lines is printed at the end.

    Parameters
    ----------
    file_name : str
        Path of the ranking file.
    collection : mapping
        Maps document id to a mapping that has "text" and "title" entries.
    top_k : int, optional
        Maximum number of documents kept per topic (default 100).

    Returns
    -------
    collections.defaultdict
        topic id -> list of {"id", "score", "text", "title"} dicts in file
        order. The score is kept as the string read from the file.

    Raises
    ------
    IndexError
        If a non-blank line has fewer than five fields.
    ValueError
        If the rank of a skipped line is not an integer.
    """
    prerank = defaultdict(list)
    # Sentinel: stays 999 when no line is skipped.
    min_rank = 999
    with open(file_name) as f:
        for line in f:
            # split() drops the trailing newline and tolerates tabs or
            # repeated spaces between fields.
            elements = line.split()
            if not elements:
                continue

            topic_id = elements[0]
            doc_id = elements[2]
            rank = elements[3]
            score = elements[4]

            # Check membership first so a topic only gets an entry once it
            # has at least one usable document.
            if doc_id in collection and len(prerank[topic_id]) < top_k:
                article = collection[doc_id]
                prerank[topic_id].append({"id": doc_id,
                                          "score": score,
                                          "text": article["text"],
                                          "title": article["title"]})
            else:
                min_rank = min(min_rank, int(rank))
                print({"topic_id": topic_id, "id": doc_id, "score": score, "rank": rank})
    print(min_rank)

    # Summary statistics over the number of documents kept per topic.
    docs_per_topic = [len(docs_topic) for docs_topic in prerank.values()]
    if docs_per_topic:
        print("average docs per topic", sum(docs_per_topic)/len(docs_per_topic), "min:",min(docs_per_topic),"max:",max(docs_per_topic))
    else:
        # Avoid ZeroDivisionError / empty min() when nothing was loaded.
        print("no documents loaded from", file_name)

    return prerank
def collection_iterator(file_name, f_map=None):
    """Return a ready-to-iterate generator over the archive `file_name`.

    Convenience wrapper: builds the generator function with
    `collection_iterator_fn` and calls it once.
    """
    make_generator = collection_iterator_fn(file_name=file_name, f_map=f_map)
    return make_generator()

def collection_iterator_fn(file_name, f_map=None):
    """Build a generator function that yields the JSON content of each tar member.

    The archive is opened and its member list is read immediately; the
    returned function can then be called to obtain a generator that loads one
    member at a time with `json.load`.

    Parameters
    ----------
    file_name : str
        Path of the tar archive (compression is auto-detected by `tarfile`).
    f_map : callable, optional
        When given, it is applied to every element of each loaded member and
        the mapped elements are yielded as a list.

    Returns
    -------
    callable
        A zero-argument function returning a generator with one item per
        readable member.

    Notes
    -----
    The tar handle is intentionally left open: the returned function may be
    called several times and every call reads from the same archive.
    """
    # UTF-8 is a superset of ASCII, so archives that decoded before still do;
    # JSON containing non-ASCII characters no longer raises.
    reader = codecs.getreader("utf-8")
    tar = tarfile.open(file_name)

    print("[CORPORA] Openning tar file", file_name)

    members = tar.getmembers()

    def generator():
        for m in members:
            f = tar.extractfile(m)
            if f is None:
                # Directories and other non-file members have no content.
                continue
            print("[CORPORA] Openning tar file {}".format(m.name))
            try:
                articles = json.load(reader(f))
            finally:
                # Release the member handle even if parsing fails or the
                # consumer abandons the generator.
                f.close()
            if f_map is not None:
                articles = list(map(f_map, articles))
            yield articles
            # Drop our reference so the chunk can be reclaimed before the
            # next member is loaded.
            del articles, f
            gc.collect()
    return generator

In [10]:
# Open the document archive. The tar member list is read here; the member
# contents are only loaded when the generator is iterated.
collection_gen = collection_iterator("/backup/MS-MARCO/ms-marco-docs.tar.gz")

[CORPORA] Openning tar file /backup/MS-MARCO/ms-marco-docs.tar.gz


In [13]:
# Flatten the per-member article lists into one list in a single pass
# (sum(lists, []) re-copies the growing result on every addition).
collection = [article for articles in collection_gen for article in articles]

[CORPORA] Openning tar file tmp/tmpvmkbfob_/ms-marco-docs_0000000_to_0500000
[CORPORA] Openning tar file tmp/tmpvmkbfob_/ms-marco-docs_0500000_to_1000000
[CORPORA] Openning tar file tmp/tmpvmkbfob_/ms-marco-docs_1000000_to_1500000
[CORPORA] Openning tar file tmp/tmpvmkbfob_/ms-marco-docs_1500000_to_2000000
[CORPORA] Openning tar file tmp/tmpvmkbfob_/ms-marco-docs_2000000_to_2500000
[CORPORA] Openning tar file tmp/tmpvmkbfob_/ms-marco-docs_2500000_to_3000000
[CORPORA] Openning tar file tmp/tmpvmkbfob_/ms-marco-docs_3000000_to_3213834


In [16]:
# Index the articles by their "id" field. The guard keeps the cell safe to
# re-run: once `collection` is already a dict, iterating it would yield keys
# and the comprehension would fail.
if not isinstance(collection, dict):
    collection = {x["id"]: x for x in collection}

In [21]:
# Load the ranking file, keeping per topic at most the default top_k=100
# documents whose id is present in `collection`.
pre_rank = load_prerank("/backup/MS-MARCO/msmarco-doctrain-top100", collection)

999
average docs per topic 99.99977112464987 min: 40 max: 100


In [29]:
# Load the training queries (tab-separated: query id, query text).
queries = load_TREC_queries("/backup/MS-MARCO/msmarco-doctrain-queries.tsv")
# Load the relevance judgements, grouped per topic id by load_TREC_qrels.
goldstandard = load_TREC_qrels("/backup/MS-MARCO/msmarco-doctrain-qrels.tsv")

In [66]:
from mmnrm.dataset import TestCollectionV2

# Assemble the test collection from the queries, the gold standard and the
# pre-ranking, then configure it to produce batches of 32.
t_collection = (
    TestCollectionV2(queries, goldstandard, pre_rank, use_relevance_groups=False)
    .batch_size(32)
)


In [70]:
# Pull the first batch as a smoke test.
# NOTE(review): the recorded run of this cell raised
# "TypeError: unhashable type: 'slice'". That error is what Python reports
# when a dict is indexed with a slice, so presumably one of the inputs given
# to TestCollectionV2 is a dict where a sequence is expected — TODO confirm
# against the mmnrm API.
next(t_collection.generator())

TypeError: unhashable type: 'slice'

In [75]:
# Inspect the loaded pre-ranking.
# NOTE(review): this displays the whole mapping, including the full text of
# every stored document; consider showing a single topic instead.
pre_rank

In [49]:
# Collect the distinct query ids.
q_ids = {query["id"] for query in queries}

In [40]:
# The qrels file is space-delimited despite its .tsv extension (this is how
# load_TREC_qrels parses it) and every line is a record, so read it with a
# space separator and no header row. Column names follow the usual TREC qrels
# layout; ids are kept as strings.
df = pd.read_csv(
    "/backup/MS-MARCO/msmarco-doctrain-qrels.tsv",
    sep=" ",
    header=None,
    names=["topic_id", "iteration", "doc_id", "relevance"],
    dtype=str,
)

In [46]:
# Spot-check the transformed gold standard for a single topic id.
goldstandard["1185868"]

defaultdict(list, {1: ['D59235']})

In [50]:
# Display the query ids.
# NOTE(review): this prints the entire set; a sample or len(q_ids) would keep
# the output short.
q_ids

{'84775',
 '727871',
 '816913',
 '579676',
 '185561',
 '1146933',
 '1025869',
 '22037',
 '709661',
 '282673',
 '627874',
 '864295',
 '266832',
 '133949',
 '878191',
 '199438',
 '262972',
 '135164',
 '226470',
 '634639',
 '411294',
 '529955',
 '340486',
 '993968',
 '566409',
 '565032',
 '777640',
 '331715',
 '54687',
 '996266',
 '589908',
 '682074',
 '139018',
 '29210',
 '730927',
 '466909',
 '768626',
 '702943',
 '918515',
 '580402',
 '240883',
 '135833',
 '322454',
 '543030',
 '164794',
 '353158',
 '322639',
 '338792',
 '894596',
 '1138962',
 '328751',
 '856359',
 '716698',
 '638835',
 '782372',
 '1059658',
 '992490',
 '653105',
 '458817',
 '276885',
 '54122',
 '692407',
 '928123',
 '244954',
 '505077',
 '173223',
 '1017416',
 '849141',
 '701257',
 '563921',
 '921988',
 '531333',
 '41266',
 '123700',
 '502765',
 '70154',
 '1033431',
 '649720',
 '846814',
 '888071',
 '810347',
 '203941',
 '1173965',
 '624965',
 '112835',
 '605958',
 '80998',
 '534268',
 '119880',
 '606172',
 '758266',
