In [1]:
import pandas as pd
import gc
import json
import tarfile
import codecs
import sys 
import time

from collections import defaultdict
from mmnrm.text import TREC_goldstandard_transform, TREC_queries_transform
from mmnrm.evaluation import TREC_Evaluator
from mmnrm.dataset import TrainCollectionV2, TestCollectionV2

def load_TREC_queries(file):
    df = pd.read_csv(file, sep="\t")
    df.columns = ["id", "query"]
    topics = []
    for _,l in df.iterrows():
        topics.append({"query":str(l["query"]), "id":str(l["id"])})
        
    return TREC_queries_transform(topics, number_parameter="id", fn=lambda x:x["query"])

def load_TREC_qrels(q_rels_file):
    
    with open(q_rels_file) as f:
        goldstandard = defaultdict(list)

        for line in f:
            line = line.strip().split(" ")
            try:
                goldstandard[line[0]].append((line[2], line[3]))
            except :
                print(line)
            
    return TREC_goldstandard_transform(goldstandard)

def load_prerank(file_name, collection, top_k=100):
    prerank = defaultdict(list)
    min_rank = 999
    with open(file_name) as f:
        for line in f:
            elements = line.split(" ")
            if elements[2] in collection and len(prerank[elements[0]])<top_k:
                article = collection[elements[2]]
                prerank[elements[0]].append({"id":elements[2], 
                                              "score":elements[4],
                                              "text":article["text"],
                                              "title":article["title"]})
            
    print(min_rank)    
    
    # create test collection base on the docs
    docs_per_topic = [len(docs_topic) for docs_topic in prerank.values()]
    print("average docs per topic", sum(docs_per_topic)/len(docs_per_topic), "min:",min(docs_per_topic),"max:",max(docs_per_topic))
    
    return prerank
def collection_iterator(file_name, f_map=None):
    return collection_iterator_fn(file_name=file_name, f_map=f_map)()

def collection_iterator_fn(file_name, f_map=None):
    
    reader = codecs.getreader("ascii")
    tar = tarfile.open(file_name)

    print("[CORPORA] Openning tar file", file_name)

    members = tar.getmembers()
    
    def generator():
        for m in members:
            print("[CORPORA] Openning tar file {}".format(m.name))
            f = tar.extractfile(m)
            articles = json.load(reader(f))
            if f_map is not None:
                articles = list(map(f_map, articles))
            yield articles
            f.close()
            del f
            gc.collect()
    return generator

In [2]:
collection_gen = collection_iterator("/backup/MS-MARCO/ms-marco-docs.tar.gz")

[CORPORA] Openning tar file /backup/MS-MARCO/ms-marco-docs.tar.gz


In [3]:
collection = sum([ articles for articles in collection_gen],[])

[CORPORA] Openning tar file tmp/tmpvmkbfob_/ms-marco-docs_0000000_to_0500000
[CORPORA] Openning tar file tmp/tmpvmkbfob_/ms-marco-docs_0500000_to_1000000
[CORPORA] Openning tar file tmp/tmpvmkbfob_/ms-marco-docs_1000000_to_1500000
[CORPORA] Openning tar file tmp/tmpvmkbfob_/ms-marco-docs_1500000_to_2000000
[CORPORA] Openning tar file tmp/tmpvmkbfob_/ms-marco-docs_2000000_to_2500000
[CORPORA] Openning tar file tmp/tmpvmkbfob_/ms-marco-docs_2500000_to_3000000
[CORPORA] Openning tar file tmp/tmpvmkbfob_/ms-marco-docs_3000000_to_3213834


In [4]:
collection = {x["id"]:x for x in collection}

In [5]:
pre_rank = load_prerank("/backup/MS-MARCO/msmarco-doctrain-top100", collection, top_k=100)

999
average docs per topic 99.99977112464987 min: 40 max: 100


In [6]:
# load queries
queries = load_TREC_queries("/backup/MS-MARCO/msmarco-doctrain-queries.tsv")
# read relevance
goldstandard = load_TREC_qrels("/backup/MS-MARCO/msmarco-doctrain-qrels.tsv")

In [7]:



t_collection = TrainCollectionV2(queries, 
                                   goldstandard, 
                                   pre_rank, 
                                   use_relevance_groups=False)\
                            .batch_size(32)


Minimum number of relevance type(0) in the queries of the goldstandard sub set: 39
Mean number of relevance type(0) in the queries of the goldstandard sub set: 98.99969024721223
Minimum number of relevance type(1) in the queries of the goldstandard sub set: 1
Mean number of relevance type(1) in the queries of the goldstandard sub set: 1.0
Sub Collection size 2706193
Number of skipped question, due to lack of true positives 95828


In [85]:
next(t_collection.generator())

(array(['notice of working hours', 'what is the analyte in a titration',
        'where is hostess headquarters',
        "what's the cure for low blood pressure",
        'where is evine broadcast from',
        'how cold is new york city in march',
        'what is a substation inside a building', 'what is prostacyclin',
        'foods that are a natural antibiotic', 'when are chipmunks active',
        'what are the main ingredients for arepas',
        'what does a thermistor do', 'is get an intransitive verb',
        'definition of marketing cooperative', 'what months  is spring',
        'is singapore a country', 'who owns formula drift',
        'what is a clavichord?', 'what is webrtc', 'define counteract',
        'how many lbs is a stone', 'what is a computer motherboard',
        'does sugar pear help with cough',
        'who developed the theoretical basis for quantum physics? quizlet',
        'horizon zero dawn character creation',
        'is panda express in canada',


In [28]:
#HOT FIX!!!
black_list_ids = {'502557'}
index_to_removed = []
for i,x in enumerate(t_collection.query_list):
    if x["id"] in black_list_ids:
        index_to_removed.append(i)

for j in range(len(index_to_removed), 0, -1):
    del t_collection.query_list[index_to_removed[j-1]]

#t_collection.save("/backup/MS-MARCO/preprocess/train_collection_k100")

In [31]:
t_collection.save("/backup/MS-MARCO/preprocess/train_collection_k100")

In [None]:
t_collection = TrainCollectionV2.load("/backup/MS-MARCO/preprocess/train_collection_k100")

In [3]:
t_collection.sub_set_goldstandard['502557']

KeyError: '502557'

In [27]:
t_collection.query_list[index_to_removed[0]]

{'id': '502557', 'query': 'standingdefinition'}

In [29]:
q_ids = { q["id"] for q in t_collection.query_list}

not_found = [ k for k in q_ids if k not in t_collection.sub_set_goldstandard]

In [30]:
not_found

[]

# BUILD dev dataset



In [5]:
# load queries
queries = load_TREC_queries("/backup/MS-MARCO/msmarco-docdev-queries.tsv")
# read relevance
#goldstandard = load_TREC_qrels("/backup/MS-MARCO/msmarco-docdev-qrels.tsv")

pre_rank = load_prerank("/backup/MS-MARCO/msmarco-docdev-top100", collection, top_k=100)

999
average docs per topic 100.0 min: 100 max: 100


In [9]:
trec_evaluator = TREC_Evaluator("/backup/MS-MARCO/msmarco-docdev-qrels.tsv", '/backup/MS-MARCO/trec_eval-9.0.7/trec_eval')

t_collection = TestCollectionV2(queries, pre_rank, trec_evaluator)

In [13]:
print(t_collection.evaluate_pre_rerank())
t_collection.save("/backup/MS-MARCO/preprocess/dev_collection_k100")

Remove /tmp/tmpew4fwyg7


# BUILD test2019 dataset

In [16]:
# load queries
queries = load_TREC_queries("/backup/MS-MARCO/msmarco-test2019-queries.tsv")
# read relevance
#goldstandard = load_TREC_qrels("/backup/MS-MARCO/msmarco-docdev-qrels.tsv")

pre_rank = load_prerank("/backup/MS-MARCO/msmarco-doctest2019-top100", collection, top_k=100)


999
average docs per topic 100.0 min: 100 max: 100


In [17]:
trec_evaluator = TREC_Evaluator("/backup/MS-MARCO/2019qrels-docs.txt", '/backup/MS-MARCO/trec_eval-9.0.7/trec_eval')

t_collection = TestCollectionV2(queries, pre_rank, trec_evaluator)

In [18]:
print(t_collection.evaluate_pre_rerank())
t_collection.save("/backup/MS-MARCO/preprocess/val2019_collection_k100")

Remove /tmp/tmpjm92llm_
{'recall_100': 0.3871, 'map_cut_20': 0.1449, 'ndcg_cut_20': 0.4868, 'P_20': 0.4953}


In [2]:
# pre evaluate

dev = TestCollectionV2.load("/backup/MS-MARCO/preprocess/dev_collection_k100")
test2019 = TestCollectionV2.load("/backup/MS-MARCO/preprocess/val2019_collection_k100")

In [5]:
print(dev.evaluate_pre_rerank( output_metris=["recall_100", "ndcg_cut_10", "P_5"]))
print(test2019.evaluate_pre_rerank( output_metris=["recall_100", "ndcg_cut_10", "P_5"]))

Remove /tmp/tmp2zswu2ym
{'recall_100': 0.7564, 'ndcg_cut_10': 0.2627, 'P_5': 0.066}
Remove /tmp/tmpfmftmnm8
{'recall_100': 0.3871, 'ndcg_cut_10': 0.5164, 'P_5': 0.6605}
