In [None]:
import random
import re
import pickle
import ir_datasets
from rerankers import Reranker
from rank_bm25 import BM25Okapi

In [4]:
random.seed(42)

def load_dataset(input_file):
    with open(input_file, 'rb') as f:
        return pickle.load(f)

data_set = "../subset_msmarco_train_0/subset_msmarco_train_0.01_99.pkl"

data = load_dataset(data_set)
queries = data["queries"]

# Split the queries (assuming queries is a dictionary of {query_id: query_object})
query_ids = list(queries.keys())  # List of query IDs

# Shuffle query IDs to ensure a random split
random.shuffle(query_ids)
# print(query_ids)

# Split into 80% for training, 20% for validation
split_ratio = 0.8
train_query_ids = query_ids[:int(len(query_ids) * split_ratio)]
test_query_ids = query_ids[int(len(query_ids) * split_ratio):]

train_queries = {qid: queries[qid] for qid in train_query_ids}
test_queries = {qid: queries[qid] for qid in test_query_ids}

In [32]:
train_query_ids

['577486',
 '894214',
 '83532',
 '1027860',
 '894315',
 '386975',
 '690224',
 '332565',
 '553697',
 '13745',
 '1037288',
 '985467',
 '413374',
 '228988',
 '850291',
 '1073337',
 '693706',
 '723155',
 '954237',
 '212615',
 '828505',
 '1158832',
 '451945',
 '575719',
 '845200',
 '805805',
 '407420',
 '290696',
 '59541',
 '354065',
 '158376',
 '6487',
 '965214',
 '567679',
 '909371',
 '273905',
 '135841',
 '779106',
 '120644',
 '190202',
 '774674',
 '524050',
 '1169672',
 '784395',
 '711237',
 '338471',
 '169856',
 '927144',
 '78004',
 '996046',
 '695084',
 '277917',
 '585590',
 '988331',
 '698641',
 '135503',
 '769734',
 '802614',
 '493342',
 '84077',
 '550872',
 '579224',
 '580476',
 '653415',
 '622608',
 '607005',
 '74804',
 '341459',
 '1162635',
 '196289',
 '523600',
 '739208',
 '141297',
 '559081',
 '334135',
 '763952',
 '1070168',
 '497885',
 '171040',
 '47474',
 '747861',
 '229242',
 '440388',
 '608577',
 '43948',
 '724525',
 '139322',
 '533137',
 '878655',
 '865186',
 '80310',
 '2

In [3]:
relevant_docs = dict()
for qrel in data['qrels']:
    relevant_docs[qrel.query_id] = relevant_docs.get(qrel.query_id, []) + [qrel.doc_id]

In [4]:
train_docs = set()
for qid in train_query_ids:
    train_docs.update(relevant_docs[qid])

In [26]:
rev_1 = 0
rev_other = 0
query_id_set = set()

for qrel in data["qrels"]:
    if qrel.query_id in query_id_set:
        print("Duplicate query id: ", qrel.query_id)
    query_id_set.add(qrel.query_id)
    if qrel.relevance == 1:
        rev_1 += 1
    else:
        rev_other += 1
        
print("Relevance 1: ", rev_1)
print("Relevance other: ", rev_other)

Duplicate query id:  1165821
Duplicate query id:  430794
Duplicate query id:  462410
Duplicate query id:  756015
Duplicate query id:  454618
Duplicate query id:  614608
Duplicate query id:  573516
Duplicate query id:  721983
Duplicate query id:  580005
Duplicate query id:  588915
Duplicate query id:  588101
Duplicate query id:  529620
Duplicate query id:  661867
Duplicate query id:  178957
Duplicate query id:  458592
Duplicate query id:  689201
Duplicate query id:  593559
Duplicate query id:  925312
Duplicate query id:  489925
Duplicate query id:  200602
Duplicate query id:  445444
Duplicate query id:  91618
Duplicate query id:  589031
Duplicate query id:  565738
Duplicate query id:  879782
Duplicate query id:  587468
Duplicate query id:  522699
Duplicate query id:  581966
Duplicate query id:  1185196
Duplicate query id:  592097
Duplicate query id:  678265
Duplicate query id:  214908
Duplicate query id:  875220
Duplicate query id:  1034247
Duplicate query id:  587886
Duplicate query id

In [10]:
for value in data["docs"].values():
    print(value.text)
    break

Summary. Factor VII deficiency is a rare genetic bleeding disorder characterized by a deficiency or reduced activity of clotting factor VII. Clotting factors are specialized proteins that are essential for the blood to clot normally. Individuals with factor VII deficiency can experience prolonged, uncontrolled bleeding episodes.


In [5]:
def calculateMRR(query, ranking):
    for i, doc in enumerate(ranking):
        if doc[0] in relevant_docs[query]:
            return 1/(i+1)
    return 0

In [6]:
def getRelevantDocTexts(query_id):
    relevant_doc_texts = []
    for doc_id in relevant_docs[query_id]:
        relevant_doc_texts.append(data["docs"][doc_id].text)
    return relevant_doc_texts

In [7]:
def preprocess(text):
    return re.sub(r"[^\w\s]", "", text).lower().split()

In [8]:
tokenized_corpus = [preprocess(doc.text) for doc in data["docs"].values()]
bm25 = BM25Okapi(tokenized_corpus)

In [12]:
query_id = train_query_ids[3]
query = queries[query_id].text
tokenized_query = query.lower().split()
doc_scores = bm25.get_scores(tokenized_query)
doc_ranking = sorted(zip(data["docs"].keys(), doc_scores), key=lambda x: x[1], reverse=True)
print(calculateMRR(query_id, doc_ranking))

1.0


In [58]:
print(query)
print(getRelevantDocTexts(query_id))

who founded dynon avionics
['Some estimates peg market share at an astounding 70%. Dynon Avionics was founded in 2000 by John Torode, a pilot who keeps his Murphy Moose and de Havilland Beaver docked in front of his home on Lake Washington, not far from the University of Washington campus where he earned a Ph.D. in computer science. Founder John Torode working on his Moose.']


In [9]:
def baseline_model(query_id):
    query = queries[query_id].text
    tokenized_query = query.lower().split()
    doc_scores = bm25.get_scores(tokenized_query)
    doc_ranking = sorted(zip(data["docs"].keys(), doc_scores), key=lambda x: x[1], reverse=True)
    return doc_ranking

In [None]:
average_mrr = 0
for query_id in test_query_ids:
    doc_ranking = baseline_model(query_id)
    average_mrr += calculateMRR(query_id, doc_ranking)
average_mrr /= len(test_query_ids)
print(average_mrr)

0.4742921013594222


In [24]:
ranker = Reranker('cross-encoder', device='cuda')
def model(query_id):
    query = queries[query_id].text
    tokenized_query = query.lower().split()
    doc_scores = bm25.get_scores(tokenized_query)
    doc_ranking = sorted(zip(data["docs"].keys(), doc_scores), key=lambda x: x[1], reverse=True)
    top_10 = doc_ranking[:10]
    top_10_ids = [doc_id for doc_id, score in top_10]
    top_10_texts = [data["docs"][doc_id].text for doc_id in top_10_ids]
    reranked = ranker.rank(query=query, docs=top_10_texts, doc_ids=top_10_ids)
    doc_ids = [result.doc_id for result in reranked]
    scores = [result.score for result in reranked]
    doc_ranking = list(zip(doc_ids, scores))
    return doc_ranking

Loading default cross-encoder model for language en
Default Model: mixedbread-ai/mxbai-rerank-base-v1
Loading TransformerRanker model mixedbread-ai/mxbai-rerank-base-v1 (this message can be suppressed by setting verbose=0)
No dtype set
Using dtype torch.float32
Loaded model mixedbread-ai/mxbai-rerank-base-v1
Using device cuda.
Using dtype torch.float32.


In [23]:
print(len(test_query_ids))

555


In [25]:
rank2 = model(query_id)
print(rank2)

[('msmarco_passage_23_236994252', 4.24080753326416), ('msmarco_passage_04_275656991', -4.4532623291015625), ('msmarco_passage_03_635028611', -4.933216094970703), ('msmarco_passage_03_181922923', -5.235662460327148), ('msmarco_passage_04_608482928', -5.652072906494141), ('msmarco_passage_02_6983551', -5.735648155212402), ('msmarco_passage_04_228105861', -6.542544364929199), ('msmarco_passage_01_261973346', -6.725177764892578), ('msmarco_passage_04_204127061', -6.737245559692383), ('msmarco_passage_02_737081483', -6.9050703048706055)]


In [20]:
rank2.results[0].score

4.24080753326416

In [14]:
print(query)

who founded dynon avionics


In [26]:
average_mrr2 = 0
for query_id in test_query_ids:
    doc_ranking = model(query_id)
    average_mrr2 += calculateMRR(query_id, doc_ranking)
average_mrr2 /= len(test_query_ids)
print(average_mrr2)

0.5828528528528527
