## Install and imports

### install

In [None]:
!pip install bitsandbytes==0.43.1
!pip install transformers==4.40.2
!pip install peft==0.11.1
!pip install accelerate==0.30.1

!pip install transformers
!pip install pytrec_eval

Collecting bitsandbytes==0.43.1
  Downloading bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl.metadata (2.2 kB)
Downloading bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl (119.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.43.1
Collecting transformers==4.40.2
  Downloading transformers-4.40.2-py3-none-any.whl.metadata (137 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m138.0/138.0 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.20,>=0.19 (from transformers==4.40.2)
  Downloading tokenizers-0.19.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.40.2-py3-none-any.whl (9.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.0/9.0 MB[0m [31m80.3 MB/s[0m eta [36m0:00:00[0m:00:

### import

In [None]:
from shutil import copyfile
copyfile(src = "/kaggle/input/msmarco/2019qrels-pass.txt", dst = "../working/2019qrels-pass.txt")
copyfile(src = "/kaggle/input/msmarco/msmarco-passagetest2019-top1000.tsv", dst = "../working/msmarco-passagetest2019-top1000.tsv")
copyfile(src = "/kaggle/input/msmarco/msmarco-test2019-queries.tsv", dst = "../working/msmarco-test2019-queries.tsv")

'../working/msmarco-test2019-queries.tsv'

In [None]:
import os
import json
import tqdm
import sys

## Load model

In [None]:
from transformers import AutoTokenizer, LlamaForCausalLM, AutoModelForCausalLM

model_name = "HuggingFaceH4/zephyr-7b-beta"
tokenizer = AutoTokenizer.from_pretrained(model_name, truncation=True, padding=True, padding_side="left", maximum_length = 2048, model_max_length = 2048)
model = AutoModelForCausalLM.from_pretrained(model_name, load_in_4bit = True, device_map = 'auto')
tokenizer.pad_token = tokenizer.eos_token
model.generation_config.pad_token_id = model.generation_config.eos_token_id

tokenizer_config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/638 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/8 [00:00<?, ?it/s]

model-00001-of-00008.safetensors:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

model-00002-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00003-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00004-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00005-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00006-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00007-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00008-of-00008.safetensors:   0%|          | 0.00/816M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

## Query expansion

In [None]:
import gzip
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline, BertForSequenceClassification
from collections import defaultdict
import numpy as np

def load_queries(filename):
#     with gzip.open(filename, 'rt', encoding='utf-8') as f:
    return pd.read_csv(filename, sep='\t', names=['query_id', 'query'])

def load_docs(filename):
#     with gzip.open(filename, 'rt', encoding='utf-8') as f:
    return pd.read_csv(filename, sep='\t', names=['query_id', 'doc_id', 'doc'])

queries = load_queries('/kaggle/working/msmarco-test2019-queries.tsv')
docs = load_docs('/kaggle/working/msmarco-passagetest2019-top1000.tsv')

def load_qrels(filename):
    qrels = defaultdict(dict)
    with open(filename, 'r') as f:
        for line in f:
            qid, _, did, rel = line.split()
            qrels[qid][did] = int(rel)
    return qrels

qrels = load_qrels('/kaggle/working/2019qrels-pass.txt')

generator = pipeline('text-generation', model=model, tokenizer=tokenizer)

def generate_expanded_queries(queries):
    expanded_queries = {}
    for _, row in queries.iterrows():
        query_text = row['query']
        query_id = str(row['query_id'])
        
        input_text = f"Answer the following query:\n{query_text}\n\nGive the rationale before answering"
        print("input text:", input_text)
        
        expanded_query = generator(input_text, max_length=512, num_return_sequences=1)
        generated_text = expanded_query[0]['generated_text'].strip()
        
        generated_text = generated_text.replace(input_text, "").strip()

        concatenated_query = f"{query_text} {generated_text}"
        
        print("expanded query:", concatenated_query)
        expanded_queries[query_id] = concatenated_query
    return expanded_queries

expanded_queries = generate_expanded_queries(queries)

input text: Answer the following query:
what slows down the flow of blood

Give the rationale before answering
expanded query: what slows down the flow of blood the question.

The correct answer is:
A clot (thrombus) in the vessel.

A clot (thrombus) in the vessel slows down the flow of blood because it obstructs the lumen of the vessel, thereby reducing the cross-sectional area available for blood flow. This leads to an increase in the resistance to blood flow, which in turn causes a decrease in the velocity of blood flow. The clot also activates the coagulation cascade, leading to the formation of fibrin, which further increases the resistance to blood flow. The slowing down of blood flow can also lead to the formation of collateral vessels, which help to bypass the obstructed vessel and restore blood flow to the affected area.
input text: Answer the following query:
what is the county for grand rapids, mn

Give the rationale before answering
expanded query: what is the county for gr

In [None]:
expanded_queries

{'1108939': 'what slows down the flow of blood the question.\n\nThe correct answer is:\nA clot (thrombus) in the vessel.\n\nA clot (thrombus) in the vessel slows down the flow of blood because it obstructs the lumen of the vessel, thereby reducing the cross-sectional area available for blood flow. This leads to an increase in the resistance to blood flow, which in turn causes a decrease in the velocity of blood flow. The clot also activates the coagulation cascade, leading to the formation of fibrin, which further increases the resistance to blood flow. The slowing down of blood flow can also lead to the formation of collateral vessels, which help to bypass the obstructed vessel and restore blood flow to the affected area.',
 '1112389': 'what is the county for grand rapids, mn :\nTo answer this query, we need to know which city is being referred to. In this case, "grand rapids" could refer to Grand Rapids, Minnesota, or Grand Rapids, Michigan. Since the query doesn\'t provide any addit

In [None]:
import pickle
from transformers import AutoTokenizer
def truncate_queries(queries, tokenizer, max_length=512):
    truncated_queries = {}
    for qid, query in queries.items():
        tokens = tokenizer.encode(query, truncation=True, max_length=max_length)
        truncated_query = tokenizer.decode(tokens, skip_special_tokens=True)
        truncated_queries[str(qid)] = truncated_query
    return truncated_queries

sim_model_name = 'bert-base-uncased'
sim_tokenizer = AutoTokenizer.from_pretrained(sim_model_name)

truncated_expanded_queries = truncate_queries(expanded_queries, sim_tokenizer, max_length=512)

with open('truncated_expanded_queries.pkl', 'wb') as f:
    pickle.dump(truncated_expanded_queries, f)

print("Truncated expanded queries have been saved to truncated_expanded_queries.pkl")



tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Truncated expanded queries have been saved to truncated_expanded_queries.pkl
Loaded truncated expanded queries from truncated_expanded_queries.pkl


## Installation

In [1]:
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-2.7.0-py3-none-any.whl (171 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.5/171.5 kB[0m [31m688.8 kB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.11.0->sentence-transfo

In [2]:
!pip install pytrec_eval

Collecting pytrec_eval
  Downloading pytrec_eval-0.5.tar.gz (15 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pytrec_eval
  Building wheel for pytrec_eval (setup.py) ... [?25l[?25hdone
  Created wheel for pytrec_eval: filename=pytrec_eval-0.5-cp310-cp310-linux_x86_64.whl size=308205 sha256=c757041abed2bfb1d4bd6ae54926f09e6d13cb1cfb914b341962ab32869b3745
  Stored in directory: /root/.cache/pip/wheels/51/3a/cd/dcc1ddfc763987d5cb237165d8ac249aa98a23ab90f67317a8
Successfully built pytrec_eval
Installing collected packages: pytrec_eval
Successfully installed pytrec_eval-0.5


### Imports

In [3]:
"""
This examples show how to train a Cross-Encoder for the MS Marco dataset (https://github.com/microsoft/MSMARCO-Passage-Ranking).

The query and the passage are passed simoultanously to a Transformer network. The network then returns
a score between 0 and 1 how relevant the passage is for a given query.

The resulting Cross-Encoder can then be used for passage re-ranking: You retrieve for example 100 passages
for a given query, for example with ElasticSearch, and pass the query+retrieved_passage to the CrossEncoder
for scoring. You sort the results then according to the output of the CrossEncoder.

This gives a significant boost compared to out-of-the-box ElasticSearch / BM25 ranking.
"""
from torch.utils.data import DataLoader
from sentence_transformers import LoggingHandler, util
from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CERerankingEvaluator
from sentence_transformers import InputExample
from datetime import datetime
import gzip
import os
import tarfile
import tqdm
import logging
from collections import defaultdict
import numpy as np
import sys
import pytrec_eval
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
logging.basicConfig(format='%(asctime)s - %(message)s',datefmt='%Y-%m-%d %H:%M:%S')

## Evaluating preparation

### Initialize hyperparameters (e.g., batch size, etc)

In [5]:
from google.colab import drive
drive.mount('/content/gdrive')
base_path = "./gdrive/MyDrive/cross-encoder-reranker-ir-course-2023/"

Mounted at /content/gdrive


In [6]:
!mkdir -p $base_path

## Evaluate the model


### Load the fine-tuned model that you trained using the previous notebook. You need to set the path of your own fine-tuned model here.

In [15]:
model_save_path = "/content/gdrive/MyDrive/cross-encoder-reranker-ir-course-2023/finetuned_models/cross-encoder-cross-encoder-ms-marco-MiniLM-L-2-v2-2024-05-10_20-46-58" #@param {type:"string"}

### Load data (For evaluation on TREC DL'19)

In [8]:
!wget https://msmarco.z22.web.core.windows.net/msmarcoranking/queries.tar.gz
!tar -xvzf  queries.tar.gz

--2024-05-26 12:19:12--  https://msmarco.z22.web.core.windows.net/msmarcoranking/queries.tar.gz
Resolving msmarco.z22.web.core.windows.net (msmarco.z22.web.core.windows.net)... 20.150.34.1
Connecting to msmarco.z22.web.core.windows.net (msmarco.z22.web.core.windows.net)|20.150.34.1|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 18882551 (18M) [application/gzip]
Saving to: ‘queries.tar.gz’


2024-05-26 12:19:14 (16.1 MB/s) - ‘queries.tar.gz’ saved [18882551/18882551]

queries.dev.tsv
queries.eval.tsv
queries.train.tsv


In [19]:
import pickle
"""
This file evaluates CrossEncoder on the TREC 2019 Deep Learning (DL) Track: https://arxiv.org/abs/2003.07820

TREC 2019 DL is based on the corpus of MS Marco. MS Marco provides a sparse annotation, i.e., usually only a single
passage is marked as relevant for a given query. Many other highly relevant passages are not annotated and hence are treated
as an error if a model ranks those high.

TREC DL instead annotated up to 200 passages per query for their relevance to a given query. It is better suited to estimate
the model performance for the task of reranking in Information Retrieval.

Run:
python eval_cross-encoder-trec-dl.py cross-encoder-model-name

"""


data_folder = 'trec2019-data'
os.makedirs(data_folder, exist_ok=True)

#Read test queries
# queries = {}
# queries_filepath = os.path.join(data_folder, 'msmarco-test2019-queries.tsv.gz')
# if not os.path.exists(queries_filepath):
#     logging.info("Download "+os.path.basename(queries_filepath))
#     util.http_get('https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco-test2019-queries.tsv.gz', queries_filepath)

# with gzip.open(queries_filepath, 'rt', encoding='utf8') as fIn:
#     for line in fIn:
#         qid, query = line.strip().split("\t")
#         queries[qid] = query
with open('truncated_expanded_queries.pkl', 'rb') as f:
    queries = pickle.load(f)


#Read which passages are relevant
relevant_docs = defaultdict(lambda: defaultdict(int))
qrels_filepath = os.path.join(data_folder, '2019qrels-pass.txt')

if not os.path.exists(qrels_filepath):
    logging.info("Download "+os.path.basename(qrels_filepath))
    util.http_get('https://trec.nist.gov/data/deep/2019qrels-pass.txt', qrels_filepath)


with open(qrels_filepath) as fIn:
    for line in fIn:
        qid, _, pid, score = line.strip().split()
        score = int(score)
        if score > 0:
            relevant_docs[qid][pid] = score

# Only use queries that have at least one relevant passage
relevant_qid = []
for qid in queries:
    if len(relevant_docs[qid]) > 0:
        relevant_qid.append(qid)


# Read the top 1000 passages that are supposed to be re-ranked
passage_filepath = os.path.join(data_folder, 'msmarco-passagetest2019-top1000.tsv.gz')

if not os.path.exists(passage_filepath):
    logging.info("Download "+os.path.basename(passage_filepath))
    util.http_get('https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco-passagetest2019-top1000.tsv.gz', passage_filepath)



passage_cand = {}
with gzip.open(passage_filepath, 'rt', encoding='utf8') as fIn:
    for line in fIn:
        qid, pid, query, passage = line.strip().split("\t")
        if qid not in passage_cand:
            passage_cand[qid] = []

        passage_cand[qid].append([pid, passage])

logging.info("Queries: {}".format(len(queries)))


INFO:root:Queries: 200


## Prediction

### Mini

In [20]:
queries_result_list = []
run = {}
model = CrossEncoder(model_save_path, max_length=512)

for qid in tqdm.tqdm(relevant_qid):
    query = queries[qid]

    cand = passage_cand[qid]
    pids = [c[0] for c in cand]
    corpus_sentences = [c[1] for c in cand]

    cross_inp = [[query, sent] for sent in corpus_sentences]

    if model.config.num_labels > 1: #Cross-Encoder that predict more than 1 score, we use the last and apply softmax
        cross_scores = model.predict(cross_inp, apply_softmax=True)[:, 1].tolist()
    else:
        cross_scores = model.predict(cross_inp).tolist()

    cross_scores_sparse = {}
    for idx, pid in enumerate(pids):
        cross_scores_sparse[pid] = cross_scores[idx]

    sparse_scores = cross_scores_sparse
    run[qid] = {}
    for pid in sparse_scores:
        run[qid][pid] = float(sparse_scores[pid])

INFO:sentence_transformers.cross_encoder.CrossEncoder:Use pytorch device: cuda
  0%|          | 0/43 [00:00<?, ?it/s]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

  2%|▏         | 1/43 [00:01<01:00,  1.44s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

  5%|▍         | 2/43 [00:03<01:15,  1.84s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

  7%|▋         | 3/43 [00:05<01:07,  1.68s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

  9%|▉         | 4/43 [00:07<01:14,  1.90s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 12%|█▏        | 5/43 [00:09<01:10,  1.85s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 14%|█▍        | 6/43 [00:10<00:59,  1.60s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 16%|█▋        | 7/43 [00:11<00:50,  1.40s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 19%|█▊        | 8/43 [00:12<00:43,  1.25s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 21%|██        | 9/43 [00:13<00:49,  1.46s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 23%|██▎       | 10/43 [00:14<00:42,  1.29s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 26%|██▌       | 11/43 [00:16<00:46,  1.45s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 28%|██▊       | 12/43 [00:18<00:44,  1.43s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 30%|███       | 13/43 [00:19<00:42,  1.41s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 33%|███▎      | 14/43 [00:20<00:41,  1.44s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 35%|███▍      | 15/43 [00:22<00:37,  1.35s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 37%|███▋      | 16/43 [00:23<00:36,  1.36s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 40%|███▉      | 17/43 [00:24<00:34,  1.32s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 42%|████▏     | 18/43 [00:26<00:36,  1.47s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 44%|████▍     | 19/43 [00:27<00:30,  1.27s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 47%|████▋     | 20/43 [00:29<00:33,  1.44s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 49%|████▉     | 21/43 [00:30<00:28,  1.30s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 51%|█████     | 22/43 [00:31<00:26,  1.27s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 53%|█████▎    | 23/43 [00:32<00:26,  1.31s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 56%|█████▌    | 24/43 [00:36<00:37,  1.98s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 58%|█████▊    | 25/43 [00:38<00:36,  2.01s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 60%|██████    | 26/43 [00:40<00:33,  1.97s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 63%|██████▎   | 27/43 [00:43<00:39,  2.45s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 65%|██████▌   | 28/43 [00:47<00:41,  2.74s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 67%|██████▋   | 29/43 [00:49<00:37,  2.66s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 70%|██████▉   | 30/43 [00:52<00:34,  2.66s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 72%|███████▏  | 31/43 [00:53<00:26,  2.23s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 74%|███████▍  | 32/43 [00:54<00:20,  1.85s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 79%|███████▉  | 34/43 [00:56<00:13,  1.50s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 84%|████████▎ | 36/43 [00:57<00:07,  1.08s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 86%|████████▌ | 37/43 [00:58<00:06,  1.04s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 88%|████████▊ | 38/43 [00:59<00:05,  1.03s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 91%|█████████ | 39/43 [01:00<00:03,  1.02it/s]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 93%|█████████▎| 40/43 [01:01<00:03,  1.14s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 95%|█████████▌| 41/43 [01:03<00:02,  1.17s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 98%|█████████▊| 42/43 [01:04<00:01,  1.32s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

100%|██████████| 43/43 [01:06<00:00,  1.55s/it]


### Distil

In [13]:
queries_result_list = []
run = {}
model = CrossEncoder(model_save_path, max_length=512)

for qid in tqdm.tqdm(relevant_qid):
    query = queries[qid]

    cand = passage_cand[qid]
    pids = [c[0] for c in cand]
    corpus_sentences = [c[1] for c in cand]

    cross_inp = [[query, sent] for sent in corpus_sentences]

    if model.config.num_labels > 1: #Cross-Encoder that predict more than 1 score, we use the last and apply softmax
        cross_scores = model.predict(cross_inp, apply_softmax=True)[:, 1].tolist()
    else:
        cross_scores = model.predict(cross_inp).tolist()

    cross_scores_sparse = {}
    for idx, pid in enumerate(pids):
        cross_scores_sparse[pid] = cross_scores[idx]

    sparse_scores = cross_scores_sparse
    run[qid] = {}
    for pid in sparse_scores:
        run[qid][pid] = float(sparse_scores[pid])

INFO:sentence_transformers.cross_encoder.CrossEncoder:Use pytorch device: cuda
  0%|          | 0/43 [00:00<?, ?it/s]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

  2%|▏         | 1/43 [00:10<07:38, 10.92s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

  5%|▍         | 2/43 [00:25<08:57, 13.11s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

  7%|▋         | 3/43 [00:33<07:13, 10.84s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

  9%|▉         | 4/43 [00:45<07:19, 11.26s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 12%|█▏        | 5/43 [00:57<07:19, 11.57s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 14%|█▍        | 6/43 [01:06<06:28, 10.49s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 16%|█▋        | 7/43 [01:14<05:49,  9.71s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 19%|█▊        | 8/43 [01:22<05:18,  9.10s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 21%|██        | 9/43 [01:38<06:27, 11.39s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 23%|██▎       | 10/43 [01:46<05:46, 10.50s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 26%|██▌       | 11/43 [02:03<06:36, 12.40s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 28%|██▊       | 12/43 [02:12<05:50, 11.31s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 30%|███       | 13/43 [02:19<05:03, 10.13s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 33%|███▎      | 14/43 [02:28<04:38,  9.61s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 35%|███▍      | 15/43 [02:36<04:21,  9.33s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 37%|███▋      | 16/43 [02:50<04:43, 10.51s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 40%|███▉      | 17/43 [03:01<04:36, 10.64s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 42%|████▏     | 18/43 [03:19<05:20, 12.83s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 44%|████▍     | 19/43 [03:25<04:24, 11.02s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 47%|████▋     | 20/43 [03:44<05:02, 13.14s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 49%|████▉     | 21/43 [03:52<04:21, 11.88s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 51%|█████     | 22/43 [04:04<04:06, 11.73s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 53%|█████▎    | 23/43 [04:12<03:32, 10.60s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 56%|█████▌    | 24/43 [04:24<03:33, 11.23s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 58%|█████▊    | 25/43 [04:33<03:05, 10.32s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 60%|██████    | 26/43 [04:43<02:54, 10.27s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 63%|██████▎   | 27/43 [05:01<03:24, 12.77s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 65%|██████▌   | 28/43 [05:20<03:38, 14.55s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 67%|██████▋   | 29/43 [05:30<03:03, 13.13s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 70%|██████▉   | 30/43 [05:39<02:35, 11.99s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 72%|███████▏  | 31/43 [05:49<02:16, 11.35s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 74%|███████▍  | 32/43 [05:57<01:54, 10.44s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 77%|███████▋  | 33/43 [05:58<01:14,  7.42s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 79%|███████▉  | 34/43 [06:17<01:37, 10.83s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 84%|████████▎ | 36/43 [06:24<00:52,  7.55s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 86%|████████▌ | 37/43 [06:32<00:45,  7.65s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 88%|████████▊ | 38/43 [06:41<00:39,  7.94s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 91%|█████████ | 39/43 [06:49<00:32,  8.05s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 93%|█████████▎| 40/43 [07:05<00:30, 10.20s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 95%|█████████▌| 41/43 [07:13<00:19,  9.64s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 98%|█████████▊| 42/43 [07:24<00:09,  9.95s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

100%|██████████| 43/43 [07:38<00:00, 10.65s/it]


### Tiny

In [None]:
queries_result_list = []
run = {}
model = CrossEncoder(model_save_path, max_length=512)

for qid in tqdm.tqdm(relevant_qid):
    query = queries[qid]

    cand = passage_cand[qid]
    pids = [c[0] for c in cand]
    corpus_sentences = [c[1] for c in cand]

    cross_inp = [[query, sent] for sent in corpus_sentences]

    if model.config.num_labels > 1: #Cross-Encoder that predict more than 1 score, we use the last and apply softmax
        cross_scores = model.predict(cross_inp, apply_softmax=True)[:, 1].tolist()
    else:
        cross_scores = model.predict(cross_inp).tolist()

    cross_scores_sparse = {}
    for idx, pid in enumerate(pids):
        cross_scores_sparse[pid] = cross_scores[idx]

    sparse_scores = cross_scores_sparse
    run[qid] = {}
    for pid in sparse_scores:
        run[qid][pid] = float(sparse_scores[pid])

INFO:sentence_transformers.cross_encoder.CrossEncoder:Use pytorch device: cpu
  0%|          | 0/43 [00:00<?, ?it/s]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

  2%|▏         | 1/43 [00:15<10:54, 15.59s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

  5%|▍         | 2/43 [00:43<15:34, 22.78s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

  7%|▋         | 3/43 [00:55<11:50, 17.77s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

  9%|▉         | 4/43 [01:14<11:54, 18.33s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 12%|█▏        | 5/43 [01:41<13:30, 21.32s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 14%|█▍        | 6/43 [01:53<11:18, 18.34s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 16%|█▋        | 7/43 [02:04<09:34, 15.96s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 19%|█▊        | 8/43 [02:15<08:27, 14.49s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 21%|██        | 9/43 [02:45<10:55, 19.27s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 23%|██▎       | 10/43 [02:57<09:14, 16.80s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 26%|██▌       | 11/43 [03:22<10:26, 19.59s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 28%|██▊       | 12/43 [03:35<08:56, 17.32s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 30%|███       | 13/43 [03:45<07:37, 15.27s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 33%|███▎      | 14/43 [03:55<06:37, 13.69s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 35%|███▍      | 15/43 [04:07<06:04, 13.04s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 37%|███▋      | 16/43 [04:26<06:41, 14.88s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 40%|███▉      | 17/43 [04:41<06:30, 15.01s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 42%|████▏     | 18/43 [05:09<07:53, 18.95s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 44%|████▍     | 19/43 [05:19<06:24, 16.04s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 47%|████▋     | 20/43 [05:51<08:00, 20.87s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 49%|████▉     | 21/43 [06:02<06:35, 17.98s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 51%|█████     | 22/43 [06:19<06:12, 17.72s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 53%|█████▎    | 23/43 [06:28<05:04, 15.22s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 56%|█████▌    | 24/43 [06:46<05:05, 16.08s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 58%|█████▊    | 25/43 [06:58<04:22, 14.56s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 60%|██████    | 26/43 [07:11<04:00, 14.15s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 63%|██████▎   | 27/43 [07:40<05:00, 18.78s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 65%|██████▌   | 28/43 [08:10<05:31, 22.09s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 67%|██████▋   | 29/43 [08:24<04:33, 19.56s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 70%|██████▉   | 30/43 [08:33<03:35, 16.57s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 72%|███████▏  | 31/43 [08:48<03:11, 15.96s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 74%|███████▍  | 32/43 [08:59<02:40, 14.57s/it]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

 77%|███████▋  | 33/43 [09:00<01:43, 10.34s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 79%|███████▉  | 34/43 [09:29<02:25, 16.14s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

 81%|████████▏ | 35/43 [09:29<01:30, 11.33s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 84%|████████▎ | 36/43 [09:40<01:17, 11.00s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 86%|████████▌ | 37/43 [09:49<01:02, 10.47s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 88%|████████▊ | 38/43 [10:01<00:55, 11.09s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 91%|█████████ | 39/43 [10:12<00:43, 10.91s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 93%|█████████▎| 40/43 [10:33<00:41, 13.96s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 95%|█████████▌| 41/43 [10:43<00:25, 12.83s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

 98%|█████████▊| 42/43 [10:57<00:12, 13.00s/it]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

100%|██████████| 43/43 [11:14<00:00, 15.69s/it]


## Evaluation

### Mini

In [21]:
evaluator = pytrec_eval.RelevanceEvaluator(relevant_docs, {'ndcg_cut.10', 'recall_100', 'map_cut.1000'})
scores = evaluator.evaluate(run)

print("Queries:", len(relevant_qid))
print("NDCG@10: {:.2f}".format(np.mean([ele["ndcg_cut_10"] for ele in scores.values()])*100))
print("Recall@100: {:.2f}".format(np.mean([ele["recall_100"] for ele in scores.values()])*100))
print("MAP@1000: {:.2f}".format(np.mean([ele["map_cut_1000"] for ele in scores.values()])*100))

Queries: 43
NDCG@10: 50.98
Recall@100: 43.58
MAP@1000: 36.93


### Distil

In [14]:
evaluator = pytrec_eval.RelevanceEvaluator(relevant_docs, {'ndcg_cut.10', 'recall_100', 'map_cut.1000'})
scores = evaluator.evaluate(run)

print("Queries:", len(relevant_qid))
print("NDCG@10: {:.2f}".format(np.mean([ele["ndcg_cut_10"] for ele in scores.values()])*100))
print("Recall@100: {:.2f}".format(np.mean([ele["recall_100"] for ele in scores.values()])*100))
print("MAP@1000: {:.2f}".format(np.mean([ele["map_cut_1000"] for ele in scores.values()])*100))

Queries: 43
NDCG@10: 11.61
Recall@100: 13.19
MAP@1000: 9.96


### Tiny

In [None]:
evaluator = pytrec_eval.RelevanceEvaluator(relevant_docs, {'ndcg_cut.10', 'recall_100', 'map_cut.1000'})
scores = evaluator.evaluate(run)

print("Queries:", len(relevant_qid))
print("NDCG@10: {:.2f}".format(np.mean([ele["ndcg_cut_10"] for ele in scores.values()])*100))
print("Recall@100: {:.2f}".format(np.mean([ele["recall_100"] for ele in scores.values()])*100))
print("MAP@1000: {:.2f}".format(np.mean([ele["map_cut_1000"] for ele in scores.values()])*100))

Queries: 43
NDCG@10: 52.23
Recall@100: 42.69
MAP@1000: 36.41


## Sorting candidate documents of each query based on their relevance score

In [None]:
import operator
for qid in run.keys():
  run[qid] = sorted(run[qid].items(), key=operator.itemgetter(1), reverse = True)

## Storing ranking run file

In [None]:
ranking_lines = []
for qid in run.keys():
  for rank, did_pred_score in enumerate(run[qid]):
    did, pred_score = did_pred_score
    line = "{qid} Q0 {did} {rank} {pred_score} STANDARD".format(qid=qid, did=did, rank=rank, pred_score=str(pred_score))
    ranking_lines.append(line)

In [None]:
ranking_run_file_path = model_save_path + "ranking.run"
f_w = open(ranking_run_file_path, "w+")
f_w.write("\n".join(ranking_lines))
f_w.close()

### Print the first three lines of the stored ranking run file

#### Mini

In [None]:
!head -n 3 $ranking_run_file_path

156493 Q0 95512 0 -0.40828004479408264 STANDARD
156493 Q0 2717163 1 -0.5530328154563904 STANDARD
156493 Q0 7735245 2 -0.7408422231674194 STANDARD


#### Distil

In [None]:
!head -n 3 $ranking_run_file_path

156493 Q0 4446860 0 0.8504350781440735 STANDARD
156493 Q0 5063611 1 0.8445623517036438 STANDARD
156493 Q0 6352009 2 0.8254523277282715 STANDARD


### Tiny

In [None]:
!head -n 3 $ranking_run_file_path

156493 Q0 2717163 0 0.3954322934150696 STANDARD
156493 Q0 8245442 1 0.0637449324131012 STANDARD
156493 Q0 6859026 2 -0.2420291304588318 STANDARD
