## Install and imports

### install

In [None]:
# Version-pinned installs of the packages needed by the model-loading cells.
!pip install bitsandbytes==0.43.1
!pip install transformers==4.40.2
!pip install peft==0.11.1
!pip install accelerate==0.30.1

# NOTE(review): transformers is requested a second time here without a version pin,
# right after the pinned 4.40.2 install above — confirm this line is still needed.
!pip install transformers
# pytrec_eval is imported by the evaluation part of the notebook.
!pip install pytrec_eval

### import

In [None]:
from shutil import copyfile

# Stage the Kaggle input files at the absolute location the loader cells read from
# ('/kaggle/working/...'). The destination used to be the cwd-relative '../working/',
# which only resolved to the same folder when the kernel happened to start in /kaggle/working.
INPUT_DIR = "/kaggle/input/msmarco"
WORKING_DIR = "/kaggle/working"

for file_name in (
    "2019qrels-pass.txt",
    "msmarco-passagetest2019-top1000.tsv",
    "msmarco-test2019-queries.tsv",
):
    copyfile(src=f"{INPUT_DIR}/{file_name}", dst=f"{WORKING_DIR}/{file_name}")

In [None]:
import os
import json
import tqdm
import sys

## Load model

In [None]:
from transformers import AutoTokenizer, LlamaForCausalLM, AutoModelForCausalLM

# Checkpoint identifier shared by the tokenizer and the model loaders below.
model_name = "HuggingFaceH4/zephyr-7b-beta"
# NOTE(review): truncation, padding and maximum_length look like call-time tokenizer options
# rather than loader options — confirm whether from_pretrained actually consumes them here.
tokenizer = AutoTokenizer.from_pretrained(model_name, truncation=True, padding=True, padding_side="left", maximum_length = 2048, model_max_length = 2048)
# Load the causal LM with the load_in_4bit flag set (presumably relying on the bitsandbytes
# package installed above — verify); device placement is delegated to device_map='auto'.
model = AutoModelForCausalLM.from_pretrained(model_name, load_in_4bit = True, device_map = 'auto')
# Reuse the end-of-sequence token as the padding token, both in the tokenizer and in the
# model's generation config.
tokenizer.pad_token = tokenizer.eos_token
model.generation_config.pad_token_id = model.generation_config.eos_token_id

## Query expansion

In [None]:
import gzip
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline, BertForSequenceClassification
from collections import defaultdict
import numpy as np

def load_queries(filename):
    """Read a header-less, tab-separated query file into a DataFrame.

    The columns of the returned frame are labelled ``query_id`` and ``query``.
    """
    column_names = ['query_id', 'query']
    query_frame = pd.read_csv(filename, sep='\t', names=column_names)
    return query_frame

def load_docs(filename):
    """Read the header-less, tab-separated top-1000 candidate file into a DataFrame.

    Each row of the file holds four fields: query id, passage id, query text and
    passage text (the same layout the evaluation cell unpacks line by line).
    All four columns are named here; with only three names pandas would use the
    first column as the index and shift every label one column to the right.

    Returns a DataFrame with columns ``query_id``, ``doc_id``, ``query`` and ``doc``.
    """
    return pd.read_csv(filename, sep='\t', names=['query_id', 'doc_id', 'query', 'doc'])

# Read the 2019 MS MARCO test queries and the top-1000 candidate passages from the
# Kaggle working directory. Only `queries` is consumed by the cells visible below.
queries = load_queries('/kaggle/working/msmarco-test2019-queries.tsv')
docs = load_docs('/kaggle/working/msmarco-passagetest2019-top1000.tsv')

def load_qrels(filename):
    """Parse a whitespace-separated qrels file into ``{query_id: {doc_id: relevance}}``.

    Every non-blank line must hold four fields: query id, an ignored second
    column, document id and an integer relevance label. Blank lines (for example
    a trailing empty line) are skipped instead of failing on unpacking.

    Returns:
        A ``defaultdict(dict)`` keyed by the query id string; the inner dict maps
        document id strings to integer labels.

    Raises:
        ValueError: if a non-blank line does not have exactly four fields or its
            label is not an integer.
    """
    qrels = defaultdict(dict)
    with open(filename, 'r') as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Nothing on this line; the 4-way unpack below would raise otherwise.
                continue
            qid, _, did, rel = fields
            qrels[qid][did] = int(rel)
    return qrels

# Relevance judgements, as {query_id: {doc_id: label}}.
qrels = load_qrels('/kaggle/working/2019qrels-pass.txt')

# Text-generation pipeline wrapping the model and tokenizer loaded in the "Load model" section;
# generate_expanded_queries reads this module-level name.
generator = pipeline('text-generation', model=model, tokenizer=tokenizer)

def generate_expanded_queries(queries):
    """Expand every query with text produced by the module-level ``generator`` pipeline.

    For each row of ``queries`` a prompt asking for an answer plus rationale is
    built, the pipeline generates a continuation, and the original query text is
    concatenated with that continuation.

    Args:
        queries: DataFrame with ``query_id`` and ``query`` columns.

    Returns:
        dict mapping ``str(query_id)`` to ``"<query> <generated text>"``.
    """
    expanded_queries = {}
    for _, row in queries.iterrows():
        query_text = row['query']
        query_id = str(row['query_id'])

        input_text = f"Answer the following query:\n{query_text}\n\nGive the rationale before answering"
        print("input text:", input_text)

        # return_full_text=False asks the pipeline for the continuation only. The previous
        # str.replace(input_text, "") left the prompt in place whenever the decoded output
        # did not reproduce it character for character, and also removed any later
        # repetition of the prompt inside the generated answer.
        outputs = generator(
            input_text,
            max_length=512,
            num_return_sequences=1,
            return_full_text=False,
        )
        generated_text = outputs[0]['generated_text'].strip()

        concatenated_query = f"{query_text} {generated_text}"

        print("expanded query:", concatenated_query)
        expanded_queries[query_id] = concatenated_query
    return expanded_queries

# Run the expansion over every loaded query (one generator call per row, with progress printed).
expanded_queries = generate_expanded_queries(queries)

In [None]:
# Inspect the expansions: as the last expression of the cell, the whole mapping is displayed.
expanded_queries

In [None]:
import pickle
from transformers import AutoTokenizer
def truncate_queries(queries, tokenizer, max_length=512):
    """Clip every query to at most ``max_length`` tokens of ``tokenizer``.

    Each query is encoded with truncation enabled and decoded again with special
    tokens skipped, so the returned text is whatever the tokenizer's
    encode/decode round trip yields.

    Args:
        queries: mapping of query id to query text.
        tokenizer: object exposing ``encode`` and ``decode``.
        max_length: token budget handed to ``encode``.

    Returns:
        dict mapping ``str(query_id)`` to the round-tripped query text.
    """
    def _clip(text):
        token_ids = tokenizer.encode(text, truncation=True, max_length=max_length)
        return tokenizer.decode(token_ids, skip_special_tokens=True)

    return {str(query_id): _clip(text) for query_id, text in queries.items()}

# Tokenizer used only to measure and clip the expanded queries.
sim_model_name = 'bert-base-uncased'
sim_tokenizer = AutoTokenizer.from_pretrained(sim_model_name)

# NOTE(review): the queries are clipped to 512 tokens, the same max_length later given to the
# CrossEncoder — presumably a query near that limit leaves little room for the passage; confirm.
truncated_expanded_queries = truncate_queries(expanded_queries, sim_tokenizer, max_length=512)

# Persist the result in the current working directory; the evaluation part of the notebook
# reads this file back by the same relative name.
with open('truncated_expanded_queries.pkl', 'wb') as f:
    pickle.dump(truncated_expanded_queries, f)

print("Truncated expanded queries have been saved to truncated_expanded_queries.pkl")



## Installation

In [None]:
!pip install sentence-transformers

In [None]:
!pip install pytrec_eval

### Imports

In [None]:
"""
This example shows how to train a Cross-Encoder for the MS Marco dataset (https://github.com/microsoft/MSMARCO-Passage-Ranking).

The query and the passage are passed simultaneously to a Transformer network. The network then returns
a score between 0 and 1 indicating how relevant the passage is for a given query.

The resulting Cross-Encoder can then be used for passage re-ranking: You retrieve for example 100 passages
for a given query, for example with ElasticSearch, and pass the query+retrieved_passage to the CrossEncoder
for scoring. You sort the results then according to the output of the CrossEncoder.

This gives a significant boost compared to out-of-the-box ElasticSearch / BM25 ranking.
"""
from torch.utils.data import DataLoader
from sentence_transformers import LoggingHandler, util
from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CERerankingEvaluator
from sentence_transformers import InputExample
from datetime import datetime
import gzip
import os
import tarfile
import tqdm
import logging
from collections import defaultdict
import numpy as np
import sys
import pytrec_eval
# Configure the root logger: DEBUG level, with timestamped "<date time> - <message>" records.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
logging.basicConfig(format='%(asctime)s - %(message)s',datefmt='%Y-%m-%d %H:%M:%S')

## Evaluation preparation

### Initialize hyperparameters (e.g., batch size, etc)

In [None]:
from google.colab import drive

# Mount Google Drive and point base_path at the project folder inside it.
# base_path is absolute so that it agrees with the mount point and with model_save_path;
# the previous "./gdrive/..." form only worked when the working directory was /content.
drive.mount('/content/gdrive')
base_path = "/content/gdrive/MyDrive/cross-encoder-reranker-ir-course-2023/"

In [None]:
# Create the project folder (including missing parents); -p makes this a no-op if it already exists.
!mkdir -p $base_path

## Evaluate the model


### Load the fine-tuned model that you trained using the previous notebook. You need to set the path of your own fine-tuned model here.

In [None]:
# Folder of the fine-tuned cross-encoder that the prediction cells load; the run file path is
# derived from it as well. Edit this value to point at your own model.
model_save_path = "/content/gdrive/MyDrive/cross-encoder-reranker-ir-course-2023/finetuned_models/cross-encoder-cross-encoder-ms-marco-MiniLM-L-2-v2-2024-05-10_20-46-58" #@param {type:"string"}

### Load data (For evaluation on TREC DL'19)

In [None]:
# Download and unpack the MS MARCO queries archive into the current directory.
# NOTE(review): none of the cells visible here read the extracted files (the queries come from
# truncated_expanded_queries.pkl) — confirm whether this download is still required.
!wget https://msmarco.z22.web.core.windows.net/msmarcoranking/queries.tar.gz
!tar -xvzf  queries.tar.gz

In [None]:
import pickle
"""
This file evaluates CrossEncoder on the TREC 2019 Deep Learning (DL) Track: https://arxiv.org/abs/2003.07820

TREC 2019 DL is based on the corpus of MS Marco. MS Marco provides a sparse annotation, i.e., usually only a single
passage is marked as relevant for a given query. Many other highly relevant passages are not annotated and hence are treated
as an error if a model ranks those high.

TREC DL instead annotated up to 200 passages per query for their relevance to a given query. It is better suited to estimate
the model performance for the task of reranking in Information Retrieval.

Run:
python eval_cross-encoder-trec-dl.py cross-encoder-model-name

"""


data_folder = 'trec2019-data'
os.makedirs(data_folder, exist_ok=True)

# Read test queries.
# The original TREC DL 2019 query texts
# (https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco-test2019-queries.tsv.gz)
# are intentionally not used: the expanded and truncated queries pickled by the
# query-expansion part of this notebook are scored instead ({qid string: query text}).
# NOTE: pickle.load can execute arbitrary code from the file — only load a pickle that
# this notebook produced itself.
with open('truncated_expanded_queries.pkl', 'rb') as f:
    queries = pickle.load(f)


# Read which passages are relevant: {qid: {pid: label}}, keeping only labels > 0.
relevant_docs = defaultdict(lambda: defaultdict(int))
qrels_filepath = os.path.join(data_folder, '2019qrels-pass.txt')

if not os.path.exists(qrels_filepath):
    logging.info("Download "+os.path.basename(qrels_filepath))
    util.http_get('https://trec.nist.gov/data/deep/2019qrels-pass.txt', qrels_filepath)


with open(qrels_filepath) as fIn:
    for line in fIn:
        qid, _, pid, score = line.strip().split()
        score = int(score)
        if score > 0:
            relevant_docs[qid][pid] = score

# Only use queries that have at least one relevant passage.
# .get() is used instead of indexing: relevant_docs is a defaultdict, so relevant_docs[qid]
# would insert an empty entry for every query without judgements as a side effect.
relevant_qid = [qid for qid in queries if relevant_docs.get(qid)]


# Read the top 1000 passages that are supposed to be re-ranked
passage_filepath = os.path.join(data_folder, 'msmarco-passagetest2019-top1000.tsv.gz')

if not os.path.exists(passage_filepath):
    logging.info("Download "+os.path.basename(passage_filepath))
    util.http_get('https://msmarco.z22.web.core.windows.net/msmarcoranking/msmarco-passagetest2019-top1000.tsv.gz', passage_filepath)


# {qid: [[pid, passage], ...]} in file order. The query text stored in the file is ignored
# because the expanded query from `queries` is used for scoring.
passage_cand = {}
with gzip.open(passage_filepath, 'rt', encoding='utf8') as fIn:
    for line in fIn:
        qid, pid, query, passage = line.strip().split("\t")
        passage_cand.setdefault(qid, []).append([pid, passage])

logging.info("Queries: {}".format(len(queries)))


## Prediction

### Mini

In [None]:
# Score every candidate passage of every judged query with the cross-encoder stored at
# model_save_path and collect the scores as run[qid][pid].
queries_result_list = []
run = {}
model = CrossEncoder(model_save_path, max_length=512)

for qid in tqdm.tqdm(relevant_qid):
    query = queries[qid]
    candidates = passage_cand[qid]  # [[pid, passage], ...]

    # One [query, passage] pair per candidate, in candidate order.
    cross_inp = [[query, candidate[1]] for candidate in candidates]

    if model.config.num_labels > 1:
        # Multi-label head: take column 1 of the softmaxed output as the relevance score.
        cross_scores = model.predict(cross_inp, apply_softmax=True)[:, 1].tolist()
    else:
        cross_scores = model.predict(cross_inp).tolist()

    run[qid] = {
        candidate[0]: float(score)
        for candidate, score in zip(candidates, cross_scores)
    }

### Distil

In [None]:
# Score every candidate passage of every judged query with the cross-encoder stored at
# model_save_path and collect the scores as run[qid][pid].
queries_result_list = []
run = {}
model = CrossEncoder(model_save_path, max_length=512)

for qid in tqdm.tqdm(relevant_qid):
    query = queries[qid]
    candidates = passage_cand[qid]  # [[pid, passage], ...]

    # One [query, passage] pair per candidate, in candidate order.
    cross_inp = [[query, candidate[1]] for candidate in candidates]

    if model.config.num_labels > 1:
        # Multi-label head: take column 1 of the softmaxed output as the relevance score.
        cross_scores = model.predict(cross_inp, apply_softmax=True)[:, 1].tolist()
    else:
        cross_scores = model.predict(cross_inp).tolist()

    run[qid] = {
        candidate[0]: float(score)
        for candidate, score in zip(candidates, cross_scores)
    }

### Tiny

In [None]:
# Score every candidate passage of every judged query with the cross-encoder stored at
# model_save_path and collect the scores as run[qid][pid].
queries_result_list = []
run = {}
model = CrossEncoder(model_save_path, max_length=512)

for qid in tqdm.tqdm(relevant_qid):
    query = queries[qid]
    candidates = passage_cand[qid]  # [[pid, passage], ...]

    # One [query, passage] pair per candidate, in candidate order.
    cross_inp = [[query, candidate[1]] for candidate in candidates]

    if model.config.num_labels > 1:
        # Multi-label head: take column 1 of the softmaxed output as the relevance score.
        cross_scores = model.predict(cross_inp, apply_softmax=True)[:, 1].tolist()
    else:
        cross_scores = model.predict(cross_inp).tolist()

    run[qid] = {
        candidate[0]: float(score)
        for candidate, score in zip(candidates, cross_scores)
    }

## Evaluation

### Mini

In [None]:
# Evaluate the current `run` against the judged passages and report the per-query means
# of each measure as percentages.
evaluator = pytrec_eval.RelevanceEvaluator(relevant_docs, {'ndcg_cut.10', 'recall_100', 'map_cut.1000'})
scores = evaluator.evaluate(run)

print("Queries:", len(relevant_qid))
for report_format, measure_key in (
    ("NDCG@10: {:.2f}", "ndcg_cut_10"),
    ("Recall@100: {:.2f}", "recall_100"),
    ("MAP@1000: {:.2f}", "map_cut_1000"),
):
    per_query_values = [per_query[measure_key] for per_query in scores.values()]
    print(report_format.format(np.mean(per_query_values)*100))

### Distil

In [None]:
# Evaluate the current `run` against the judged passages and report the per-query means
# of each measure as percentages.
evaluator = pytrec_eval.RelevanceEvaluator(relevant_docs, {'ndcg_cut.10', 'recall_100', 'map_cut.1000'})
scores = evaluator.evaluate(run)

print("Queries:", len(relevant_qid))
for report_format, measure_key in (
    ("NDCG@10: {:.2f}", "ndcg_cut_10"),
    ("Recall@100: {:.2f}", "recall_100"),
    ("MAP@1000: {:.2f}", "map_cut_1000"),
):
    per_query_values = [per_query[measure_key] for per_query in scores.values()]
    print(report_format.format(np.mean(per_query_values)*100))

### Tiny

In [None]:
# Evaluate the current `run` against the judged passages and report the per-query means
# of each measure as percentages.
evaluator = pytrec_eval.RelevanceEvaluator(relevant_docs, {'ndcg_cut.10', 'recall_100', 'map_cut.1000'})
scores = evaluator.evaluate(run)

print("Queries:", len(relevant_qid))
for report_format, measure_key in (
    ("NDCG@10: {:.2f}", "ndcg_cut_10"),
    ("Recall@100: {:.2f}", "recall_100"),
    ("MAP@1000: {:.2f}", "map_cut_1000"),
):
    per_query_values = [per_query[measure_key] for per_query in scores.values()]
    print(report_format.format(np.mean(per_query_values)*100))

## Sorting candidate documents of each query based on their relevance score

In [None]:
import operator

# Replace each query's {pid: score} mapping with a list of (pid, score) pairs sorted by
# descending score. dict(...) accepts both the mapping produced by the prediction cell and
# the list of pairs this cell leaves behind, so re-running the cell no longer fails on
# `.items()` of an already-sorted list.
for qid in run.keys():
    run[qid] = sorted(dict(run[qid]).items(), key=operator.itemgetter(1), reverse=True)

## Storing ranking run file

In [None]:
# Build one TREC run line per (query, passage): "<qid> Q0 <docid> <rank> <score> <run tag>".
# Expects run[qid] to be the score-sorted list of (pid, score) pairs from the sorting cell.
ranking_lines = []
for qid in run.keys():
    # Ranks are 1-based, following the TREC run-file convention (they previously started at 0).
    for rank, (did, pred_score) in enumerate(run[qid], start=1):
        line = "{qid} Q0 {did} {rank} {pred_score} STANDARD".format(qid=qid, did=did, rank=rank, pred_score=str(pred_score))
        ranking_lines.append(line)

In [None]:
# Store the run file inside the fine-tuned model's folder. os.path.join supplies the path
# separator that plain concatenation omitted (which glued "ranking.run" onto the folder name).
ranking_run_file_path = os.path.join(model_save_path, "ranking.run")
# The context manager closes the file even if the write raises.
with open(ranking_run_file_path, "w") as f_w:
    f_w.write("\n".join(ranking_lines))

### Print the first three lines of the stored ranking run file

#### Mini

In [None]:
# Preview the first three lines of the file at ranking_run_file_path.
!head -n 3 $ranking_run_file_path

#### Distil

In [None]:
# Preview the first three lines of the file at ranking_run_file_path.
!head -n 3 $ranking_run_file_path

#### Tiny

In [None]:
# Preview the first three lines of the file at ranking_run_file_path.
!head -n 3 $ranking_run_file_path