In [None]:
from whoosh.fields import Schema, TEXT, ID
import nltk
# Fetch the 'punkt' resource through NLTK's downloader.
# NOTE(review): nothing in the visible cells uses nltk directly — confirm a
# later cell needs this download before keeping it here.
nltk.download('punkt')

import os

# Create the index directory when it is missing. `exist_ok=True` replaces the
# separate exists()/mkdir() pair, which could race with another process and
# fail between the check and the creation; it also keeps the cell re-runnable.
os.makedirs("indexdir", exist_ok=True)

# Index schema: all three fields are declared with stored=True.
# NOTE(review): docid is a TEXT field while title is an ID field — confirm
# this assignment of field types is intended.
schema = Schema(
    docid=TEXT(stored=True),
    title=ID(stored=True),
    content=TEXT(stored=True),
)


In [None]:
from pygaggle.rerank.base import Query, Text
from pygaggle.rerank.transformer import MonoT5

## Export BM25 results to Excel

In [None]:
import pandas as pd
import json

def save_bm25_to_excel(bm25_results_file, output_excel_file="bm25_results.xlsx"):
    """Flatten a BM25 results JSON file into a single Excel sheet.

    The JSON file is read as a list of topics, each topic being a list of hit
    dicts with the keys ``topic_id``, ``question``, ``docid``, ``content`` and
    ``score``. One row is written per hit with the columns ``q_id``,
    ``q_data``, ``doc_id``, ``rank``, ``score`` and ``doc_data``; ``rank`` is
    the 1-based position of the hit inside its topic list.

    Topics that contain no hits are skipped (they have no ``topic[0]`` to take
    the topic id and question from).

    Args:
        bm25_results_file: Path of the JSON file to read.
        output_excel_file: Path of the Excel workbook to write.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(bm25_results_file, 'r', encoding='utf-8') as f:
        bm25_results_data = json.load(f)

    rows = []
    for topic in bm25_results_data:
        if not topic:
            # Nothing retrieved for this topic; indexing topic[0] would fail.
            continue

        # The topic id and question are repeated on every hit; read them once.
        topic_id = topic[0]["topic_id"]
        description = topic[0]["question"]

        for rank, result in enumerate(topic, start=1):
            rows.append([
                topic_id,
                description,
                result["docid"],
                rank,
                result["score"],
                result["content"],
            ])

    df = pd.DataFrame(rows, columns=['q_id', 'q_data', 'doc_id', 'rank', 'score', 'doc_data'])
    df.to_excel(output_excel_file, index=False)

    print(f"Results saved to {output_excel_file}")

# Export the BM25 results stored in the JSON file to an Excel workbook.
bm25_results_file = 'json_results_newest.json'
save_bm25_to_excel(bm25_results_file, "bm25_results.xlsx")


## Reranker

In [None]:

from transformers import AutoTokenizer,T5ForConditionalGeneration

# Module-level tokenizer, loaded once and used by rerank_with_monoT5 to
# tokenize each candidate document.
# NOTE(review): the checkpoint name is a duoT5 one, while the reranker built
# below uses the MonoT5 class — confirm this is the intended checkpoint.
tokenizer = AutoTokenizer.from_pretrained('castorini/duot5-base-med-msmarco')

import re

def clean_text(text):
    """Return an ASCII-only, single-line copy of ``text`` capped at 32767 characters.

    Every character outside the ASCII range is removed, each newline becomes a
    space, carriage returns are dropped, and the result is cut to its first
    32767 characters.
    """
    max_chars = 32767
    ascii_only = re.sub(r'[^\x00-\x7F]+', '', text)
    # One pass over the string: "\n" -> " ", "\r" -> removed.
    flattened = ascii_only.translate({ord("\n"): " ", ord("\r"): None})
    return flattened[:max_chars]

# Rerankers already built in this session, keyed by checkpoint name. Loading
# the T5 weights is by far the most expensive step, so it must not be repeated
# for every query.
_MONOT5_RERANKERS = {}


def _get_monoT5_reranker(model_name='castorini/duot5-base-med-msmarco'):
    """Return the cached MonoT5 reranker for ``model_name``, loading it on first use."""
    # NOTE(review): the default is a duoT5 checkpoint name wrapped in the
    # MonoT5 class — confirm this is the intended checkpoint.
    if model_name not in _MONOT5_RERANKERS:
        model = T5ForConditionalGeneration.from_pretrained(model_name)
        _MONOT5_RERANKERS[model_name] = MonoT5(model=model)
    return _MONOT5_RERANKERS[model_name]


def rerank_with_monoT5(query_description, bm25_results, top_k=200):
    """Rerank the first ``top_k`` BM25 hits for one query.

    Args:
        query_description: Query text passed to the reranker.
        bm25_results: List of hit dicts; each must provide ``doc_id`` and
            ``content``.
        top_k: Number of leading hits that are reranked and returned.

    Returns:
        The reranked texts sorted by descending ``score``. Each text's
        ``metadata`` dict holds ``docid``, ``content``, ``tokens``,
        ``attention_mask`` and ``token_type`` (``None`` when the tokenizer
        returns no ``token_type_ids``). An empty list is returned when there
        is nothing to rerank.
    """
    candidates = bm25_results[:top_k]
    if not candidates:
        # Nothing to score: skip tokenization and model loading entirely.
        return []

    query = Query(query_description)

    texts = []
    for result in candidates:
        doc_id = result["doc_id"]
        doc_content = result["content"]

        # Tokenize the document on its own (truncated to 512 tokens); the
        # encodings are only carried along as metadata for the export.
        inputs = tokenizer(doc_content, return_tensors="pt", padding=True, truncation=True, max_length=512)

        tokens = inputs["input_ids"].squeeze(0)
        attention_mask = inputs["attention_mask"].squeeze(0)
        token_type = inputs.get("token_type_ids", None)

        metadata = {
            'docid': doc_id,
            'content': doc_content,
            'tokens': tokens,
            'attention_mask': attention_mask,
            'token_type': token_type,
        }
        texts.append(Text(doc_content, metadata, 0))

    reranker = _get_monoT5_reranker()
    reranked = reranker.rerank(query, texts)
    reranked_results = sorted(reranked, key=lambda x: x.score, reverse=True)

    return reranked_results[:top_k]

def _excel_cell(value):
    """Convert ``value`` into something every Excel writer engine can store in a cell."""
    # Tensor/array-like objects expose tolist(); sequences are not valid cell
    # values for every engine, so they are stored as their string form.
    if hasattr(value, "tolist"):
        value = value.tolist()
    if isinstance(value, (list, tuple)):
        return str(value)
    return value


def save_to_excel(reranked_results, queries, file_path):
    """Write the reranked results of all queries to one Excel sheet.

    Args:
        reranked_results: One list per query; each element is a dict with the
            keys ``doc_id``, ``content``, ``score``, ``tokens``,
            ``attention_mask`` and ``token_type``.
        queries: Dicts with ``topic_id`` and ``description``, aligned with
            ``reranked_results``.
        file_path: Path of the Excel workbook to write.

    The ``rank`` column is the 1-based position of a result inside its list.
    The previous code wrote ``result["rank"] + 1``, which shifted the ranks to
    start at 2 because the stored rank is already 1-based.
    """
    rows = []
    for query, reranked in zip(queries, reranked_results):
        query_id = query['topic_id']
        query_description = query['description']

        for rank, result in enumerate(reranked, start=1):
            rows.append([
                query_id,
                query_description,
                result["doc_id"],
                rank,
                result["score"],
                _excel_cell(result["tokens"]),
                _excel_cell(result["attention_mask"]),
                _excel_cell(result["token_type"]),
                clean_text(result["content"]),
            ])

    df = pd.DataFrame(rows, columns=['q_id', 'q_data', 'doc_id', 'rank', 'score', 'tokens', 'attention_mask', 'token_type', 'doc_data'])
    df.to_excel(file_path, index=False)

def eval_all_queries_and_save(bm25_results_file, top_k=200, output_excel_file="reranked_results.xlsx", max_topics=100):
    """Rerank the BM25 hits of every topic and export the result to Excel.

    The JSON file is read as a list of topics, each topic being a list of hit
    dicts with the keys ``topic_id``, ``question``, ``docid``, ``rank``,
    ``score`` and ``content``.

    Args:
        bm25_results_file: Path of the BM25 results JSON file.
        top_k: Number of BM25 hits per topic handed to the reranker. It is now
            actually forwarded to ``rerank_with_monoT5``; previously it was
            accepted but ignored.
        output_excel_file: Path of the Excel workbook to write.
        max_topics: Only the first ``max_topics`` topics of the file are
            processed (default 100, the former hard-coded limit). Pass
            ``None`` to process every topic.

    Topics without any hits are skipped.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(bm25_results_file, 'r', encoding='utf-8') as f:
        bm25_results_data = json.load(f)

    results_all = []
    queries = []

    # Slice up front so the loop stops at the limit instead of walking (and
    # printing) the remaining topics without doing any work.
    topics = bm25_results_data if max_topics is None else bm25_results_data[:max_topics]

    for index, topic in enumerate(topics):
        print(index)  # progress indicator
        if not topic:
            # No hits: there is no topic[0] to read the id and question from.
            continue

        topic_id = topic[0]["topic_id"]
        description = topic[0]["question"]
        queries.append({"topic_id": topic_id, "description": description})

        bm25_results = [
            {"topic_id": topic_id, "doc_id": result["docid"], "rank": result["rank"], "score": result["score"], "content": result["content"]}
            for result in topic
        ]

        reranked_results = rerank_with_monoT5(description, bm25_results, top_k=top_k)

        # Ranks are 1-based positions in the reranked order.
        topic_results = [
            {
                "topic_id": topic_id,
                "doc_id": reranked.metadata["docid"],
                "content": reranked.metadata["content"],
                "rank": rank,
                "score": reranked.score,
                "tokens": reranked.metadata["tokens"],
                "attention_mask": reranked.metadata["attention_mask"],
                "token_type": reranked.metadata["token_type"],
            }
            for rank, reranked in enumerate(reranked_results, start=1)
        ]
        results_all.append(topic_results)

    save_to_excel(results_all, queries, output_excel_file)

    print(f"Results saved to {output_excel_file}")
    
 


# Rerank the BM25 results from the JSON file and write them to Excel.
bm25_results_file = 'json_results_newest.json'
eval_all_queries_and_save(
    bm25_results_file,
    output_excel_file="neural_rerank_results.xlsx",
)
