In [None]:
import json
import logging
import pandas as pd
from sentence_transformers import CrossEncoder

from financerag.tasks import FinDER
from financerag.retrieval import DenseRetrieval, SentenceTransformerEncoder
from financerag.rerank import CrossEncoderReranker
from financerag.tasks.BaseTask import BaseTask  # optional if you prefer static evaluate

# Configure root logging at INFO level for the rest of the script.
logging.basicConfig(level=logging.INFO)

# All dataset files live under one local checkout of the challenge data;
# the three public path constants are derived from that single root.
_DATA_ROOT = "/Users/vikashpr/Dev/Python/FinanceRAG/icaif-24-finance-rag-challenge"

CORPUS_PATH = _DATA_ROOT + "/finder_corpus.jsonl/corpus.jsonl"  # documents, one JSON object per line
QUERY_PATH = _DATA_ROOT + "/finder_queries.jsonl/queries.jsonl"  # queries, one JSON object per line
QRELS_PATH = _DATA_ROOT + "/FinDER_qrels.tsv"  # tab-separated relevance judgements

def load_jsonl(path):
    """Lazily yield one parsed JSON object per non-blank line of *path*.

    Args:
        path: Location of a JSON Lines file.

    Yields:
        The object decoded from each line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode as UTF-8 explicitly so the result does not depend on the
    # platform's default locale encoding.
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            # Tolerate blank separator lines and trailing newlines, which
            # would otherwise make json.loads fail on an empty document.
            if not line:
                continue
            yield json.loads(line)

class LocalFinDER(FinDER):
    """FinDER task whose corpus and queries are supplied by the caller.

    ``load_data`` only resets both containers to empty dicts; this script
    assigns the real ``corpus`` and ``queries`` on the instance afterwards.
    """

    def load_data(self) -> None:
        # Replaces the inherited loader so nothing is fetched from Hugging
        # Face (per the original author's note); data comes from local files.
        self.queries, self.corpus = {}, {}

# Corpus: document id -> {"title", "text"}; absent fields default to "".
corpus = {}
for doc in load_jsonl(CORPUS_PATH):
    corpus[doc["_id"]] = {
        "title": doc.get("title", ""),
        "text": doc.get("text", ""),
    }

# Queries: query id -> query text.
queries = {}
for q in load_jsonl(QUERY_PATH):
    queries[q["_id"]] = q["text"]

# Build the task, then attach the locally loaded data to it.
finder_task = LocalFinDER()
finder_task.corpus = corpus
finder_task.queries = queries

# Relevance judgements: a TSV with query_id, corpus_id and score columns.
# Read both id columns as strings so numeric-looking ids are not inferred as
# integers and still match the string keys of `queries` and `corpus`.
df = pd.read_csv(
    QRELS_PATH,
    sep="\t",
    dtype={"query_id": str, "corpus_id": str},
)

# Nested mapping: query id -> {corpus id -> integer relevance score}.
# Iterating the groups directly avoids GroupBy.apply operating on the grouping
# column, which newer pandas versions deprecate.
qrels = {
    query_id: {
        corpus_id: int(score)
        for corpus_id, score in zip(group["corpus_id"], group["score"])
    }
    for query_id, group in df.groupby("query_id")
}

print("Qrels:", qrels)

# Stage 1: dense retrieval with a bi-encoder. The "query: " / "passage: "
# prompts are the input prefixes the E5 model card asks for.
encoder_kwargs = {
    "model_name_or_path": "intfloat/e5-large-v2",
    "query_prompt": "query: ",
    "doc_prompt": "passage: ",
}
encoder = SentenceTransformerEncoder(**encoder_kwargs)
retriever = DenseRetrieval(model=encoder)

# Request 200 candidates per query so the reranker has a wide pool to reorder.
retrieval_result = finder_task.retrieve(retriever=retriever, top_k=200)

# Stage 2: rerank the retrieved candidates with a cross-encoder,
# requesting the top 100 per query.
cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-12-v2")
reranker = CrossEncoderReranker(model=cross_encoder)
reranking_result = finder_task.rerank(
    reranker=reranker,
    results=retrieval_result,
    top_k=100,
    batch_size=32,
)

# Score the reranked results against the qrels at cutoffs 1, 5 and 10.
# `evaluate` returns four values, unpacked here in the order it yields them.
metrics = finder_task.evaluate(
    qrels=qrels,
    results=reranking_result,
    k_values=[1, 5, 10],
)
ndcg, map_, recall, precision = metrics

# Report each metric under its label, one per line.
for label, value in (
    ("NDCG:", ndcg),
    ("MAP:", map_),
    ("Recall:", recall),
    ("Precision:", precision),
):
    print(label, value)

# Write the task's results under ./results.
finder_task.save_results(output_dir="./results")

Embedding Model: https://huggingface.co/intfloat/e5-large-v2

This model has 24 layers and the embedding size is 1024.


- Corpus: 13,867 documents (from finder_corpus.jsonl)
- Queries: 218 queries (from finder_queries.jsonl)
- Encoding: The model encodes the corpus in 217 batches and the queries in 4 batches
- Reranking: 675 batches of query-document pairs for cross-encoder reranking

Batching is used for memory efficiency and computational performance:

- Query Encoding (4 batches): The 218 queries are encoded in batches of 64 to avoid GPU/CPU memory overflow
- Corpus Encoding (217 batches): The ~13,900 documents are too large to encode at once, so they're processed in chunks of 64 documents (default batch size)
- Reranking (675 batches): The cross-encoder processes query-document pairs in batches of 32 to compute relevance scores