# IR PROJECT CODE

Members:

*   Omer
*   Lucas
*   Muhamad

# Download and Inspect the Collection

The dataset was created from the Chronicling America collection — over 21 million digitized newspaper pages (1756–1963) curated by the Library of Congress and NEH. They used 39,330 pages (1800–1920), representing 53 US states, to ensure wide geographic and temporal coverage.

Source: https://dl.acm.org/doi/pdf/10.1145/3626772.3657891

GitHub: https://github.com/DataScienceUIBK/ChroniclingAmericaQA?tab=readme-ov-file

Install all the dependencies for 1st/2nd stages retrieval and QA model

LLM MODEL GEMMA3 270M setup for question answering

Based on: https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma3_(270M).ipynb

In [None]:
import os, re
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    import torch; v = re.match(r"[0-9]{1,}\.[0-9]{1,}", str(torch.__version__)).group(0)
    xformers = "xformers==" + ("0.0.33.post1" if v=="2.9" else "0.0.32.post2" if v=="2.8" else "0.0.29.post3")
    !pip install --no-deps bitsandbytes accelerate {xformers} peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets==4.3.0" "huggingface_hub>=0.34.0" hf_transfer
    !pip install --no-deps unsloth

!pip install transformers==4.56.2
!pip install --no-deps trl==0.22.2
!pip install pyterrier
!pip install pyterrier[java]
!pip install gensim
!pip install --upgrade git+https://github.com/terrierteam/pyterrier_t5.git
!pip install nltk
!pip install spacy -q
!python -m spacy download en_core_web_sm -q

from unsloth import FastLanguageModel

import torch
import torch.nn.functional as F
import pyterrier as pt
from pyterrier.measures import P, MAP, R, nDCG
from pyterrier_t5 import MonoT5ReRanker
import os
import json
import pandas as pd
import shutil
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

import numpy as np
import pickle
from gensim.models import KeyedVectors

import csv
import xml.etree.ElementTree as ET
from xml.dom import minidom

In [1]:
!curl -L "https://huggingface.co/datasets/Bhawna/ChroniclingAmericaQA/resolve/main/test.json?download=true" -o test.json
!curl -L "https://huggingface.co/datasets/Bhawna/ChroniclingAmericaQA/resolve/main/train.json?download=true" -o train.json
!curl -L "https://huggingface.co/datasets/Bhawna/ChroniclingAmericaQA/resolve/main/dev.json?download=true" -o validation.json

import json

files = ["train.json", "validation.json", "test.json"]

for path in files:
    print(f"\n===== {path} =====")
    try:
        with open(path, "r", encoding="utf-8") as f:
            # Read a few hundred characters to see what kind of JSON it is
            head = f.read(500)
            print("Preview of first 500 characters:\n")
            print(head[:500])
        # Try to load only part of the file
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        if isinstance(data, list):
            print(f"\nLoaded {len(data)} items (list).")
            print("Dictionary keys:", list(data[0].keys()))
            print(json.dumps(data[0], indent=2)[:600])
        elif isinstance(data, dict):
            print("\nTop-level is a dictionary. Keys:", list(data.keys()))
            for k, v in data.items():
                if isinstance(v, list):
                    print(f"Key '{k}' contains a list of {len(v)} items.")
                    if v:
                        print("First item keys:", list(v[0].keys()))
                        print(json.dumps(v[0], indent=2)[:600])
                        break
        else:
            print(f"Unexpected top-level type: {type(data)}")
    except Exception as e:
        print(f"Could not parse {path} as JSON: {e}")

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  1077  100  1077    0     0   9315      0 --:--:-- --:--:-- --:--:--  9365
100 71.5M  100 71.5M    0     0  33.4M      0  0:00:02  0:00:02 --:--:-- 72.9M
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  1075  100  1075    0     0  10770      0 --:--:-- --:--:-- --:--:-- 10858
100 1315M  100 1315M    0     0   110M      0  0:00:11  0:00:11 --:--:-- 74.7M
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  1071  100  1071    0     0  10710      0 --:--:-- --:--:-- --:--:-- 10818
100 71.8M  100 71.8M    0     0  46.8M      0  0:00:01  0:00:01 --:--:-- 82.0M

===== train.json =====
Preview of first 500 charact

# Create the Document Collection

To do that, we create a new json file that contains the 'para_id', 'context', 'raw_ocr', 'publication_date' keys, for all para_id in the collection.

para_id: is the id of a paragraph of a news paper page.

In [2]:
import json
import os

inputs = ["train.json", "validation.json", "test.json"]
output = "document_collection.json"

def load_list_or_empty(path):
    if not os.path.exists(path) or os.path.getsize(path) == 0:
        print(f"Skipping {path} because it is missing or empty")
        return []
    try:
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        if isinstance(data, list):
            return data
        print(f"Skipping {path} because it is not a list at the top level")
        return []
    except json.JSONDecodeError:
        print(f"Skipping {path} because it is not valid JSON")
        return []

def project(recs):
    out = []
    for r in recs:
        out.append({
            "para_id": r.get("para_id", ""),
            "context": r.get("context", ""),
            "raw_ocr": r.get("raw_ocr", ""),
            "publication_date": r.get("publication_date", "")
        })
    return out

all_recs = []
for p in inputs:
    recs = load_list_or_empty(p)
    print(f"Loaded {len(recs)} records from {p}")
    all_recs.extend(project(recs))

# deduplicate by para_id keeping the first one seen
uniq = {}
for rec in all_recs:
    pid = rec.get("para_id", "")
    if pid and pid not in uniq:
        uniq[pid] = rec

result = list(uniq.values())

with open(output, "w", encoding="utf-8") as f:
    json.dump(result, f, ensure_ascii=False, indent=2)

print(f"Wrote {len(result)} records to {output}")
print(json.dumps(result[:3], indent=2))

Loaded 439302 records from train.json
Loaded 24111 records from validation.json
Loaded 24084 records from test.json
Wrote 131921 records to document_collection.json
[
  {
    "para_id": "New_Hampshire_18070804_1",
    "context": "Aiscellaneous Repository. From the Albany Register, WAR, OR A PROSPECT OF IT, From recent instances of British Outrage. BY: WILLIAM RAY, Author of the contemplated publication, entitled, \u201cHorrors of Slavery, or the American Turf in Tripoli,\u201d VOTARIES of Freedom, arm! The British Lion roars! Legions of Valor, take th\u2019 alarm\u2014; Rash, rush to guard our shores! Behold the horrid deed\u2014 Your brethren gasping lie! Beneath a tyrant\u2019s hand they bleed\u2014 They groan\u2014they faint\u2014they die. Veterans of seventy-six, Awake the slumbering sword;\u2014 Hearts of your murderous foes transfix\u2014 'Tis vengeance gives the word. Remember Lexington, And Bunker\u2019s tragic hill; \u201cThe same who spilt your blood thereon, Your blood again

## You should check that the collection you have matches that of the paper!

# Create the Test Queries Data Structure

We keep the first 10.000 queries due to memory errors in the free colab version.

To be comparable, please keep the top 10.000 queries for evaluation.

In [3]:
import json
import re
import unicodedata
import string

input_file = "test.json"
output_file = "test_queries.json"

# Load the data
with open(input_file, "r", encoding="utf-8") as f:
    data = json.load(f)

def clean_question(text):
    if not isinstance(text, str):
        return ""
    text = unicodedata.normalize("NFKC", text)
    text = re.sub(rf"[{re.escape(string.punctuation)}]", " ", text)  # remove punctuation
    text = re.sub(r"\s+", " ", text)  # collapse multiple spaces
    return text.strip()

# Extract and clean
queries = [
    {
        "query_id": item.get("query_id", ""),
        "question": clean_question(item.get("question", "")),
    }
    for item in data
]

# Sort by query_id (assuming numeric)
queries = sorted(queries, key=lambda x: int(x["query_id"]) if str(x["query_id"]).isdigit() else x["query_id"])

# Keep only the first 10,000
queries = queries[:10000]

# Save new JSON
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(queries, f, ensure_ascii=False, indent=2)

print(f"Saved {len(queries)} entries to {output_file}")
print(json.dumps(queries[:3], indent=2))

Saved 10000 entries to test_queries.json
[
  {
    "query_id": "test_1",
    "question": "How many lots did Thomas Peirce have"
  },
  {
    "query_id": "test_10",
    "question": "Who gave Hamilton the substance of what he had proposed on the part of General Hamilton"
  },
  {
    "query_id": "test_100",
    "question": "Who informs his FRIENDS and the PUBLIC that he has taken that justly celebrated INN in this city"
  }
]


# Create the Qrels for the test set

In [4]:
input_file = "test.json"
qrels_file = "test_qrels.json"
answers_file = "test_query_answers.json"

# Load the data
with open(input_file, "r", encoding="utf-8") as f:
    data = json.load(f)

# Build the qrels file: query_id, iteration=0, para_id, relevance=1
qrels = [
    {
        "query_id": item.get("query_id", ""),
        "iteration": 0,
        "para_id": item.get("para_id", ""),
        "relevance": 1
    }
    for item in data
]

# Build the query_answers file: same plus answer and org_answer
query_answers = [
    {
        "query_id": item.get("query_id", ""),
        "iteration": 0,
        "para_id": item.get("para_id", ""),
        "relevance": 1,
        "answer": item.get("answer", ""),
        "org_answer": item.get("org_answer", "")
    }
    for item in data
]

# Save both files
with open(qrels_file, "w", encoding="utf-8") as f:
    json.dump(qrels, f, ensure_ascii=False, indent=2)

with open(answers_file, "w", encoding="utf-8") as f:
    json.dump(query_answers, f, ensure_ascii=False, indent=2)

print(f"Saved {len(qrels)} entries to {qrels_file}")
print(f"Saved {len(query_answers)} entries to {answers_file}")
print("Sample qrels entry:", qrels[0])
print("Sample query_answers entry:", query_answers[0])

Saved 24084 entries to test_qrels.json
Saved 24084 entries to test_query_answers.json
Sample qrels entry: {'query_id': 'test_1', 'iteration': 0, 'para_id': 'New_Hampshire_18030125_16', 'relevance': 1}
Sample query_answers entry: {'query_id': 'test_1', 'iteration': 0, 'para_id': 'New_Hampshire_18030125_16', 'relevance': 1, 'answer': '183', 'org_answer': '183'}


# Retrieval - Good Luck!

# Create the Index of the collection



In [None]:
#path index folder
folder_name = "Index"
index_file_name = "terrier_inverted_index"
index_path = os.path.abspath(os.path.join(folder_name, index_file_name))

#init pyTer
if not pt.java.started():
    pt.java.init()

def get_index():
    #check if the index exsists
    properties_file = os.path.join(index_path, "data.properties")
    # Forcing a rebuild by removing the index directory if it exists to ensure updated schema.
    if os.path.exists(index_path):
        print(f"Existing index found at {index_path}. Removing to ensure rebuild with updated schema.")
        shutil.rmtree(index_path)

    print("Index is not found or is being rebuilt, creating a new Index")
    os.makedirs(index_path, exist_ok=True)

    with open('document_collection.json', 'r', encoding='utf-8') as f:
        raw_data = json.load(f)

    # Build Data Frame to include docno, context, raw_ocr, and publication_date
    df = pd.DataFrame(raw_data)
    df = df.rename(columns={"para_id": "docno"})[["docno", "context", "raw_ocr", "publication_date"]]

    df['text'] = df['context'] # Create a 'text' column for the primary indexed field

    # Build the index
    indexer = pt.IterDictIndexer(
        index_path,
        meta={"docno": 24, "context": 1024, "raw_ocr": 1024},
        text_attrs=["text"],           # 'text' (our 'context') is the primary field for indexing content
        meta_reverse=["docno"],        # enable reverse lookup on docno
        pretokenised=False,
        fields=False,
        threads=1,
    )
    index_ref = indexer.index(df.to_dict(orient="records"))
    return pt.IndexFactory.of(index_ref)
index = get_index()

# Print a simple summary
print("Index location:", index_path)
print("Indexed documents:", index.getCollectionStatistics().getNumberOfDocuments())

terrier-assemblies 5.11 jar-with-dependencies not found, downloading to /root/.pyterrier...


https://repo1.maven.org/maven2/org/terrier/terrier-assemblies/5.11/terrier-assemblies-5.11-jar-with-dependenci…

Done
terrier-python-helper 0.0.8 jar not found, downloading to /root/.pyterrier...


https://repo1.maven.org/maven2/org/terrier/terrier-python-helper/0.0.8/terrier-python-helper-0.0.8.jar:   0%| …

Done


Java started and loaded: pyterrier.java.colab, pyterrier.java, pyterrier.java.24, pyterrier.terrier.java [version=5.11 (build: craig.macdonald 2025-01-13 21:29), helper_version=0.0.8]


Index is not found or is being rebuilt, creating a new Index


  warn(msg)
  warn(msg)


14:05:03.622 [ForkJoinPool-1-worker-1] WARN org.terrier.structures.indexing.Indexer -- Indexed 1 empty documents
Index location: /content/Index/terrier_inverted_index
Indexed documents: 131921


In [None]:
# Retrieve collection statistics
stats = index.getCollectionStatistics()

print("Terrier Collection Statistics")
print("--------------------------------")
print(f"Indexed documents:        {stats.getNumberOfDocuments()}")
print(f"Unique terms (vocabulary): {stats.getNumberOfUniqueTerms()}")
print(f"Total tokens:             {stats.getNumberOfTokens()}")
print(f"Average document length:  {stats.getAverageDocumentLength():.2f}")

Terrier Collection Statistics
--------------------------------
Indexed documents:        131921
Unique terms (vocabulary): 236646
Total tokens:             15575099
Average document length:  118.06


# SETUP QUERY PIPELINE (Query expansion histword -> query filtering)

## Set up query expansion with histwords

In [None]:
!wget http://snap.stanford.edu/historical_embeddings/eng-fiction-all_sgns.zip
!unzip -n eng-fiction-all_sgns.zip
!ls -F

--2026-01-22 14:09:52--  http://snap.stanford.edu/historical_embeddings/eng-fiction-all_sgns.zip
Resolving snap.stanford.edu (snap.stanford.edu)... 171.64.75.80
Connecting to snap.stanford.edu (snap.stanford.edu)|171.64.75.80|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 400120997 (382M) [application/zip]
Saving to: ‘eng-fiction-all_sgns.zip’


2026-01-22 14:10:18 (14.4 MB/s) - ‘eng-fiction-all_sgns.zip’ saved [400120997/400120997]

Archive:  eng-fiction-all_sgns.zip
   creating: sgns/
  inflating: sgns/1990-vocab.pkl     
  inflating: sgns/1970-w.npy         
  inflating: sgns/1810-w.npy         
  inflating: sgns/1840-vocab.pkl     
  inflating: sgns/1920-vocab.pkl     
  inflating: sgns/1950-w.npy         
  inflating: sgns/1980-w.npy         
  inflating: sgns/1950-vocab.pkl     
  inflating: sgns/1830-w.npy         
  inflating: sgns/1830-vocab.pkl     
  inflating: sgns/1880-vocab.pkl     
  inflating: sgns/1850-w.npy         
  inflating: sgns/1930-w.n

In [None]:
def load_histwords_colab(year):
    path_vocab = f"sgns/{year}-vocab.pkl"
    path_vectors = f"sgns/{year}-w.npy"

    print(f"Loading {year} data...")

    with open(path_vocab, 'rb') as f:
        vocab = pickle.load(f)

    vectors = np.load(path_vectors)

    kv = KeyedVectors(vector_size=vectors.shape[1])
    kv.add_vectors(vocab, vectors)

    print(f"Success! Model for {year} loaded with {len(vocab)} words.")
    return kv

hist_model = load_histwords_colab(1890)

Loading 1890 data...
Success! Model for 1890 loaded with 100000 words.


In [None]:
hist_models_by_year = {}
for year in range(1800, 1901, 10):
    try:
        hist_models_by_year[year] = load_histwords_colab(year)
    except Exception as e:
        print(f"Could not load model for year {year}: {e}")

print("All requested historical models loaded:")
for year, model in hist_models_by_year.items():
    print(f"  Year {year}: {len(model.key_to_index)} words loaded.")

Loading 1800 data...
Success! Model for 1800 loaded with 100000 words.
Loading 1810 data...
Success! Model for 1810 loaded with 100000 words.
Loading 1820 data...
Success! Model for 1820 loaded with 100000 words.
Loading 1830 data...
Success! Model for 1830 loaded with 100000 words.
Loading 1840 data...
Success! Model for 1840 loaded with 100000 words.
Loading 1850 data...
Success! Model for 1850 loaded with 100000 words.
Loading 1860 data...
Success! Model for 1860 loaded with 100000 words.
Loading 1870 data...
Success! Model for 1870 loaded with 100000 words.
Loading 1880 data...
Success! Model for 1880 loaded with 100000 words.
Loading 1890 data...
Success! Model for 1890 loaded with 100000 words.
Loading 1900 data...
Success! Model for 1900 loaded with 100000 words.
All requested historical models loaded:
  Year 1800: 100000 words loaded.
  Year 1810: 100000 words loaded.
  Year 1820: 100000 words loaded.
  Year 1830: 100000 words loaded.
  Year 1840: 100000 words loaded.
  Year 18

In [None]:
selected_year = 1900 # Change this year to explore different historical models
word = "gay"

# Get the historical model for the selected year
current_hist_model = hist_models_by_year.get(selected_year)

if current_hist_model:
    if word in current_hist_model:
        print(f"Synonyms in {selected_year} for '{word}':")
        print(current_hist_model.most_similar(word, topn=3))
    else:
        print(f"'{word}' not found in the {selected_year} historical model.")
else:
    print(f"Historical model for year {selected_year} is not loaded.")

Synonyms in 1900 for 'gay':
[('lively', 0.4499954581260681), ('cheery', 0.43901199102401733), ('lark', 0.4136286675930023)]


  dists = dot(self.vectors[clip_start:clip_end], mean) / self.norms[clip_start:clip_end]


## Query expansion implementation

In [None]:
import functools
selected_year = 1900 # Chosen year that will be explained later
current_hist_model = hist_models_by_year.get(selected_year)

def hist_expansion_logic(row, top_n_expanded_words, hist_model):
    original_terms = row['query'].split()
    candidate_expansions = []  # To store (word, score) for all similar words

    for term in original_terms:
        word = term.lower()
        if word in hist_model.key_to_index:
            if np.isclose(hist_model.get_vector(word, norm=False).tolist(), 0).all():
                continue
            try:
                similar_words_and_scores = hist_model.most_similar(word, topn=1)
                for sim_word, score in similar_words_and_scores:
                    if sim_word.lower() not in [t.lower() for t in original_terms]:
                        candidate_expansions.append((sim_word.lower(), score))
            except Exception:
                continue

    unique_expansions_dict = {}
    for word, score in sorted(candidate_expansions, key=lambda x: x[1], reverse=True):
        if word not in unique_expansions_dict:
            unique_expansions_dict[word] = score

    final_expanded_words = list(unique_expansions_dict.keys())[:top_n_expanded_words]

    # Combine original terms and the selected expanded words
    full_query_terms = list(original_terms) + final_expanded_words
    return " ".join(full_query_terms)

hist_query_expansion = pt.apply.query(functools.partial(hist_expansion_logic, top_n_expanded_words=2, hist_model=current_hist_model))


## Query reduction implementation (filtering stopwords)

In [None]:
def query_reduction_by_weight(row):
    original_query = row['query']
    terms = original_query.split()
    reduced_terms = []

    for term in terms:
        if term.lower() not in stop_words:
            reduced_terms.append(term)
    return " ".join(reduced_terms) if reduced_terms else original_query

query_reduction = pt.apply.query(query_reduction_by_weight)

In [None]:
example_query = "How many gays lots did Thomas Peirce have in the year 1800"

expanded_query = hist_query_expansion.transform(pd.DataFrame([{'qid': 'test_q', 'query': example_query}]))
reduced_query = query_reduction.transform(pd.DataFrame([{'qid': 'test_q', 'query': expanded_query['query'].iloc[0]}]))

# Print the original query for comparison.
print(f"Original request: {example_query}")
print(f"Expanded request: {expanded_query['query'].iloc[0]}")
print(f"Reduced request: {reduced_query['query'].iloc[0]}")

Original request: How many gays lots did Thomas Peirce have in the year 1800
Expanded request: How many gays lots did Thomas Peirce have in the year 1800 month does
Reduced request: many gays lots Thomas Peirce year 1800 month


  dists = dot(self.vectors[clip_start:clip_end], mean) / self.norms[clip_start:clip_end]


# First stage retrieval pipelines

## Define 3 pipelines (2 baseline then 1 with query expansion)

In [None]:
# First stage retriver with BM25
bm25_1st_stage_retrieval = pt.terrier.Retriever(index, wmodel="BM25")%5
tf_idf_1st_stage_retrieval = pt.terrier.Retriever(index, wmodel="TF_IDF")%5

## Test on a subset of the queries

In [None]:
#testing the engine
query = "Europe"
bm25_results = bm25_1st_stage_retrieval.search(query)
tf_idf_results = tf_idf_1st_stage_retrieval.search(query)
print('-------------------- bm25 results (top 5) --------------------\n',bm25_results.head(5))
print('-------------------- tf_idf results (top 5) --------------------\n',tf_idf_results.head(5))


-------------------- bm25 results (top 5) --------------------
   qid   docid                     docno  rank      score   query
0   1  122024      New_York_19190610_27     0  10.233243  Europe
1   1   61883         Texas_18771215_13     1   9.759719  Europe
2   1    9637      Maryland_18401110_20     2   9.730148  Europe
3   1  110750      Nebraska_19020225_26     3   9.715430  Europe
4   1   36539  Rhode_Island_18511030_25     4   9.642500  Europe
-------------------- tf_idf results (top 5) --------------------
   qid   docid                     docno  rank     score   query
0   1  122024      New_York_19190610_27     0  5.641964  Europe
1   1   61883         Texas_18771215_13     1  5.380893  Europe
2   1    9637      Maryland_18401110_20     2  5.364589  Europe
3   1  110750      Nebraska_19020225_26     3  5.356474  Europe
4   1   36539  Rhode_Island_18511030_25     4  5.316266  Europe


## Comparing each histword model to choose the most relevant one for query expansion

In [None]:
pipelines = []
pipeline_names = []

# Add baseline pipelines
pipelines.append(bm25_1st_stage_retrieval)
pipeline_names.append("BM25 Baseline")

pipelines.append(tf_idf_1st_stage_retrieval)
pipeline_names.append("TF-IDF Baseline")

# Add custom BM25 pipelines for each historical word embedding model
for year, hist_model in hist_models_by_year.items():
    # Create a new hist_query_expansion for the current year's model
    current_hist_expansion = pt.apply.query(functools.partial(hist_expansion_logic, top_n_expanded_words=2, hist_model=hist_model))

    # Combine with query reduction and BM25
    custom_bm25_pipeline_for_year = current_hist_expansion >> query_reduction >> bm25_1st_stage_retrieval

    pipelines.append(custom_bm25_pipeline_for_year)
    pipeline_names.append(f"BM25 + HistQE {year}")

print(f"Created {len(pipelines)} pipelines for evaluation:")
for name in pipeline_names:
    print(f"- {name}")


Created 13 pipelines for evaluation:
- BM25 Baseline
- TF-IDF Baseline
- BM25 + HistQE 1800
- BM25 + HistQE 1810
- BM25 + HistQE 1820
- BM25 + HistQE 1830
- BM25 + HistQE 1840
- BM25 + HistQE 1850
- BM25 + HistQE 1860
- BM25 + HistQE 1870
- BM25 + HistQE 1880
- BM25 + HistQE 1890
- BM25 + HistQE 1900


In [None]:
queries_df = pd.DataFrame(queries)
queries_df = queries_df.rename(columns={"query_id": "qid", "question": "query"})

qrels_df = pd.DataFrame(qrels)
qrels_df = qrels_df.rename(columns={"query_id": "qid", "para_id": "docno"})

num_queries_subset = 100 # Define the number of queries for the subset
queries_subset_df = queries_df.head(num_queries_subset)

qrels_subset_df = qrels_df[qrels_df['qid'].isin(queries_subset_df['qid'])].copy()

experiment_results = pt.Experiment(
    pipelines,
    queries_subset_df,
    qrels_subset_df,
    [P@1, P@5, P@10, R@5, R@10, nDCG@5, nDCG@10, MAP],
    names=pipeline_names
)

display(experiment_results)

  dists = dot(self.vectors[clip_start:clip_end], mean) / self.norms[clip_start:clip_end]
  dists = dot(self.vectors[clip_start:clip_end], mean) / self.norms[clip_start:clip_end]
  dists = dot(self.vectors[clip_start:clip_end], mean) / self.norms[clip_start:clip_end]
  dists = dot(self.vectors[clip_start:clip_end], mean) / self.norms[clip_start:clip_end]
  dists = dot(self.vectors[clip_start:clip_end], mean) / self.norms[clip_start:clip_end]
  dists = dot(self.vectors[clip_start:clip_end], mean) / self.norms[clip_start:clip_end]
  dists = dot(self.vectors[clip_start:clip_end], mean) / self.norms[clip_start:clip_end]
  dists = dot(self.vectors[clip_start:clip_end], mean) / self.norms[clip_start:clip_end]
  dists = dot(self.vectors[clip_start:clip_end], mean) / self.norms[clip_start:clip_end]
  dists = dot(self.vectors[clip_start:clip_end], mean) / self.norms[clip_start:clip_end]
  dists = dot(self.vectors[clip_start:clip_end], mean) / self.norms[clip_start:clip_end]


Unnamed: 0,name,AP,P@1,P@5,P@10,R@5,R@10,nDCG@5,nDCG@10
0,BM25 Baseline,0.558833,0.52,0.128,0.064,0.64,0.64,0.578709,0.578709
1,TF-IDF Baseline,0.562333,0.52,0.126,0.063,0.63,0.63,0.579284,0.579284
2,BM25 + HistQE 1800,0.544,0.5,0.122,0.061,0.61,0.61,0.560593,0.560593
3,BM25 + HistQE 1810,0.539,0.51,0.118,0.059,0.59,0.59,0.551665,0.551665
4,BM25 + HistQE 1820,0.523,0.48,0.12,0.06,0.6,0.6,0.542021,0.542021
5,BM25 + HistQE 1830,0.530667,0.49,0.122,0.061,0.61,0.61,0.550278,0.550278
6,BM25 + HistQE 1840,0.530333,0.5,0.114,0.057,0.57,0.57,0.540415,0.540415
7,BM25 + HistQE 1850,0.537333,0.5,0.118,0.059,0.59,0.59,0.550593,0.550593
8,BM25 + HistQE 1860,0.547833,0.5,0.124,0.062,0.62,0.62,0.566031,0.566031
9,BM25 + HistQE 1870,0.527833,0.49,0.12,0.06,0.6,0.6,0.545717,0.545717



## Run the full pipelines then save their results as csv files

In [None]:
bm25_custom_1st_stage_retrieval = (
    hist_query_expansion
    >> query_reduction
    >> bm25_1st_stage_retrieval%5
)

In [None]:
first_stage_pipelines = [bm25_1st_stage_retrieval,tf_idf_1st_stage_retrieval,bm25_custom_1st_stage_retrieval]
output_files=["bm25_candidates","tf-idf_candidates","custom_bm25_candidates"]

queries_df = pd.DataFrame(queries)
qrels_df = pd.DataFrame(qrels)

queries_df = queries_df.rename(columns={"query_id": "qid", "question": "query"})
qrels_df = qrels_df.rename(columns={"query_id": "qid", "para_id": "docno"})

for pipeline, output_file in zip(first_stage_pipelines, output_files):
    results = pipeline.transform(queries_df)
    results.to_csv(output_file + ".csv", index=False)

  dists = dot(self.vectors[clip_start:clip_end], mean) / self.norms[clip_start:clip_end]


## RESULTS OF THE 1ST STAGE RETRIEVAL



In [None]:
bm25_precomputed_df = pd.read_csv("bm25_candidates.csv")
tf_idf_precomputed_df = pd.read_csv("tf-idf_candidates.csv")
bm25_custom_precomputed_df = pd.read_csv("custom_bm25_candidates.csv")

# Create PyTerrier Transformers from the precomputed DataFrames
bm25_precomputed_pipeline = pt.Transformer.from_df(bm25_precomputed_df)
tf_idf_precomputed_pipeline = pt.Transformer.from_df(tf_idf_precomputed_df)
bm25_custom_precomputed_pipeline = pt.Transformer.from_df(bm25_custom_precomputed_df)

queries_df = pd.DataFrame(queries)
queries_df = queries_df.rename(columns={"query_id": "qid", "question": "query"})

qrels_df = pd.DataFrame(qrels)
qrels_df = qrels_df.rename(columns={"query_id": "qid", "para_id": "docno"})

experiment_results = pt.Experiment(
    [bm25_precomputed_pipeline, tf_idf_precomputed_pipeline, bm25_custom_precomputed_pipeline],
    queries_df,
    qrels_df,
    [P@1, P@5, P@10, R@5, R@10, nDCG@5, nDCG@10, MAP],
    names=["BM25_Precomputed","TF-IDF_Precomputed","BM25_QueryExpansion_histwords_Precomputed"]
)

In [None]:
display(experiment_results)
experiment_results.to_csv("experiment_results_first_stage.csv", index=False) # DON'T FORGET TO DOWNLOAD IT

Unnamed: 0,name,AP,P@1,P@5,P@10,R@5,R@10,nDCG@5,nDCG@10
0,BM25_Precomputed,0.625393,0.5697,0.14242,0.07121,0.7121,0.7121,0.64715,0.64715
1,TF-IDF_Precomputed,0.625463,0.57,0.14248,0.07124,0.7124,0.7124,0.647257,0.647257
2,BM25_QueryExpansion_histwords_Precomputed,0.57084,0.512,0.13292,0.06646,0.6646,0.6646,0.59432,0.59432


# Second stage retrieval pipelines (by using results generated by BM25)

Load result files generated by BM25 first retrieval (both baseline and custom)

In [None]:
input_files=["bm25_candidates.csv","custom_bm25_candidates.csv"]

raw_bm25_results_df = pd.read_csv(input_files[0])
bm25_precomputed_source = raw_bm25_results_df[['qid', 'docid', 'docno', 'score', 'rank']].copy()
raw_custom_bm25_results_df = pd.read_csv(input_files[1])
custom_bm25_precomputed_source = raw_custom_bm25_results_df[['qid', 'docid', 'docno', 'score', 'rank']].copy()

## Define baseline pipeline

This pipeline answers only with top 1 document reranked (bm25 -> monoT5)

In [None]:
# Initialize MonoT5ReRanker
monoT5_reranker = MonoT5ReRanker(batch_size=32)

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

In [None]:
context_baseline_rerank = (
    pt.Transformer.from_df(bm25_precomputed_source)
    >> pt.text.get_text(index, "context")
    >> pt.apply.generic(lambda df: df.rename(columns={'context': 'text'}))
    >> monoT5_reranker%1
)

raw_ocr_baseline_rerank = (
    pt.Transformer.from_df(bm25_precomputed_source)
    >> pt.text.get_text(index, "raw_ocr")
    >> pt.apply.generic(lambda df: df.rename(columns={'raw_ocr': 'text'}))
    >> monoT5_reranker%1
)

## Run the full pipelines then save their results as csv files

In [None]:
second_stage_pipelines = [context_baseline_rerank,raw_ocr_baseline_rerank]
output_files=["monot5_reranked_context_candidates","monot5_reranked_raw_ocr_candidates"]

queries_df = pd.DataFrame(queries)
qrels_df = pd.DataFrame(qrels)

queries_df = queries_df.rename(columns={"query_id": "qid", "question": "query"})
qrels_df = qrels_df.rename(columns={"query_id": "qid", "para_id": "docno"})

for pipeline, output_file in zip(second_stage_pipelines, output_files):
    results = pipeline.transform(queries_df)
    results.to_csv(output_file + ".csv", index=False)

## RESULTS OF THE 2ST RETRIEVAL BASELINE


In [None]:
monot5_context_precomputed_df = pd.read_csv("monot5_reranked_context_candidates.csv")
monot5_raw_ocr_precomputed_df = pd.read_csv("monot5_reranked_raw_ocr_candidates.csv")

# Create PyTerrier Transformers from the precomputed DataFrames
monot5_context_precomputed_pipeline = pt.Transformer.from_df(monot5_context_precomputed_df)
monot5_raw_ocr_precomputed_pipeline = pt.Transformer.from_df(monot5_raw_ocr_precomputed_df)

queries_df = pd.DataFrame(queries)
queries_df = queries_df.rename(columns={"query_id": "qid", "question": "query"})

qrels_df = pd.DataFrame(qrels)
qrels_df = qrels_df.rename(columns={"query_id": "qid", "para_id": "docno"})

experiment_results = pt.Experiment(
    [monot5_context_precomputed_pipeline, monot5_raw_ocr_precomputed_pipeline],
    queries_df,
    qrels_df,
    [P@1, P@5, P@10, R@5, R@10, nDCG@5, nDCG@10, MAP],
    names=["MonoT5_Reranked_Context","MonoT5_Reranked_RawOCR"]
)

In [None]:
display(experiment_results)
experiment_results.to_csv("experiment_results_baseline_second_stage.csv", index=False) # DON'T FORGET TO DOWNLOAD IT

Unnamed: 0,name,AP,P@1,P@5,P@10,R@5,R@10,nDCG@5,nDCG@10
0,MonoT5_Reranked_Context,0.63315,0.6061,0.1333,0.06665,0.6665,0.6665,0.641734,0.641734
1,MonoT5_Reranked_RawOCR,0.57835,0.5363,0.12626,0.06313,0.6313,0.6313,0.591957,0.591957


## Define custom bm25 reranking pipelines

In [None]:
context_custom_rerank = (
    pt.Transformer.from_df(custom_bm25_precomputed_source)
    >> pt.text.get_text(index, "context")
    >> pt.apply.generic(lambda df: df.rename(columns={'context': 'text'}))
    >> monoT5_reranker%3
)

raw_ocr_custom_rerank = (
    pt.Transformer.from_df(custom_bm25_precomputed_source)
    >> pt.text.get_text(index, "raw_ocr")
    >> pt.apply.generic(lambda df: df.rename(columns={'raw_ocr': 'text'}))
    >> monoT5_reranker%3
)

In [None]:
second_stage_pipelines = [context_custom_rerank,raw_ocr_custom_rerank]
output_files=["custom_monot5_reranked_context_candidates","custom_monot5_reranked_raw_ocr_candidates"]

queries_df = pd.DataFrame(queries)
qrels_df = pd.DataFrame(qrels)

queries_df = queries_df.rename(columns={"query_id": "qid", "question": "query"})
qrels_df = qrels_df.rename(columns={"query_id": "qid", "para_id": "docno"})

for pipeline, output_file in zip(second_stage_pipelines, output_files):
    results = pipeline.transform(queries_df)
    results.to_csv(output_file + ".csv", index=False)

monoT5:   0%|          | 0/1563 [00:00<?, ?batches/s]

monoT5:   0%|          | 0/1563 [00:00<?, ?batches/s]

In [None]:
custom_monot5_context_precomputed_df = pd.read_csv("custom_monot5_reranked_context_candidates.csv")
custom_monot5_raw_ocr_precomputed_df = pd.read_csv("custom_monot5_reranked_raw_ocr_candidates.csv")

# Create PyTerrier Transformers from the precomputed DataFrames
custom_monot5_context_pipeline = pt.Transformer.from_df(custom_monot5_context_precomputed_df)
custom_monot5_raw_ocr_pipeline = pt.Transformer.from_df(custom_monot5_raw_ocr_precomputed_df)

queries_df = pd.DataFrame(queries)
queries_df = queries_df.rename(columns={"query_id": "qid", "question": "query"})

qrels_df = pd.DataFrame(qrels)
qrels_df = qrels_df.rename(columns={"query_id": "qid", "para_id": "docno"})

experiment_results = pt.Experiment(
    [custom_monot5_context_pipeline, custom_monot5_raw_ocr_pipeline],
    queries_df,
    qrels_df,
    [P@1, P@5, P@10, R@5, R@10, nDCG@5, nDCG@10, MAP],
    names=["Custom_MonoT5_Reranked_Context","Custom_MonoT5_Reranked_RawOCR"]
)

In [None]:
display(experiment_results)
experiment_results.to_csv("experiment_results_custom_second_stage.csv", index=False) # DON'T FORGET TO DOWNLOAD IT

Unnamed: 0,name,AP,P@1,P@5,P@10,R@5,R@10,nDCG@5,nDCG@10
0,Custom_MonoT5_Reranked_Context,0.598117,0.5748,0.12534,0.06267,0.6267,0.6267,0.605477,0.605477
1,Custom_MonoT5_Reranked_RawOCR,0.552717,0.5159,0.1198,0.0599,0.599,0.599,0.564612,0.564612


### LLM MODEL DEFINITION FOR QUESTION ANSWERING (GEMMA3 270M)

In [None]:
import torch

# --- 1. Model and Tokenizer Definition ---
# This section loads the pre-trained Gemma 3 270M model and its tokenizer.
# The model is loaded in 4-bit quantization for memory efficiency and set for inference.
# Subsequent training (in later cells) will modify this `model` object in-place.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/gemma-3-270m-it-bnb-4bit",
    max_seq_length = 2048,
    load_in_4bit = True,
)
FastLanguageModel.for_inference(model)

# --- 2. Define QA Logic Function ---
# This function uses the `model` and `tokenizer` defined above to generate answers
# based on a given context and question. It is logically defined here because it
# directly depends on the initialized model and tokenizer objects. When the `model`
# is later trained, this function will automatically use the updated, trained model
# when called.
def gemma_qa_logic(df):
    prompts = [
        f"""### Instruction:
Given the following context, provide a concise, English-only answer to the question. If the answer is not present in the context, respond with 'Not found in context'.
{str(row['text'])[:1948]}

### Question:
{row['query']}

### Response:
"""
        for _, row in df.iterrows()
    ]

    inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True,max_length=1024).to("cuda")

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=30,
            do_sample=False,
            use_cache=True,
            repetition_penalty=1.2,
            no_repeat_ngram_size=3,
            temperature=0.0,
            eos_token_id=tokenizer.eos_token_id
        )
    input_len = inputs.input_ids.shape[1]
    decoded_outputs = tokenizer.batch_decode(outputs[:, input_len:], skip_special_tokens=True)

    cleaned_answers = []
    for ans_text in decoded_outputs:
        # Remove any leading/trailing whitespace
        ans_text = ans_text.strip()
        # Look for the 'Answer:' prefix that the LLM might generate based on the prompt
        # This part of the cleaning might not be needed as much if the prompt is perfect
        if ans_text.lower().startswith('answer:'):
            ans_text = ans_text[len('answer:'):].strip()
        # The model is now trained to output 'Response:' so we should strip that.
        if ans_text.lower().startswith('response:'):
            ans_text = ans_text[len('response:'):].strip()
        # Take only the first line if there are multiple lines generated
        ans_text = ans_text.split('\n')[0].strip()
        cleaned_answers.append(ans_text)

    df["answer"] = cleaned_answers
    return df

==((====))==  Unsloth 2026.1.4: Fast Gemma3 patching. Transformers: 4.56.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Using float16 precision for gemma3 won't work! Using float32.
Unsloth: Gemma3 does not support SDPA - switching to fast eager.


## Training the model on a subset of the training dataset

In [None]:
from datasets import load_dataset
from trl import SFTTrainer, SFTConfig

# Load the training & validation data and select a subset
train_data = load_dataset('json', data_files='train.json', split='train').select(range(10000))
val_data = load_dataset('json', data_files='validation.json', split='train').select(range(1000))

# Define the formatting function for the dataset
def formatting_prompts_func(examples):
    questions = examples["question"]
    contexts = examples["context"]
    answers = examples["answer"]
    texts = []
    for question, context, answer in zip(questions, contexts, answers):
        # Format into an instruction-response template including context
        text = f"""### Instruction:
Given the following context, provide a concise, English-only answer to the question. If the answer is not present in the context, respond with 'Not found in context'.
{context}

### Question:
{question}

### Response:
{answer}{tokenizer.eos_token}"""
        texts.append(text)
    return { "text": texts, }

# Apply the formatting function to create the train_dataset and val_dataset
train_dataset = train_data.map(formatting_prompts_func, batched=True)
val_dataset = val_data.map(formatting_prompts_func, batched=True)

In [None]:
# Apply PEFT using FastModel.get_peft_model
model = FastLanguageModel.get_peft_model(
    model,
    r = 128,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 128,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    use_rslora = False,
    loftq_config = None,
)

Unsloth: Making `model.base_model.model.model` require gradients


In [None]:
# Set up the SFTTrainer
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    eval_dataset = val_dataset, # Set up evaluation!
    args = SFTConfig(
        dataset_text_field = "text",
        per_device_train_batch_size = 4,
        gradient_accumulation_steps = 1, # Use GA to mimic batch size!
        warmup_steps = 5,
        # num_train_epochs = 1, # Set this for 1 full training run.
        max_steps = 70,
        learning_rate = 5e-6,
        logging_steps = 10,
        optim = "adamw_8bit",
        weight_decay = 0.001,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir="outputs",
        report_to = "none", # Use TrackIO/WandB etc
    ),
)

print("Training dataset and SFTTrainer setup complete.")
print(f"Number of examples in training dataset: {len(train_dataset)}")
print(f"Number of examples in validation dataset: {len(val_dataset)}")
print("First formatted example from training dataset:")
print(train_dataset[5]["text"])

trainer.train()

Unsloth: Switching to float32 training since model cannot work with float16
🦥 Unsloth: Padding-free auto-enabled, enabling faster training.
Training dataset and SFTTrainer setup complete.
Number of examples in training dataset: 10000
Number of examples in validation dataset: 1000
First formatted example from training dataset:
### Instruction:
Given the following context, provide a concise, English-only answer to the question. If the answer is not present in the context, respond with 'Not found in context'.
At Westmoreland, Mrs. Sally Lincoln, wife of Mr. Spencer L. aged 28.  At Henrico, Mrs. Polly Adams, consort On Saturday, the 11th ult. Mr. Joseph Meyer, of Hampstead, was found dead in the road, (his horse standing by him) when oy e g e SMITH & RUST Pocket Book Lost.  "LOST last Wednesday between 7 and 8 o’clock in the afternoon, either in the Globe Tavern at the Plains, or on the road leading from thence to Portsmouth, a new Red Morocco Pocket Book ; containing some Money, Notes of 

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 10,000 | Num Epochs = 1 | Total steps = 70
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 1 x 1) = 4
 "-____-"     Trainable parameters = 30,375,936 of 298,474,112 (10.18% trained)


Step,Training Loss
10,4.1479
20,3.9789
30,4.1343
40,4.1585
50,3.9592
60,4.3108
70,4.0838


TrainOutput(global_step=70, training_loss=4.110481752668108, metrics={'train_runtime': 55.4899, 'train_samples_per_second': 5.046, 'train_steps_per_second': 1.261, 'total_flos': 79013251042560.0, 'train_loss': 4.110481752668108, 'epoch': 0.028})

### DEFINE CUSTOM PIPELINE

This pipeline use LLM model to answer based on reranked documents (top1 then top1-3 if possible)

In [None]:
custom_monot5_context_reranked_df_full = pd.read_csv("custom_monot5_reranked_context_candidates.csv")
custom_monot5_raw_ocr_reranked_df_full = pd.read_csv("custom_monot5_reranked_raw_ocr_candidates.csv")

# Make custom_monot5_context_top1_df and custom_monot5_raw_ocr_top1_df global
custom_monot5_context_top1_df = custom_monot5_context_reranked_df_full.loc[custom_monot5_context_reranked_df_full.groupby('qid')['score'].idxmax()].reset_index(drop=True)
custom_monot5_raw_ocr_top1_df = custom_monot5_raw_ocr_reranked_df_full.loc[custom_monot5_raw_ocr_reranked_df_full.groupby('qid')['score'].idxmax()].reset_index(drop=True)

# Helper Transformer to merge input queries with precomputed results
# This is necessary because pt.Experiment passes queries_df (qid, query) as input,
# but the QA pipelines need the corresponding docno, score, and text from the reranked documents.
class QAInputPreparer(pt.Transformer):
    def __init__(self, precomputed_reranked_df_source):
        self.precomputed_reranked_df_source = precomputed_reranked_df_source

    def transform(self, queries_input_df):
        merged_df = pd.merge(queries_input_df[['qid']], self.precomputed_reranked_df_source, on='qid', how='inner', suffixes=('_input', ''))
        return merged_df

# --- Define 4 pipelines ---

# 1. QA with Context (Top 1 document)
# custom_monot5_context_top1_df already contains 'qid', 'query', 'docno', 'score', 'rank', 'text'
qa_context_top1_from_monot5 = (
    QAInputPreparer(custom_monot5_context_top1_df)
    >> pt.apply.generic(gemma_qa_logic, batch_size=32, verbose=True)
)

# 2. QA with Raw OCR (Top 1 document)
# custom_monot5_raw_ocr_top1_df already contains 'qid', 'query', 'docno', 'score', 'rank', 'text'
qa_raw_ocr_top1_from_monot5 = (
    QAInputPreparer(custom_monot5_raw_ocr_top1_df)
    >> pt.apply.generic(gemma_qa_logic, batch_size=32, verbose=True)
)

# 3. QA with Context (Top N documents merged)
def merge_top_texts(df):
    merged_data = []

    for qid, group in df.groupby("qid"):
        query = group["query"].iloc[0]
        full_context = ""

        for i, row in enumerate(group.itertuples()):
            full_context += f"\n--- Document {i+1} ---\n{row.text}"

        merged_data.append({
            "qid": qid,
            "query": query,
            "text": full_context.strip(),
            "docno": f"merged_{qid}",
        })

    return pd.DataFrame(merged_data)
# custom_monot5_context_reranked_df_full already contains 'qid', 'query', 'docno', 'score', 'rank', 'text'
qa_merged_context_from_monot5 = (
    QAInputPreparer(custom_monot5_context_reranked_df_full) # Use the full reranked results
    >> pt.apply.generic(merge_top_texts)
    >> pt.apply.generic(gemma_qa_logic, batch_size=64, verbose=True)
)

# 4. QA with Raw OCR (Top N documents merged)
# custom_monot5_raw_ocr_reranked_df_full already contains 'qid', 'query', 'docno', 'score', 'rank', 'text'
qa_merged_raw_ocr_from_monot5 = (
    QAInputPreparer(custom_monot5_raw_ocr_reranked_df_full) # Use the full reranked results
    >> pt.apply.generic(merge_top_texts)
    >> pt.apply.generic(gemma_qa_logic, batch_size=64, verbose=True)
)

print("Four LLM QA pipelines created: Context Top1, Raw OCR Top1, Merged Context (Top N), Merged Raw OCR (Top N).")

Four LLM QA pipelines created: Context Top1, Raw OCR Top1, Merged Context (Top N), Merged Raw OCR (Top N).


### TESTING QA model on a subset of the queries

In [None]:
queries_df = pd.DataFrame(queries)
qrels_df = pd.DataFrame(qrels)

queries_df = queries_df.rename(columns={"query_id": "qid", "question": "query"})
qrels_df = qrels_df.rename(columns={"query_id": "qid", "para_id": "docno"})

num_queries_subset = 100
queries_subset_df = queries_df.head(num_queries_subset)
qrels_subset_df = qrels_df[qrels_df['qid'].isin(queries_subset_df['qid'])].copy()

# --- Example LLM QA Output (Context Top 1) ---
print("\n--- Demonstrating LLM QA Pipeline Output (Context Top 1) ---")
# Take a very small sample for demonstration (e.g., first 3 queries)
queries_sample_for_display_df = queries_subset_df.head(10).copy()

# Apply one of the QA pipelines to the small sample
# The pipeline expects a DataFrame with 'qid' and 'query' as input
sample_results_top1_df = qa_context_top1_from_monot5.transform(queries_sample_for_display_df)

# Print the relevant columns from the sample results
print("Sample Results for first 3 queries (Top 1 Context):")
for idx, row in sample_results_top1_df.iterrows():
    print(f"\nQuery (qid: {row['qid']}): {row['query']}")
    print(f"Context used (from docno: {row['docno']}): {row['text'][:300]}...")
    print(f"Generated Answer: {row['answer']}")
print("--------------------------------------------------------------")

# --- Example LLM QA Output (Merged Context Top N) ---
print("\n--- Demonstrating LLM QA Pipeline Output (Merged Context Top N) ---")
# Apply the merged QA pipeline to the same small sample
sample_results_merged_df = qa_merged_context_from_monot5.transform(queries_sample_for_display_df)

# Print the relevant columns from the sample results
print("Sample Results for first 3 queries (Merged Context):")
for idx, row in sample_results_merged_df.iterrows():
    print(f"\nQuery (qid: {row['qid']}): {row['query']}")
    print(f"Context used (from docno: {row['docno']}): {row['text'][:300]}...")
    print(f"Generated Answer: {row['answer']}")
print("--------------------------------------------------------------")


--- Demonstrating LLM QA Pipeline Output (Context Top 1) ---


pt.apply:   0%|          | 0/1 [00:00<?, ?row/s]

Sample Results for first 3 queries (Top 1 Context):

Query (qid: test_1): How many lots did Thomas Peirce have
Context used (from docno: New_Hampshire_18030125_): Axivil Roberts, part of lot 180 108 60 Capt. George Walker, 181 140 35 George Townson, 183 48 19 Samuel Snell, 184 36 9 Samuel Waterhouse, 185 24 6 John Parker, 186 36 10 John Davis, 187 45 20 John Cross, 188 15 4 Benjamin Cross, 189 50 13 Widow Gilman, 209 21 8 George Peirce, 209 200 75 Thomas Peir...
Generated Answer: None"""

Query (qid: test_10): Who gave Hamilton the substance of what he had proposed on the part of General Hamilton
Context used (from docno: West_Virginia_18040803_9): General Hamilton would declare to the best of his recollection what passed in that conversation, and Mr. P. read to Mr. V. N.'s paper containing the substance of what Gen. H. would say on that subject, which was follows—” _ /, f No. 6—‘ Gen. H. says he cannot imagine to what Dr. Cooper may have allu...
Generated Answer: Mr." George Washingto

pt.apply:   0%|          | 0/1 [00:00<?, ?row/s]

Sample Results for first 3 queries (Merged Context):

Query (qid: test_1): How many lots did Thomas Peirce have
Context used (from docno: merged_test_1): --- Document 1 ---
Vol. XVIII. HONOLULU, NOVEMBER 2, 1861. No. 27. PUBLISHED WEEKLY AT HONOLULU, Hawaiian Islands. Abraham Fornander, .... Editor. Business Cards. CHAS. W. BISHOP. A. ALDRICH. BISHOP & CO., BANKERS, Office in the east corner of Makee's Block, Kaahumanu street, Honolulu. Draw Bills of...
Generated Answer: A City Edition Answer July May October Washington County... Alexandria… April June September August November Twenty thousand houses.... United Virginia..... Jackson Valley...... Lake Montgomery.......

Query (qid: test_10): Who gave Hamilton the substance of what he had proposed on the part of General Hamilton
Context used (from docno: merged_test_10): --- Document 1 ---
General Hamilton would declare to the best of his recollection what passed in that conversation, and Mr. P. read to Mr. V. N.'s paper containing the s

In [None]:
queries_df = pd.DataFrame(queries)
qrels_df = pd.DataFrame(qrels)

queries_df = queries_df.rename(columns={"query_id": "qid", "question": "query"})
qrels_df = qrels_df.rename(columns={"query_id": "qid", "para_id": "docno"})

llm_qa_pipelines = [qa_context_top1_from_monot5, qa_raw_ocr_top1_from_monot5]
llm_output_files = ["llm_answers_context_top1", "llm_answers_raw_ocr_top1"]

for pipeline, output_file in zip(llm_qa_pipelines, llm_output_files):
    results = pipeline.transform(queries_df)
    results.to_csv(output_file + ".csv", index=False)
    print(f"Saved '{output_file}.csv'")

print("Transformation complete. Check the generated CSV files for LLM answers.")

pt.apply:   0%|          | 0/313 [00:00<?, ?row/s]

Saved 'llm_answers_context_top1.csv'


pt.apply:   0%|          | 0/313 [00:00<?, ?row/s]

Saved 'llm_answers_raw_ocr_top1.csv'
Transformation complete. Check the generated CSV files for LLM answers.


### Using 3 documents instead of 1 only document for Question Answering by LLM

To doing so, we have to merge the texts of each documents in one prompt in order to allow LLM model to consider all of them.

In [None]:
queries_df = pd.DataFrame(queries)
qrels_df = pd.DataFrame(qrels)

queries_df = queries_df.rename(columns={"query_id": "qid", "question": "query"})
qrels_df = qrels_df.rename(columns={"query_id": "qid", "para_id": "docno"})

llm_qa_pipelines = [qa_merged_context_from_monot5, qa_merged_raw_ocr_from_monot5]
llm_output_files = ["llm_answers_context_top3", "llm_answers_raw_ocr_top3"]

for pipeline, output_file in zip(llm_qa_pipelines, llm_output_files):
    results = pipeline.transform(queries_df)
    results.to_csv(output_file + ".csv", index=False)
    print(f"Saved '{output_file}.csv'")

print("Transformation complete. Check the generated CSV files for LLM answers.")

pt.apply:   0%|          | 0/157 [00:00<?, ?row/s]

Saved 'llm_answers_context_top3.csv'


pt.apply:   0%|          | 0/157 [00:00<?, ?row/s]

Saved 'llm_answers_raw_ocr_top3.csv'
Transformation complete. Check the generated CSV files for LLM answers.


In [None]:
# Take a small subset of queries for demonstration
queries_subset_df_for_merge_test = queries_df.head(1)

# Create a temporary pipeline to show the output after merge_top_texts
temp_merge_pipeline = (
    pt.Transformer.from_df(monot5_context_reranked_df)  # Start with all reranked documents
    >> pt.apply.generic(merge_top_texts)
)

# Apply the pipeline to the subset of queries
merged_output_df = temp_merge_pipeline.transform(queries_subset_df_for_merge_test)

print("DataFrame after merge_top_texts (first query):")
# Display the relevant columns, especially the 'text' column
display(merged_output_df[['qid', 'query', 'docno', 'text']].head())

print("\nContent of the merged 'text' for the first query:")
print(merged_output_df['text'].iloc[0])

print("\nAlso, let's verify how many documents were fed to merge_top_texts for the first query.")
# The original monot5_context_reranked_df should contain 3 documents per query at this stage
initial_docs_for_first_query = monot5_context_reranked_df[monot5_context_reranked_df['qid'] == queries_subset_df_for_merge_test['qid'].iloc[0]]
print(f"Number of documents initially retrieved for the first query (before merging): {len(initial_docs_for_first_query)}")

DataFrame after merge_top_texts (first query):


Unnamed: 0,qid,query,docno,text
0,test_1,How many lots did Thomas Peirce have,merged_test_1,"--- Document 1 ---\nIn Gloucester county, (N.J..."



Content of the merged 'text' for the first query:
--- Document 1 ---
In Gloucester county, (N.J.) on the 8th ult. THOMAS SEEDE, a native of the county, aged 104 years, 1 month & 20 days. It is worthy of remark that there have died in this county within the space of two years, three citizens, whose ages together amount to upwards of 317 years.  In this town, Mr. GEORGE Hart; A child of Mr. Gold, and a child of Mr. Thomas Peirce. NOTICE—MAPS OF THE UNITED STATES. NEW, correct and beautifully varnished MAPS of the United States, various prices just received and for sale at the Book store of C. PEIRCE, Daniel street, Portsmouth, where may also be had Account Books, of almost all sizes, prices and qualities. Also A fresh supply of BOOKS & STATIONERY; Penknives; Razors; Scissors; Inkstands; Ink Powder; Wafers; Black- Sand; Marking Ink for Linen; Wash- balls; best Windsor Soap; Ivory and bone paper Folders and Cutters; silver Pencil Cases and Pencils; Morocco-Pocket and Memorandum Books; Lad

### Using ASPIRE

PyTerrier : https://pyterrier.readthedocs.io/en/latest/experiments.html

ASPIRE https://github.com/GiorgosPeikos/ASPIRE

ONLINE: https://aspire-ir-eval.streamlit.app/



Remember the res files are actually the rankings created by our retrieval model!

We downloading it to use another tool for data analysis. The tool is called ASPIRE.

ASPIRE requires a different input format!

In short: this code generates results, queries, qrels files which can be uploaded to ASPIRE

In [None]:
import pandas as pd
import xml.etree.ElementTree as ET
from xml.dom import minidom

# --- Define queries_df and qrels_df ---
# These DataFrames are essential for generating queries.xml and qrels.csv.
# They are created from the global 'queries' and 'qrels' lists,
# which are loaded from your test.json file at the beginning of the notebook.
queries_df = pd.DataFrame(queries)
qrels_df = pd.DataFrame(qrels)

# Rename columns to match PyTerrier and ASPIRE's expected formats
queries_df = queries_df.rename(columns={"query_id": "qid", "question": "query"})
qrels_df = qrels_df.rename(columns={"query_id": "qid", "para_id": "docno"})

# ----- Function to Save Retrieval Results (for ASPIRE) -----
def save_res(df, run_name, out_path, sep="\t"):
    res = df.copy()  # Create a copy to avoid modifying the original DataFrame
    # Normalize 'docid' to 'docno' if necessary
    if "docid" in res.columns and "docno" not in res.columns:
        res = res.rename(columns={"docid": "docno"})
    # Remove query text columns as they are not needed in the result file
    res = res.drop(columns=["query", "query_0"], errors="ignore")
    res["iteration"] = "Q0"  # Set fixed iteration value as required by ASPIRE
    res["experiment_id"] = run_name  # Label for the experimental run
    res = res.rename(columns={"qid": "query_id", "docno": "doc_id"})  # Rename for ASPIRE compatibility
    res["score"] = pd.to_numeric(res["score"]) # Ensure score is numeric
    # Sort results by query_id and score to determine rank
    res = res.sort_values(by=["query_id", "score"], ascending=[True, False])
    # Assign ranks based on sorted scores within each query
    res["rank"] = res.groupby("query_id")["score"].rank(ascending=False, method="first").astype(int)
    # Select and order columns as required by ASPIRE
    res = res[["query_id", "iteration", "doc_id", "rank", "score", "experiment_id"]]
    res.to_csv(out_path, sep=sep, index=False, header=False)  # Save to CSV without header

# --- Load Precomputed Candidates from all first-stage pipelines ---
input_result_files = [
    ("bm25_candidates.csv", "BM25_Baseline", "results_bm25_baseline.csv"),
    ("tf-idf_candidates.csv", "TFIDF_Baseline", "results_tf_idf_baseline.csv"),
    ("custom_bm25_candidates.csv", "BM25_HistQE", "results_bm25_hist_qe.csv")
]

for file_path, run_name, output_filename in input_result_files:
    try:
        # Load the raw results DataFrame
        raw_results_df = pd.read_csv(file_path)

        # Prepare the DataFrame for processing by save_res
        processed_df = raw_results_df[['qid', 'docid', 'docno', 'score', 'rank']].copy()
        save_res(processed_df, run_name, output_filename)
        print(f"Successfully loaded and reformatted {file_path}. Generated {output_filename}.")
    except Exception as e:
        print(f"Could not load or process {file_path}: {e}")

# ----- Generate Queries.xml (for ASPIRE) -----
# Create the root element for the XML file with a task attribute
root = ET.Element("topics", attrib={"task": "Chronicling America QA"})

# Iterate through each query in 'queries_df' and add it as a '<topic>' element
for _, row in queries_df.iterrows():
    topic = ET.SubElement(root, "topic", attrib={"number": str(row["qid"])}) # Add topic node with qid
    topic.text = str(row["query"])  # Add query text as content of the topic node

# Pretty print the XML for better readability
rough_xml = ET.tostring(root, encoding="utf-8")
pretty_xml = minidom.parseString(rough_xml).toprettyxml(indent="  ", encoding="utf-8")

# Write the formatted XML to a file
with open("queries.xml", "wb") as f:
    f.write(pretty_xml)

print("Generated queries.xml file.")

# ----- Generate Qrels.csv (for ASPIRE) -----
qrels_out = qrels_df.copy() # Create a copy of the qrels DataFrame
# Remove 'iteration' if it exists, as we'll set it specifically for ASPIRE
qrels_out = qrels_out.drop(columns=["iteration"], errors="ignore")
# Rename 'docid' to 'docno' if it exists for consistency
if "docid" in qrels_out.columns and "docno" not in qrels_out.columns:
    qrels_out = qrels_out.rename(columns={"docid": "docno"})
# Rename columns to match ASPIRE's expected format
qrels_out = qrels_out.rename(columns={"qid": "query_id", "docno": "doc_id"})
qrels_out["iteration"] = "Q0" # Set the iteration column to "Q0" as required
# Select and order columns as required by ASPIRE (no header)
qrels_out = qrels_out[["query_id", "iteration", "doc_id", "relevance"]]
qrels_out.to_csv("qrels.csv", sep=" ", index=False, header=False)

print("Generated qrels.csv file.")

print("All files generated for ASPIRE compatibility.")

Successfully loaded and reformatted bm25_candidates.csv. Generated results_bm25_baseline.csv.
Successfully loaded and reformatted tf-idf_candidates.csv. Generated results_tf_idf_baseline.csv.
Successfully loaded and reformatted custom_bm25_candidates.csv. Generated results_bm25_hist_qe.csv.
Generated queries.xml file.
Generated qrels.csv file.
All files generated for ASPIRE compatibility.


In [None]:
# To use ASPIRE, we need to download the res files, the qrels and queries in the
# required dataformat and use the interface or use it locally.

# Res file Expected format: query_id, iteration (i.e. Q0), doc_id, rank, score, experiment_id
# Queries Expected format: query_id, query
# Qrels Expected format: query_id, iteration (i.e. Q0), doc_id, relevance (without header row)

# DEMO TIME!

# Evaluate generated answer by gemma-3-270m

## Load the result files generated by the QA Model

In [5]:
import pandas as pd

input_result_files = [
    "llm_answers_context_top1.csv",
    "llm_answers_raw_ocr_top1.csv",
    "llm_answers_context_top3.csv",
    "llm_answers_raw_ocr_top3.csv"
]

llm_answers_context_top1_df = pd.read_csv(input_result_files[0])
llm_answers_raw_ocr_top1_df = pd.read_csv(input_result_files[1])
llm_answers_context_top3_df = pd.read_csv(input_result_files[2])
llm_answers_raw_ocr_top3_df = pd.read_csv(input_result_files[3])

print("### Sample LLM Answers from Context Top 1 Reranking ###")

# Select the first 5 rows as a sample
sample_llm_answers = llm_answers_context_top1_df.head(5)

# Iterate through each selected row and print the relevant columns
for index, row in sample_llm_answers.iterrows():
    print(f"\n--- Sample {index + 1} ---")
    print(f"Query: {row['query']}")
    print(f"Context (truncated): {row['text'][:500]}...")
    print(f"LLM Answer: {row['answer']}")

### Sample LLM Answers from Context Top 1 Reranking ###

--- Sample 1 ---
Query: How many lots did Thomas Peirce have
Context (truncated): Axivil Roberts, part of lot 180 108 60 Capt. George Walker, 181 140 35 George Townson, 183 48 19 Samuel Snell, 184 36 9 Samuel Waterhouse, 185 24 6 John Parker, 186 36 10 John Davis, 187 45 20 John Cross, 188 15 4 Benjamin Cross, 189 50 13 Widow Gilman, 209 21 8 George Peirce, 209 200 75 Thomas Peirce, 220 185 71 SIXTH RANGE: Col. Henry Sherburne, 241 552 150 Nathaniel Roberts, 249 30 8 Jonathan Patridge, 252 60 18 John Sherburne’s widow, 253 24 7 Edward Wells, 254 120 50 John Collins, 272 60 90...
LLM Answer: None"""

--- Sample 2 ---
Query: Who gave Hamilton the substance of what he had proposed on the part of General Hamilton
Context (truncated): General Hamilton would declare to the best of his recollection what passed in that conversation, and Mr. P. read to Mr. V. N.'s paper containing the substance of what Gen. H. would say on that subject, w

## Load the actual answers of the queries

In [6]:
import json

with open('test_query_answers.json', 'r', encoding='utf-8') as f:
    ground_truth_answers = json.load(f)

ground_truth_df = pd.DataFrame(ground_truth_answers)

# Rename columns for clarity and consistency with LLM answer dataframes
ground_truth_df = ground_truth_df.rename(columns={
    'query_id': 'qid',
    'answer': 'original_answer',
    'org_answer': 'original_org_answer'
})[['qid', 'original_answer', 'original_org_answer']]

# --- Filter ground_truth_df to match the 10,000 queries used for LLM generation ---
# The 'queries' variable already holds the 10,000 queries from test_queries.json
# We extract their qids
llm_generated_qids = set([q['query_id'] for q in queries])

# Filter ground_truth_df to only include these 10,000 qids
ground_truth_df = ground_truth_df[ground_truth_df['qid'].isin(llm_generated_qids)].reset_index(drop=True)

print("Ground truth answers loaded and prepared:")
print(ground_truth_df.head())
print(f"Shape of ground_truth_df: {ground_truth_df.shape}")

Ground truth answers loaded and prepared:
       qid            original_answer        original_org_answer
0   test_1                        183                        183
1  test_10                     Taylor                     Taylor
2  test_11  M. R. I. A. Subscriptions  M. R. I. A. Subscriptions
3  test_12                          5                          5
4  test_13               July 01,1796             1st July, 1796
Shape of ground_truth_df: (10000, 3)


# Install evaluation libraries

In [7]:
!pip install rouge-score nltk

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=a135f5a0a201d87b0a8283414e53a0786b6b917e67cb354aeccb28948ec7541e
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


## Load_All_LLM_Answers_and_GroundTruth

Load all four LLM answer CSVs and the ground truth answers, then merge them into separate DataFrames for evaluation.


In [13]:
# 2. Create an empty dictionary to store the merged evaluation DataFrames
eval_dfs = {}

# 3. Iterate through the list of LLM answer file paths
for file_path in input_result_files:
    print(f"Processing {file_path}...")
    try:
        # a. Load each LLM answer CSV into a Pandas DataFrame
        llm_df = pd.read_csv(file_path, engine='python', on_bad_lines='skip')

        # b. Rename the 'answer' column in llm_df to 'prediction'
        if 'answer' in llm_df.columns:
            llm_df = llm_df.rename(columns={'answer': 'prediction'})
        else:
            print(f"Warning: 'answer' column not found in {file_path}. Skipping rename.")

        # c. Merge llm_df with the ground_truth_df on the 'qid' column
        # Ensure ground_truth_df is correctly defined from previous steps
        merged_df = llm_df.merge(ground_truth_df, on='qid', how='inner')

        # d. Store the resulting merged DataFrame in the eval_dfs dictionary
        # Use the base name of the CSV file without the extension as the key
        key = os.path.basename(file_path).replace('.csv', '')
        eval_dfs[key] = merged_df
        print(f"Successfully merged {key}.")

    except FileNotFoundError:
        print(f"Error: File not found at {file_path}. Please ensure the path is correct.")
    except Exception as e:
        print(f"An error occurred while processing {file_path}: {e}")

# 4. Print the head of each newly created eval_df to inspect the merged data.
print("\n--- Head of each merged evaluation DataFrame ---")
for key, df in eval_dfs.items():
    print(f"\nDataFrame: {key}")
    display(df.head())


Processing llm_answers_context_top1.csv...
Successfully merged llm_answers_context_top1.
Processing llm_answers_raw_ocr_top1.csv...
Successfully merged llm_answers_raw_ocr_top1.
Processing llm_answers_context_top3.csv...
Successfully merged llm_answers_context_top3.
Processing llm_answers_raw_ocr_top3.csv...
Successfully merged llm_answers_raw_ocr_top3.

--- Head of each merged evaluation DataFrame ---

DataFrame: llm_answers_context_top1


Unnamed: 0,qid,query,docid,docno,text,score,rank,prediction,original_answer,original_org_answer
0,test_1,How many lots did Thomas Peirce have,129025,New_Hampshire_18030125_,"Axivil Roberts, part of lot 180 108 60 Capt. G...",-0.047048,0,"None""""""",183,183
1,test_10,Who gave Hamilton the substance of what he had...,129031,West_Virginia_18040803_9,General Hamilton would declare to the best of ...,-0.00666,0,"Mr."" George Washington""",Taylor,Taylor
2,test_100,Who informs his FRIENDS and the PUBLIC that he...,129103,Virginia_18090202_2,"Dr. Ree's Cyclopædia, No. 16, is engraved, and...",-0.00148,0,The Alexandria Evening Inn,WILLIAM EATON,WILLIAM EATON
3,test_1000,Who was the Secretary of the Treasury of the U...,47949,Illinois_18691112_67,"In the matter of the sale of bonds, designated...",-6.049517,0,"The Bank at Washington Dalseville""",Jeremiah Parker,Jeremiah Parker
4,test_10000,Who made a speech in front of the Brooks House,39744,New_York_18600903_39,Firecrackers were being set off from all parts...,-0.005824,0,"Mr."" George Washington""",John M. Wilson,John M. Wilson



DataFrame: llm_answers_raw_ocr_top1


Unnamed: 0,qid,query,docid,docno,text,score,rank,prediction,original_answer,original_org_answer
0,test_1,How many lots did Thomas Peirce have,129025,New_Hampshire_18030125_,"Axivil Roberts, partof lot 180 108 g 0\nCapt. ...",-0.03588,0,"A City of England""""""",183,183
1,test_10,Who gave Hamilton the substance of what he had...,129031,West_Virginia_18040803_9,General Hamilton\nwould declare to the best of...,-0.012321,0,"George Washington""""""",Taylor,Taylor
2,test_100,Who informs his FRIENDS and the PUBLIC that he...,129103,Virginia_18090202_2,"* _\n, Dr. Rze's .Cyglopcdza, -\nNo. 16, is x'...",-0.007839,0,"The People.""",WILLIAM EATON,WILLIAM EATON
3,test_1000,Who was the Secretary of the Treasury of the U...,52695,New_Mexico_18610921_13,"ih\nty years, and after thut period nnleemable...",-5.852666,0,"The newly formed State Bank.""",Jeremiah Parker,Jeremiah Parker
4,test_10000,Who made a speech in front of the Brooks House,49739,Virginia_18680502_19,Mr. Brooks divided his hour among the\nDemocra...,-0.029174,0,"The Speakers at last.""",John M. Wilson,John M. Wilson



DataFrame: llm_answers_context_top3


Unnamed: 0,qid,query,text,docno,score,prediction,original_answer,original_org_answer
0,test_1,How many lots did Thomas Peirce have,"--- Document 1 ---\nVol. XVIII. HONOLULU, NOVE...",merged_test_1,-5.545956,A City Edition Answer July May October Washing...,183,183
1,test_10,Who gave Hamilton the substance of what he had...,--- Document 1 ---\nGeneral Hamilton would dec...,merged_test_10,-0.059793,"Mr."" George Washington""",Taylor,Taylor
2,test_100,Who informs his FRIENDS and the PUBLIC that he...,"--- Document 1 ---\nDr. Ree's Cyclopædia, No. ...",merged_test_100,-0.058586,The Alexandria Evening Inn,WILLIAM EATON,WILLIAM EATON
3,test_1000,Who was the Secretary of the Treasury of the U...,--- Document 1 ---\nThe Secretary of War has b...,merged_test_1000,-6.962624,the newly formed Bank at Washington Dalseville...,Jeremiah Parker,Jeremiah Parker
4,test_10000,Who made a speech in front of the Brooks House,--- Document 1 ---\nMr. Brooks divided his hou...,merged_test_10000,-3.671332,"A gentleman.""",John M. Wilson,John M. Wilson



DataFrame: llm_answers_raw_ocr_top3


Unnamed: 0,qid,query,text,docno,score,prediction,original_answer,original_org_answer
0,test_1,How many lots did Thomas Peirce have,"--- Document 1 ---\nAxivil Roberts, partof lot...",merged_test_1,-5.845218,"A City Edition""""""",183,183
1,test_10,Who gave Hamilton the substance of what he had...,--- Document 1 ---\nGeneral Hamilton\nwould de...,merged_test_10,-0.259576,"The private matter presented under special note.""",Taylor,Taylor
2,test_100,Who informs his FRIENDS and the PUBLIC that he...,"--- Document 1 ---\n* _\n, Dr. Rze's .Cyglopcd...",merged_test_100,-0.087709,"The People.""",WILLIAM EATON,WILLIAM EATON
3,test_1000,Who was the Secretary of the Treasury of the U...,--- Document 1 ---\n1J1L TKElsrUr R'l TaULT.\n...,merged_test_1000,-7.545748,The newly formed state c-.ishere under an edic...,Jeremiah Parker,Jeremiah Parker
4,test_10000,Who made a speech in front of the Brooks House,--- Document 1 ---\nMr. Brooks divided his hou...,merged_test_10000,-8.308628,The Mayor suggests an apology from Mrs Higgins...,John M. Wilson,John M. Wilson


All LLM answer files are loaded and merged with the ground truth, The code will calculate the ROUGE and BLEU scores for each of the four evaluation DataFrames. It will define a helper function to perform the scoring and then apply it to each DataFrame in the `eval_dfs` dictionary, storing the results in a new dictionary for summarization.



In [15]:
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer

nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True) # Added to download the missing resource
smooth = SmoothingFunction().method1

def evaluate_llm_answers(eval_df):
    scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
    rouge1, rouge2, rougeL, bleu_scores = [], [], [], []

    # Ensure 'original_answer' and 'prediction' columns are strings and handle NaN values
    eval_df["original_answer"] = eval_df["original_answer"].fillna("").astype(str)
    eval_df["prediction"] = eval_df["prediction"].fillna("").astype(str)

    for _, row in eval_df.iterrows():
        # ROUGE scores
        scores = scorer.score(row["original_answer"], row["prediction"])
        rouge1.append(scores["rouge1"].fmeasure)
        rouge2.append(scores["rouge2"].fmeasure)
        rougeL.append(scores["rougeL"].fmeasure)

        # BLEU score
        reference = [nltk.word_tokenize(row["original_answer"])]
        candidate = nltk.word_tokenize(row["prediction"])
        bleu_scores.append(sentence_bleu(reference, candidate, smoothing_function=smooth))

    return {
        "ROUGE-1": sum(rouge1) / len(rouge1) if rouge1 else 0,
        "ROUGE-2": sum(rouge2) / len(rouge2) if rouge2 else 0,
        "ROUGE-L": sum(rougeL) / len(rougeL) if rougeL else 0,
        "BLEU": sum(bleu_scores) / len(bleu_scores) if bleu_scores else 0,
    }

# Dictionary to store evaluation results for each configuration
all_evaluation_results = {}

# Iterate through each merged DataFrame and evaluate
print("--- Evaluating LLM Answer Configurations ---")
if not eval_dfs: # Added check for empty eval_dfs
    print("Warning: eval_dfs dictionary is empty. Please ensure the previous cell (Load_All_LLM_Answers_and_GroundTruth) has been executed.")
for key, df in eval_dfs.items():
    print(f"\nEvaluating {key}...")
    results = evaluate_llm_answers(df)
    all_evaluation_results[key] = results
    print(f"  ROUGE-1: {results['ROUGE-1']:.4f}")
    print(f"  ROUGE-2: {results['ROUGE-2']:.4f}")
    print(f"  ROUGE-L: {results['ROUGE-L']:.4f}")
    print(f"  BLEU:    {results['BLEU']:.4f}")

print("\n--- Summary of All Evaluation Results ---")
for key, results in all_evaluation_results.items():
    print(f"\nConfiguration: {key}")
    for metric, score in results.items():
        print(f"  {metric}: {score:.4f}")

--- Evaluating LLM Answer Configurations ---

Evaluating llm_answers_context_top1...
  ROUGE-1: 0.0749
  ROUGE-2: 0.0106
  ROUGE-L: 0.0747
  BLEU:    0.0077

Evaluating llm_answers_raw_ocr_top1...
  ROUGE-1: 0.0599
  ROUGE-2: 0.0086
  ROUGE-L: 0.0597
  BLEU:    0.0043

Evaluating llm_answers_context_top3...
  ROUGE-1: 0.0503
  ROUGE-2: 0.0071
  ROUGE-L: 0.0502
  BLEU:    0.0045

Evaluating llm_answers_raw_ocr_top3...
  ROUGE-1: 0.0371
  ROUGE-2: 0.0050
  ROUGE-L: 0.0370
  BLEU:    0.0023

--- Summary of All Evaluation Results ---

Configuration: llm_answers_context_top1
  ROUGE-1: 0.0749
  ROUGE-2: 0.0106
  ROUGE-L: 0.0747
  BLEU: 0.0077

Configuration: llm_answers_raw_ocr_top1
  ROUGE-1: 0.0599
  ROUGE-2: 0.0086
  ROUGE-L: 0.0597
  BLEU: 0.0043

Configuration: llm_answers_context_top3
  ROUGE-1: 0.0503
  ROUGE-2: 0.0071
  ROUGE-L: 0.0502
  BLEU: 0.0045

Configuration: llm_answers_raw_ocr_top3
  ROUGE-1: 0.0371
  ROUGE-2: 0.0050
  ROUGE-L: 0.0370
  BLEU: 0.0023


Save the results in a csv file

In [16]:
import pandas as pd

# Convert the dictionary of evaluation results to a DataFrame
results_df = pd.DataFrame.from_dict(all_evaluation_results, orient='index')

# Save the DataFrame to a CSV file
output_csv_path = "llm_evaluation_results.csv"
results_df.to_csv(output_csv_path, index=True, index_label="Configuration")

print(f"Evaluation results saved to {output_csv_path}")
print("Here's a preview of the saved data:")
print(results_df)

Evaluation results saved to llm_evaluation_results.csv
Here's a preview of the saved data:
                           ROUGE-1   ROUGE-2   ROUGE-L      BLEU
llm_answers_context_top1  0.074928  0.010555  0.074712  0.007696
llm_answers_raw_ocr_top1  0.059911  0.008631  0.059709  0.004322
llm_answers_context_top3  0.050272  0.007139  0.050157  0.004509
llm_answers_raw_ocr_top3  0.037144  0.005003  0.037016  0.002276
