# IR PROJECT CODE

Members:

*   Omer
*   Lucas
*   Muhamad

# Download and Inspect the Collection

The dataset was created from the Chronicling America collection — over 21 million digitized newspaper pages (1756–1963) curated by the Library of Congress and NEH. They used 39,330 pages (1800–1920), representing 53 US states, to ensure wide geographic and temporal coverage.

Source: https://dl.acm.org/doi/pdf/10.1145/3626772.3657891

GitHub: https://github.com/DataScienceUIBK/ChroniclingAmericaQA?tab=readme-ov-file

In [3]:
!pip install pyterrier
!pip install pyterrier[java]
!pip install gensim
!pip install --upgrade git+https://github.com/terrierteam/pyterrier_t5.git

Collecting git+https://github.com/terrierteam/pyterrier_t5.git
  Cloning https://github.com/terrierteam/pyterrier_t5.git to /tmp/pip-req-build-54m6nu97
  Running command git clone --filter=blob:none --quiet https://github.com/terrierteam/pyterrier_t5.git /tmp/pip-req-build-54m6nu97
  Resolved https://github.com/terrierteam/pyterrier_t5.git to commit 81f4c49c6541e6ceedffd2e705cf2fe20089c3ae
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [4]:
!curl -L "https://huggingface.co/datasets/Bhawna/ChroniclingAmericaQA/resolve/main/test.json?download=true" -o test.json
!curl -L "https://huggingface.co/datasets/Bhawna/ChroniclingAmericaQA/resolve/main/train.json?download=true" -o train.json
!curl -L "https://huggingface.co/datasets/Bhawna/ChroniclingAmericaQA/resolve/main/dev.json?download=true" -o validation.json

import json

files = ["train.json", "validation.json", "test.json"]

for path in files:
    print(f"\n===== {path} =====")
    try:
        with open(path, "r", encoding="utf-8") as f:
            # Read a few hundred characters to see what kind of JSON it is
            head = f.read(500)
            print("Preview of first 500 characters:\n")
            print(head[:500])
        # Try to load only part of the file
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        if isinstance(data, list):
            print(f"\nLoaded {len(data)} items (list).")
            print("Dictionary keys:", list(data[0].keys()))
            print(json.dumps(data[0], indent=2)[:600])
        elif isinstance(data, dict):
            print("\nTop-level is a dictionary. Keys:", list(data.keys()))
            for k, v in data.items():
                if isinstance(v, list):
                    print(f"Key '{k}' contains a list of {len(v)} items.")
                    if v:
                        print("First item keys:", list(v[0].keys()))
                        print(json.dumps(v[0], indent=2)[:600])
                        break
        else:
            print(f"Unexpected top-level type: {type(data)}")
    except Exception as e:
        print(f"Could not parse {path} as JSON: {e}")

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  1356  100  1356    0     0   6407      0 --:--:-- --:--:-- --:--:--  6426
100 71.5M  100 71.5M    0     0  92.0M      0 --:--:-- --:--:-- --:--:--  184M
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  1348  100  1348    0     0   4310      0 --:--:-- --:--:-- --:--:--  4320
100 1315M  100 1315M    0     0  60.6M      0  0:00:21  0:00:21 --:--:-- 54.1M
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  1350  100  1350    0     0   3071      0 --:--:-- --:--:-- --:--:--  3103
100 71.8M  100 71.8M    0     0  50.1M      0  0:00:01  0:00:01 --:--:--  103M

===== train.json =====
Preview of first 500 charact

# Create the Document Collection

To do that, we create a new json file that contains the 'para_id', 'context', 'raw_ocr', 'publication_date' keys, for all para_id in the collection.

para_id: is the id of a paragraph of a news paper page.

In [5]:
import json
import os

inputs = ["train.json", "validation.json", "test.json"]
output = "document_collection.json"

def load_list_or_empty(path):
    if not os.path.exists(path) or os.path.getsize(path) == 0:
        print(f"Skipping {path} because it is missing or empty")
        return []
    try:
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        if isinstance(data, list):
            return data
        print(f"Skipping {path} because it is not a list at the top level")
        return []
    except json.JSONDecodeError:
        print(f"Skipping {path} because it is not valid JSON")
        return []

def project(recs):
    out = []
    for r in recs:
        out.append({
            "para_id": r.get("para_id", ""),
            "context": r.get("context", ""),
            "raw_ocr": r.get("raw_ocr", ""),
            "publication_date": r.get("publication_date", "")
        })
    return out

all_recs = []
for p in inputs:
    recs = load_list_or_empty(p)
    print(f"Loaded {len(recs)} records from {p}")
    all_recs.extend(project(recs))

# deduplicate by para_id keeping the first one seen
uniq = {}
for rec in all_recs:
    pid = rec.get("para_id", "")
    if pid and pid not in uniq:
        uniq[pid] = rec

result = list(uniq.values())

with open(output, "w", encoding="utf-8") as f:
    json.dump(result, f, ensure_ascii=False, indent=2)

print(f"Wrote {len(result)} records to {output}")
print(json.dumps(result[:3], indent=2))

Loaded 439302 records from train.json
Loaded 24111 records from validation.json
Loaded 24084 records from test.json
Wrote 131921 records to document_collection.json
[
  {
    "para_id": "New_Hampshire_18070804_1",
    "context": "Aiscellaneous Repository. From the Albany Register, WAR, OR A PROSPECT OF IT, From recent instances of British Outrage. BY: WILLIAM RAY, Author of the contemplated publication, entitled, \u201cHorrors of Slavery, or the American Turf in Tripoli,\u201d VOTARIES of Freedom, arm! The British Lion roars! Legions of Valor, take th\u2019 alarm\u2014; Rash, rush to guard our shores! Behold the horrid deed\u2014 Your brethren gasping lie! Beneath a tyrant\u2019s hand they bleed\u2014 They groan\u2014they faint\u2014they die. Veterans of seventy-six, Awake the slumbering sword;\u2014 Hearts of your murderous foes transfix\u2014 'Tis vengeance gives the word. Remember Lexington, And Bunker\u2019s tragic hill; \u201cThe same who spilt your blood thereon, Your blood again

## You should check that the collection you have matches that of the paper!

# Create the Test Queries Data Structure

We keep the first 10.000 queries due to memory errors in the free colab version.

To be comparable, please keep the top 10.000 queries for evaluation.

In [6]:
import json
import re
import unicodedata
import string

input_file = "test.json"
output_file = "test_queries.json"

# Load the data
with open(input_file, "r", encoding="utf-8") as f:
    data = json.load(f)

def clean_question(text):
    if not isinstance(text, str):
        return ""
    text = unicodedata.normalize("NFKC", text)
    text = re.sub(rf"[{re.escape(string.punctuation)}]", " ", text)  # remove punctuation
    text = re.sub(r"\s+", " ", text)  # collapse multiple spaces
    return text.strip()

# Extract and clean
queries = [
    {
        "query_id": item.get("query_id", ""),
        "question": clean_question(item.get("question", "")),
    }
    for item in data
]

# Sort by query_id (assuming numeric)
queries = sorted(queries, key=lambda x: int(x["query_id"]) if str(x["query_id"]).isdigit() else x["query_id"])

# Keep only the first 10,000
queries = queries[:10000]

# Save new JSON
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(queries, f, ensure_ascii=False, indent=2)

print(f"Saved {len(queries)} entries to {output_file}")
print(json.dumps(queries[:3], indent=2))

Saved 10000 entries to test_queries.json
[
  {
    "query_id": "test_1",
    "question": "How many lots did Thomas Peirce have"
  },
  {
    "query_id": "test_10",
    "question": "Who gave Hamilton the substance of what he had proposed on the part of General Hamilton"
  },
  {
    "query_id": "test_100",
    "question": "Who informs his FRIENDS and the PUBLIC that he has taken that justly celebrated INN in this city"
  }
]


# Create the Qrels for the test set

In [7]:
input_file = "test.json"
qrels_file = "test_qrels.json"
answers_file = "test_query_answers.json"

# Load the data
with open(input_file, "r", encoding="utf-8") as f:
    data = json.load(f)

# Build the qrels file: query_id, iteration=0, para_id, relevance=1
qrels = [
    {
        "query_id": item.get("query_id", ""),
        "iteration": 0,
        "para_id": item.get("para_id", ""),
        "relevance": 1
    }
    for item in data
]

# Build the query_answers file: same plus answer and org_answer
query_answers = [
    {
        "query_id": item.get("query_id", ""),
        "iteration": 0,
        "para_id": item.get("para_id", ""),
        "relevance": 1,
        "answer": item.get("answer", ""),
        "org_answer": item.get("org_answer", "")
    }
    for item in data
]

# Save both files
with open(qrels_file, "w", encoding="utf-8") as f:
    json.dump(qrels, f, ensure_ascii=False, indent=2)

with open(answers_file, "w", encoding="utf-8") as f:
    json.dump(query_answers, f, ensure_ascii=False, indent=2)

print(f"Saved {len(qrels)} entries to {qrels_file}")
print(f"Saved {len(query_answers)} entries to {answers_file}")
print("Sample qrels entry:", qrels[0])
print("Sample query_answers entry:", query_answers[0])

Saved 24084 entries to test_qrels.json
Saved 24084 entries to test_query_answers.json
Sample qrels entry: {'query_id': 'test_1', 'iteration': 0, 'para_id': 'New_Hampshire_18030125_16', 'relevance': 1}
Sample query_answers entry: {'query_id': 'test_1', 'iteration': 0, 'para_id': 'New_Hampshire_18030125_16', 'relevance': 1, 'answer': '183', 'org_answer': '183'}


# Retrieval - Good Luck! (YOUR WORK STARTS HERE !)

In [9]:
import pyterrier as pt
from pyterrier.measures import P, MAP, R, nDCG
from pyterrier_t5 import MonoT5ReRanker
import os
import json
import pandas as pd
import shutil

import numpy as np
import pickle
from gensim.models import KeyedVectors

# Create the Index of the collection



In [8]:
#path index folder
folder_name = "Index"
index_file_name = "terrier_inverted_index"
index_path = os.path.abspath(os.path.join(folder_name, index_file_name))

#init pyTer
if not pt.java.started():
    pt.java.init()

def get_index():
    #check if the index exsists
    properties_file = os.path.join(index_path, "data.properties")
    # Forcing a rebuild by removing the index directory if it exists to ensure updated schema.
    if os.path.exists(index_path):
        print(f"Existing index found at {index_path}. Removing to ensure rebuild with updated schema.")
        shutil.rmtree(index_path)

    print("Index is not found or is being rebuilt, creating a new Index")
    os.makedirs(index_path, exist_ok=True)

    with open('document_collection.json', 'r', encoding='utf-8') as f:
        raw_data = json.load(f)

    # Build Data Frame to include docno, context, raw_ocr, and publication_date
    df = pd.DataFrame(raw_data)
    df = df.rename(columns={"para_id": "docno"})[["docno", "context", "raw_ocr", "publication_date"]]

    # PyTerrier's default text_attrs expects a column named 'text' for indexing content.
    # We map 'context' to 'text' for indexing purposes, but keep 'context' as a separate metadata field.
    df['text'] = df['context'] # Create a 'text' column for the primary indexed field

    # Build the index
    indexer = pt.IterDictIndexer(
        index_path,
        # Store docno, context, raw_ocr, and text (which is context) as metadata
        meta={"docno": 24, "context": 4096, "raw_ocr": 4096, "text": 4096}, # Increased text length for safety
        text_attrs=["text"],           # 'text' (our 'context') is the primary field for indexing content
        meta_reverse=["docno"],        # enable reverse lookup on docno
        pretokenised=False,
        fields=False,
        threads=1,
    )
    index_ref = indexer.index(df.to_dict(orient="records"))
    return pt.IndexFactory.of(index_ref)
index = get_index()

# Print a simple summary
print("Index location:", index_path)
print("Indexed documents:", index.getCollectionStatistics().getNumberOfDocuments())

Java started and loaded: pyterrier.java.colab, pyterrier.java, pyterrier.java.24, pyterrier.terrier.java [version=5.11 (build: craig.macdonald 2025-01-13 21:29), helper_version=0.0.8]


Existing index found at /content/Index/terrier_inverted_index. Removing to ensure rebuild with updated schema.
Index is not found or is being rebuilt, creating a new Index
16:53:46.347 [ForkJoinPool-1-worker-1] WARN org.terrier.structures.indexing.Indexer -- Indexed 1 empty documents
Index location: /content/Index/terrier_inverted_index
Indexed documents: 131921


In [7]:
# Retrieve collection statistics
stats = index.getCollectionStatistics()

print("Terrier Collection Statistics")
print("--------------------------------")
print(f"Indexed documents:        {stats.getNumberOfDocuments()}")
print(f"Unique terms (vocabulary): {stats.getNumberOfUniqueTerms()}")
print(f"Total tokens:             {stats.getNumberOfTokens()}")
print(f"Average document length:  {stats.getAverageDocumentLength():.2f}")

Terrier Collection Statistics
--------------------------------
Indexed documents:        131921
Unique terms (vocabulary): 236646
Total tokens:             15575099
Average document length:  118.06


# Set up query expansion with histwords (2 mins)

In [9]:
!wget http://snap.stanford.edu/historical_embeddings/eng-fiction-all_sgns.zip
!unzip eng-fiction-all_sgns.zip
!ls -F

--2026-01-08 15:44:22--  http://snap.stanford.edu/historical_embeddings/eng-fiction-all_sgns.zip
Resolving snap.stanford.edu (snap.stanford.edu)... 171.64.75.80
Connecting to snap.stanford.edu (snap.stanford.edu)|171.64.75.80|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 400120997 (382M) [application/zip]
Saving to: ‘eng-fiction-all_sgns.zip’


2026-01-08 15:44:29 (53.2 MB/s) - ‘eng-fiction-all_sgns.zip’ saved [400120997/400120997]

Archive:  eng-fiction-all_sgns.zip
   creating: sgns/
  inflating: sgns/1990-vocab.pkl     
  inflating: sgns/1970-w.npy         
  inflating: sgns/1810-w.npy         
  inflating: sgns/1840-vocab.pkl     
  inflating: sgns/1920-vocab.pkl     
  inflating: sgns/1950-w.npy         
  inflating: sgns/1980-w.npy         
  inflating: sgns/1950-vocab.pkl     
  inflating: sgns/1830-w.npy         
  inflating: sgns/1830-vocab.pkl     
  inflating: sgns/1880-vocab.pkl     
  inflating: sgns/1850-w.npy         
  inflating: sgns/1930-w.n

In [10]:
def load_histwords_colab(year):
    path_vocab = f"sgns/{year}-vocab.pkl"
    path_vectors = f"sgns/{year}-w.npy"

    print(f"Loading {year} data...")

    with open(path_vocab, 'rb') as f:
        vocab = pickle.load(f)

    vectors = np.load(path_vectors)

    kv = KeyedVectors(vector_size=vectors.shape[1])
    kv.add_vectors(vocab, vectors)

    print(f"Success! Model for {year} loaded with {len(vocab)} words.")
    return kv

# Charger la décennie souhaitée
hist_model = load_histwords_colab(1850)

Loading 1850 data...
Success! Model for 1850 loaded with 100000 words.


In [11]:
word = "gay"
if word in hist_model:
    print(f"Synonymes en 1850 pour '{word}':")
    print(hist_model.most_similar(word, topn=5))

Synonymes en 1850 pour 'gay':
[('brilliant', 0.7579882144927979), ('lively', 0.7548628449440002), ('graceful', 0.6951406598091125), ('cheerful', 0.6921846866607666), ('elegant', 0.6730675101280212)]


  dists = dot(self.vectors[clip_start:clip_end], mean) / self.norms[clip_start:clip_end]


In [12]:
def hist_expansion_logic(row):
  n_words = 3
  confidence=0.5
  original_query = row['query']
  words = original_query.lower().replace('?', '').split()
  expanded_terms = list(words)

  for word in words:
    if word in hist_model:
      similars = hist_model.most_similar(word, topn=n_words)
      for sim_word, score in similars:
        if score > confidence:
          if sim_word not in expanded_terms:
            expanded_terms.append(sim_word)

    return " ".join(expanded_terms)

hist_qe = pt.apply.query(hist_expansion_logic)

# 1st stage retrieval (skeleton code)

In [26]:
# First stage retriver with BM25
bm25_1st_stage_retrieval = pt.terrier.Retriever(index, wmodel="BM25")%50
tf_idf_1st_stage_retrieval = pt.terrier.Retriever(index, wmodel="TF_IDF")%50
bm25_hist_qe_1st_stage_retrieval = hist_qe >> bm25_1st_stage_retrieval

In [14]:
#testing the engine
query = "Europe"
bm25_results = bm25_1st_stage_retrieval.search(query)
tf_idf_results = tf_idf_1st_stage_retrieval.search(query)
bm25_hist_qe_results = bm25_hist_qe_1st_stage_retrieval.search(query)
#show first 10 results
print('-------------------- bm25 results (top 100) --------------------\n',bm25_results.head(10))
print('-------------------- tf_idf results (top 100) --------------------\n',tf_idf_results.head(10))
print('-------------------- bm25_hist_qe results (top 100) --------------------\n',bm25_hist_qe_results.head(10))

# Convert queries and qrels to pandas DataFrames
queries_df = pd.DataFrame(queries)
qrels_df = pd.DataFrame(qrels)

# Rename columns to match PyTerrier's expected names for evaluation
queries_df = queries_df.rename(columns={"query_id": "qid", "question": "query"})
qrels_df = qrels_df.rename(columns={"query_id": "qid", "para_id": "docno"})

# WARNING taking the whole queries will take a long time
# --- Use a subset of queries for faster computation,  ---
num_queries_subset = 100 # You can adjust this number

queries_subset_df = queries_df.head(num_queries_subset)
qrels_subset_df = qrels_df[qrels_df['qid'].isin(queries_subset_df['qid'])].copy()

experiment_results = pt.Experiment(
    [bm25_1st_stage_retrieval, tf_idf_1st_stage_retrieval, bm25_hist_qe_1st_stage_retrieval],
    queries_subset_df, # Use the subset of queries
    qrels_subset_df,   # Use the filtered qrels for the subset
    [P@1, P@5, P@10, R@5, R@10, nDCG@5, nDCG@10, MAP],
    names=["BM25","TF-IDF","BM25 Query Expansion histwords"]
)

display(experiment_results)

  dists = dot(self.vectors[clip_start:clip_end], mean) / self.norms[clip_start:clip_end]


-------------------- bm25 results (top 100) --------------------
   qid   docid                     docno  rank      score   query
0   1  122024      New_York_19190610_27     0  10.233243  Europe
1   1   61883         Texas_18771215_13     1   9.759719  Europe
2   1    9637      Maryland_18401110_20     2   9.730148  Europe
3   1  110750      Nebraska_19020225_26     3   9.715430  Europe
4   1   36539  Rhode_Island_18511030_25     4   9.642500  Europe
5   1   47808      Illinois_18680907_40     5   9.578485  Europe
6   1  100207         Kansas_18970902_5     6   9.570657  Europe
7   1   65964     Tennessee_18740915_14     7   9.556417  Europe
8   1   29371        Hawaii_18520522_33     8   9.509314  Europe
9   1   38017     Tennessee_18590726_13     9   9.499877  Europe
-------------------- tf_idf results (top 100) --------------------
   qid   docid                     docno  rank     score   query
0   1  122024      New_York_19190610_27     0  5.641964  Europe
1   1   61883         T

  dists = dot(self.vectors[clip_start:clip_end], mean) / self.norms[clip_start:clip_end]
  result = self.vectors[index] / self.norms[index]


Unnamed: 0,name,AP,P@1,P@5,P@10,R@5,R@10,nDCG@5,nDCG@10
0,BM25,0.571993,0.52,0.128,0.071,0.64,0.71,0.578709,0.602405
1,TF-IDF,0.576489,0.52,0.126,0.07,0.63,0.7,0.579284,0.602801
2,BM25 Query Expansion histwords,0.530604,0.49,0.114,0.062,0.57,0.62,0.531487,0.547726


In [None]:
import pyterrier as pt

# Let's take the BM25 results for the query "Europe" as an example
# (Assuming bm25_results DataFrame is available from earlier execution)

# Retrieve the 'context' (cleaned text) for the top results
results_with_context = pt.text.get_text(index, 'context')(bm25_results)
print("\n--- BM25 Results with 'context' (cleaned text) ---\n")
display(results_with_context[['docno', 'score', 'context']].head())

# Retrieve the 'raw_ocr' for the top results
results_with_raw_ocr = pt.text.get_text(index, 'raw_ocr')(bm25_results)
print("\n--- BM25 Results with 'raw_ocr' ---\n")
display(results_with_raw_ocr[['docno', 'score', 'raw_ocr']].head())

MESSAGE: THE CODE BELOW IS SUPPOSED TO RETRIEVE THE WHOLE QUERIES BUT BECAUSE OF COMPUTATION SPEED, IT DIDN'T GIVE A RESULT IN A REASONABLE TIME

In [None]:
"""
experiment_results = pt.Experiment(
    [bm25_1st_stage_retrieval, tf_idf_1st_stage_retrieval, bm25_hist_qe_1st_stage_retrieval],
    queries_df, # Use all the queries
    qrels_df,   # Use all the qrels
    [P@1, P@5, P@10, R@5, R@10, nDCG@5, nDCG@10, MAP],
    names=["BM25","TF-IDF","BM25 Query Expansion histwords"]
)
"""

DISPLAY and SAVE the results to avoid 22 minutes of waiting

In [None]:
display(experiment_results)
experiment_results.to_csv("experiment_results_all_queries.csv", index=False)

# 2nd stage retrieval (ISSUE, haven't done yet ! Lucas works on it !)

In [25]:
# Initialize MonoT5ReRanker
monoT5_reranker = MonoT5ReRanker(batch_size=8)

monoT5_pipeline = bm25_1st_stage_retrieval >> pt.text.get_text(index, 'text') >> monoT5_reranker

query = "Europe"
monoT5_results = monoT5_pipeline.search(query)
print('-------------------- MonoT5 Re-ranking results (top 10) --------------------\n', monoT5_results.head(10))

monoT5:   0%|          | 0/125 [00:00<?, ?batches/s]

-------------------- MonoT5 Re-ranking results (top 10) --------------------
     qid   docid                    docno   query  \
389   1   21910  District_of_Columbia_18  Europe   
27    1    8116        Hawaii_18401205_5  Europe   
115   1   31296      Nebraska_18570409_9  Europe   
693   1  110274    Minnesota_19020810_22  Europe   
23    1   21894  District_of_Columbia_18  Europe   
663   1     981      Virginia_18081004_8  Europe   
226   1  105741         Iowa_19021205_10  Europe   
428   1   71455    Louisiana_18800722_19  Europe   
100   1  130103  North_Carolina_18230319  Europe   
212   1   54439    Tennessee_18701104_16  Europe   

                                                  text     score  rank  
389  Prussia is gradually settling her already made... -0.364338     0  
27   The emancipation of Europe from the detestable... -0.729331     1  
115  Public opinion waits for the first acts of the... -0.769288     2  
693  In the far East, in the capital of Korea, the ... -1

In [16]:
# Prepare queries_df and qrels_df as before (assuming 'queries' and 'qrels' are defined globally)
queries_df = pd.DataFrame(queries)
qrels_df = pd.DataFrame(qrels)

queries_df = queries_df.rename(columns={"query_id": "qid", "question": "query"})
qrels_df = qrels_df.rename(columns={"query_id": "qid", "para_id": "docno"})

num_queries_subset = 100

queries_subset_df = queries_df.head(num_queries_subset)
qrels_subset_df = qrels_df[qrels_df['qid'].isin(queries_subset_df['qid'])].copy()

experiment_results_monoT5 = pt.Experiment(
    [monoT5_pipeline],
    queries_subset_df, # Use the subset of queries
    qrels_subset_df,   # Use the filtered qrels for the subset
    [P@1, P@5, P@10, R@5, R@10, nDCG@5, nDCG@10, MAP],
    names=["BM25_MonoT5_Reranked"]
)

display(experiment_results_monoT5)

monoT5:   0%|          | 0/625 [00:00<?, ?batches/s]

Unnamed: 0,name,AP,P@1,P@5,P@10,R@5,R@10,nDCG@5,nDCG@10
0,BM25_MonoT5_Reranked,0.692594,0.67,0.144,0.072,0.72,0.72,0.697619,0.697619


In [19]:
"""experiment_results_monoT5_E1 = pt.Experiment(
    [monoT5_pipeline],
    queries_df, # Use the subset of queries
    qrels_df,   # Use the filtered qrels for the subset
    [P@1, P@5, P@10, R@5, R@10, nDCG@5, nDCG@10, MAP],
    names=["BM25_MonoT5_Reranked"]
)

display(experiment_results_monoT5_E1)
experiment_results_monoT5_E1.to_csv("experiment_results_all_queries.csv", index=False)"""

monoT5:   0%|          | 0/62487 [00:00<?, ?batches/s]

KeyboardInterrupt: 

### Using ASPIRE (TEMPORARY)


PyTerrier : https://pyterrier.readthedocs.io/en/latest/experiments.html

ASPIRE https://github.com/GiorgosPeikos/ASPIRE

ONLINE: https://aspire-ir-eval.streamlit.app/



Remember the res files are actually the rankings created by our retrieval model!

We downloading it to use another tool for data analysis. The tool is called ASPIRE.

ASPIRE requires a different input format!

In [None]:
import csv
import pandas as pd # Ensure pandas is imported
import xml.etree.ElementTree as ET
from xml.dom import minidom

# ----- RESULTS (TF-IDF, BM25, etc.) -----
def save_res(df, run_name, out_path, sep="\t"):
    res = df.copy()  # keep original untouched
    if "docid" in res.columns and "docno" not in res.columns:
        res = res.rename(columns={"docid": "docno"})  # normalize doc id
    # The 'query' column is often added by pyterrier for inspection but not needed in output
    res = res.drop(columns=["query", "query_0"], errors="ignore") # remove query text, keep qid
    res["iteration"] = "Q0"  # fixed iteration value
    res["experiment_id"] = run_name  # label for the run
    res = res.rename(columns={"qid": "query_id", "docno": "doc_id"})  # final names
    # Ensure score is numeric for sorting, if needed, though pyterrier usually keeps it
    res["score"] = pd.to_numeric(res["score"])
    # Sort by score for ranking within each query_id
    res = res.sort_values(by=["query_id", "score"], ascending=[True, False])
    res["rank"] = res.groupby("query_id")["score"].rank(ascending=False, method="first").astype(int)
    res = res[["query_id", "iteration", "doc_id", "rank", "score", "experiment_id"]]  # correct column order
    res.to_csv(out_path, sep=sep, index=False, header=False)  # write file


# Generate full retrieval results for all queries
# Assuming 'bm25', 'tf_idf', 'bm25_hist_qe', 'queries_df' are available from previous cells
full_bm25_results = bm25.transform(queries_df)
full_tf_idf_results = tf_idf.transform(queries_df)
full_bm25_hist_qe_results = bm25_hist_qe.transform(queries_df)

# Save the results to CSV files for ASPIRE
save_res(full_bm25_results, "BM25", "results_bm25.csv")
save_res(full_tf_idf_results, "TF-IDF", "results_tf_idf.csv")
save_res(full_bm25_hist_qe_results, "BM25_HistQE", "results_bm25_hist_qe.csv")

print("Generated results CSVs for BM25, TF-IDF, and BM25 with historical QE.")

# ----- QUERIES -----
# root with task attribute
# Renaming 'qid' to 'query_id' for consistency if needed, but XML expects 'number' attribute
root = ET.Element("topics", attrib={"task": "Chronicling America QA"})  # create root element

# add one <topic number=\"...\"> per row
for _, row in queries_df.iterrows():  # iterate over dataframe rows (using queries_df)
    topic = ET.SubElement(root, "topic", attrib={"number": str(row["qid"])})  # topic node
    topic.text = str(row["query"])  # topic content

# pretty print
rough_xml = ET.tostring(root, encoding="utf-8")  # raw bytes
pretty_xml = minidom.parseString(rough_xml).toprettyxml(indent="  ", encoding="utf-8")  # formatted bytes

# write to file
with open("queries.xml", "wb") as f:  # save xml with declaration
    f.write(pretty_xml)

print("Generated queries.xml file.")

# ----- QRELS -----
qrels_out = qrels_df.copy() # Use qrels_df which is available in kernel state
qrels_out = qrels_out.drop(columns=["iteration"], errors="ignore") # iteration from qrels_df is int, ASPIRE wants string 'Q0'
if "docid" in qrels_out.columns and "docno" not in qrels_out.columns:
    qrels_out = qrels_out.rename(columns={"docid": "docno"})
qrels_out = qrels_out.rename(columns={"qid": "query_id", "docno": "doc_id"})
qrels_out["iteration"] = "0" # Ensure '0' string for ASPIRE format
qrels_out = qrels_out[["query_id", "iteration", "doc_id", "relevance"]]
qrels_out.to_csv("qrels.csv", sep=" ", index=False, header=False)

print("Generated qrels.csv file.")

print("All files generated for ASPIRE compatibility.")

In [None]:
# To use ASPIRE, we need to download the res files, the qrels and queries in the
# required dataformat and use the interface or use it locally.

# Res file Expected format: query_id, iteration (i.e. Q0), doc_id, rank, score, experiment_id
# Queries Expected format: query_id, query
# Qrels Expected format: query_id, iteration (i.e. Q0), doc_id, relevance (without header row)

# DEMO TIME!