In [1]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
import numpy as np
from rank_bm25 import BM25Okapi
from sklearn.metrics import precision_score
from collections import Counter

# NLTK downloads
nltk.download('punkt_tab')
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Tanvi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Tanvi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Tanvi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
try:
    df = pd.read_csv('cleaned_prediction_dates.csv')
except FileNotFoundError:
    print("Error: 'output.csv' not found. Please check the file path.")
    exit(1)

ps = PorterStemmer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    if pd.isna(text) or not isinstance(text, str):
        return []
    try:
        tokens = word_tokenize(text.lower())
        processed_tokens = []
        for token in tokens:
            if token in stop_words:
                continue
            if re.match(r'^\d{4}$', token) or re.match(r'^\d+(st|nd|rd|th)$', token):
                processed_tokens.append(token)
            elif token.isalpha():
                processed_tokens.append(ps.stem(token))
        return processed_tokens
    except Exception as e:
        print(f"Error processing text: {text[:50]}... Error: {e}")
        return []

In [3]:
df['processed_text'] = df['text'].apply(preprocess_text)

# Debug: Inspect data
print("\nData Overview:")
print(df[['topic_id', 'result_id', 'rel']].head(10))
print("\nRel column distribution:")
print(df['rel'].value_counts(dropna=False))


Data Overview:
                                            topic_id  \
0  https://guides.loc.gov/chronicling-america-190...   
1  https://guides.loc.gov/chronicling-america-190...   
2  https://guides.loc.gov/chronicling-america-190...   
3  https://guides.loc.gov/chronicling-america-190...   
4  https://guides.loc.gov/chronicling-america-190...   
5  https://guides.loc.gov/chronicling-america-190...   
6  https://guides.loc.gov/chronicling-america-190...   
7  https://guides.loc.gov/chronicling-america-190...   
8  https://guides.loc.gov/chronicling-america-190...   
9  https://guides.loc.gov/chronicling-america-190...   

                                           result_id  rel  
0  https://www.loc.gov/resource/sn87093407/1904-0...    1  
1  https://www.loc.gov/resource/sn99063957/1904-0...    1  
2  https://www.loc.gov/resource/sn85052116/1904-0...    1  
3  https://www.loc.gov/resource/sn99063957/1904-0...    1  
4  https://www.loc.gov/resource/sn85066387/1904-0...    1  
5  http

In [4]:
# Organize relevance judgments
rel_judgments = df.groupby('topic_id')[['result_id', 'rel']].apply(
    lambda x: list(zip(x['result_id'].astype(str).str.strip(), x['rel'].astype(int)))
).to_dict()

print("\nRelevance Judgments:")
for topic, judgments in rel_judgments.items():
    relevant_count = sum(1 for _, rel in judgments if rel == 1)
    print(f"Topic: {topic}, Judgments: {len(judgments)}, Relevant: {relevant_count}")
    if judgments:
        print(f"Sample result_ids: {[res_id for res_id, _ in judgments[:2]]}")


Relevance Judgments:
Topic: https://guides.loc.gov/chronicling-america-16th-amendment, Judgments: 15, Relevant: 10
Sample result_ids: ['https://www.loc.gov/resource/sn83040198/1909-10-01/ed-1/?sp=5&q=sixteenth+amendment', 'https://www.loc.gov/resource/sn84020558/1912-03-21/ed-1/?sp=1&q=sixteeth+amendment+income+tax']
Topic: https://guides.loc.gov/chronicling-america-1904-presidential-election, Judgments: 15, Relevant: 9
Sample result_ids: ['https://www.loc.gov/resource/sn87093407/1904-06-21/ed-1/?sp=1&q=Republican+Convention+Theodore+Roosevelt&st=text&r=-0.06,0.171,0.632,0.632,0', 'https://www.loc.gov/resource/sn99063957/1904-06-21/ed-1/?sp=1&q=Republican+National+Convention+Vice+President+Fairbanks+Roosevelt+Delegations']
Topic: https://guides.loc.gov/chronicling-america-19th-amendment, Judgments: 12, Relevant: 5
Sample result_ids: ['https://www.loc.gov/resource/sn85058130/1890-03-27/ed-1/?sp=1&q=Discussing+suffrage', 'https://www.loc.gov/resource/sn85066387/1895-05-21/ed-1/?sp=4&q=W

In [5]:
# Initialize BM25
corpus = df['processed_text'].tolist()
result_ids = df['result_id'].astype(str).str.strip().tolist()
bm25 = BM25Okapi(corpus)

def retrieve_bm25(query, top_k=10):
    query_tokens = preprocess_text(query)
    if not query_tokens:
        return []
    scores = bm25.get_scores(query_tokens)
    top_indices = np.argsort(scores)[::-1][:top_k]
    return [
        (result_ids[idx], scores[idx], corpus[idx])
        for idx in top_indices
    ]

def pseudo_relevance_feedback(query, initial_results, top_k=10, feedback_docs=5, terms_to_add=5):
    """
    Apply PRF by extracting terms from top feedback_docs and expanding the query.
    """
    if not initial_results:
        return initial_results
    # Get top feedback_docs
    feedback_indices = [idx for idx, _ in enumerate(initial_results[:feedback_docs])]
    # Collect all tokens from these documents
    feedback_tokens = []
    for _, _, doc_tokens in initial_results[:feedback_docs]:
        feedback_tokens.extend(doc_tokens)
    # Count term frequencies
    term_counts = Counter(feedback_tokens)
    # Exclude original query tokens to avoid redundancy
    query_tokens = preprocess_text(query)
    new_terms = [
        term for term, _ in term_counts.most_common()
        if term not in query_tokens
    ][:terms_to_add]
    # Expand query
    expanded_query = query + ' ' + ' '.join(new_terms)
    print(f"PRF Expanded Query: {expanded_query}")
    # Re-run retrieval
    return retrieve_bm25(expanded_query, top_k)

def expand_query(query, topic_id):
    """
    Explicit query expansion with topic-specific terms.
    """
    expansions = {
        'chinese-exclusion-act': '1882 immigration bill',
        'mona-lisa': 'leonardo painting',
        'statue-of-liberty': '1886 france bartholdi',
        'mothers-day': '1910 wilson',
        'electric-chair': '1890 execution',
        'yoga': 'meditation exercise',
        'motorcycle-mania': 'bike race',
        'female-pilots': 'aviation women',
        'league-of-nations': '1919 wilson treaty',
        'ping-pong-craze': 'table tennis',
        'ouija-board': 'spiritualism board'
    }
    return query + ' ' + expansions.get(topic_id, '')

def evaluate_retrieval(query, retrieved, rel_judgments, k=10):
    topic_id = query.replace(' ', '-')
    relevant = set(res_id for res_id, rel in rel_judgments.get(topic_id, []) if rel == 1)
    print(f"\nTopic: {topic_id}, Relevant result_ids: {relevant}")
    if not relevant:
        print(f"No relevant documents for {topic_id}")
        return 0.0, 0.0
    retrieved_k = [res_id for res_id, _, _ in retrieved[:k]]
    print(f"Retrieved result_ids: {retrieved_k}")
    y_true = [1 if res_id in relevant else 0 for res_id in retrieved_k]
    y_pred = [1] * len(retrieved_k)
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = sum(y_true) / len(relevant) if relevant else 0.0
    print(f"Matches: {sum(y_true)} out of {len(relevant)} relevant")
    return precision, recall

# Main evaluation: PRF with and without query expansion
topics = df['topic_id'].unique().tolist()
results = {
    'baseline': {'precisions': [], 'recalls': []},
    'prf': {'precisions': [], 'recalls': []},
    'prf_with_expansion': {'precisions': [], 'recalls': []}
}

for topic in topics:
    query = topic.replace('-', ' ')
    topic_id = topic
    print(f"\n\nEvaluating Topic: {topic}")

    # 1. Baseline BM25
    print("\n--- Baseline BM25 ---")
    baseline_results = retrieve_bm25(query, top_k=10)
    p_base, r_base = evaluate_retrieval(query, baseline_results, rel_judgments, k=10)
    results['baseline']['precisions'].append(p_base)
    results['baseline']['recalls'].append(r_base)
    print(f"Baseline Precision@10: {p_base:.3f}, Recall@10: {r_base:.3f}")

    # 2. PRF without query expansion
    print("\n--- PRF without Query Expansion ---")
    prf_results = pseudo_relevance_feedback(query, baseline_results, top_k=10, feedback_docs=5, terms_to_add=5)
    p_prf, r_prf = evaluate_retrieval(query, prf_results, rel_judgments, k=10)
    results['prf']['precisions'].append(p_prf)
    results['prf']['recalls'].append(r_prf)
    print(f"PRF Precision@10: {p_prf:.3f}, Recall@10: {r_prf:.3f}")

    # 3. PRF with query expansion
    print("\n--- PRF with Query Expansion ---")
    expanded_query = expand_query(query, topic_id)
    print(f"Initial Expanded Query: {expanded_query}")
    initial_results = retrieve_bm25(expanded_query, top_k=10)
    prf_exp_results = pseudo_relevance_feedback(expanded_query, initial_results, top_k=10, feedback_docs=5, terms_to_add=5)
    p_prf_exp, r_prf_exp = evaluate_retrieval(query, prf_exp_results, rel_judgments, k=10)
    results['prf_with_expansion']['precisions'].append(p_prf_exp)
    results['prf_with_expansion']['recalls'].append(r_prf_exp)
    print(f"PRF+Expansion Precision@10: {p_prf_exp:.3f}, Recall@10: {r_prf_exp:.3f}")

# Summarize results
print("\n\nFinal Results:")
for method in results:
    avg_p = np.mean(results[method]['precisions'])
    avg_r = np.mean(results[method]['recalls'])
    print(f"{method.capitalize()} - Avg Precision@10: {avg_p:.3f}, Avg Recall@10: {avg_r:.3f}")




Evaluating Topic: https://guides.loc.gov/chronicling-america-1904-presidential-election

--- Baseline BM25 ---

Topic: https://guides.loc.gov/chronicling-america-1904-presidential-election, Relevant result_ids: {'https://www.loc.gov/resource/sn87093407/1904-06-21/ed-1/?sp=1&q=Republican+Convention+Theodore+Roosevelt&st=text&r=-0.06,0.171,0.632,0.632,0', 'https://www.loc.gov/resource/sn99063957/1904-06-21/ed-1/?sp=1&q=Republican+National+Convention+Vice+President+Fairbanks+Roosevelt+Delegations', 'https://www.loc.gov/resource/sn82014248/1904-09-05/ed-1/?sp=4&q=roosevelt+elected&st=text&r=0.327,0.385,0.41,1.053,0', 'https://www.loc.gov/resource/sn85066387/1904-06-24/ed-1/?sp=1&q=Republican+Convention+Roosevelt+Fairbanks+Running+Mate+President#', 'https://www.loc.gov/resource/sn85052116/1904-06-23/ed-1/?sp=1&q=Republican+Republicans+Convention+Roosevelt+Fairbanks', 'https://www.loc.gov/resource/sn83030214/1904-06-24/ed-1/?sp=1&q=Roosevelt+Fairbanks+Republican+Convention+Vice+President',

In [6]:
output = []
for topic in topics:
    query = topic.replace('-', ' ')
    for method in ['baseline', 'prf', 'prf_with_expansion']:
        if method == 'baseline':
            res = retrieve_bm25(query, top_k=10)
        elif method == 'prf':
            res = pseudo_relevance_feedback(query, retrieve_bm25(query, top_k=10), top_k=10)
        else:
            expanded_query = expand_query(query, topic)
            res = pseudo_relevance_feedback(expanded_query, retrieve_bm25(expanded_query, top_k=10), top_k=10)
        for result_id, score, _ in res:
            output.append({
                'topic_id': topic,
                'method': method,
                'result_id': result_id,
                'score': score
            })
output_df = pd.DataFrame(output)
output_df.to_csv('retrieval_results_prf.csv', index=False)
print("\nResults saved to 'retrieval_results_prf.csv'")

PRF Expanded Query: https://guides.loc.gov/chronicling america 1904 presidential election state amend nation june vote
PRF Expanded Query: https://guides.loc.gov/chronicling america 1904 presidential election  state amend nation june vote
PRF Expanded Query: https://guides.loc.gov/chronicling america early women aviation/selected articles shall leagu state council execut
PRF Expanded Query: https://guides.loc.gov/chronicling america early women aviation/selected articles  shall leagu state council execut
PRF Expanded Query: https://guides.loc.gov/chronicling america clara h barton work cross war red ot
PRF Expanded Query: https://guides.loc.gov/chronicling america clara h barton  work cross war red ot
PRF Expanded Query: https://guides.loc.gov/chronicling america boston subway tho street tunnel car station
PRF Expanded Query: https://guides.loc.gov/chronicling america boston subway  tho street tunnel car station
PRF Expanded Query: https://guides.loc.gov/chronicling america building ti

In [7]:
df

Unnamed: 0,topic_id,result_id,rel,text,time,date,event,predicted_Date,predicted_date,extracted_dates,processed_text
0,https://guides.loc.gov/chronicling-america-190...,https://www.loc.gov/resource/sn87093407/1904-0...,1,REPUBLICAN CONVENTION GETS DOWN TO BUSINESS O....,"June 21, 1904 : The Republican National Conven...",21-06-1904,The Republican National Convention begins in C...,,The exact date mentioned in the text is:\r\n\r...,['June 11'],"[republican, convent, get, busi, leader, organ..."
1,https://guides.loc.gov/chronicling-america-190...,https://www.loc.gov/resource/sn99063957/1904-0...,1,NATIONAL REPUBLICAN CONVENTION Fairbanks for V...,"June 21, 1904 : The Republican National Conven...",21-06-1904,The Republican National Convention begins in C...,,June 21,['June 21'],"[nation, republican, convent, fairbank, vice, ..."
2,https://guides.loc.gov/chronicling-america-190...,https://www.loc.gov/resource/sn85052116/1904-0...,1,Convention Hall Chicago Juno 23 This the third...,"June 21, 1904 : The Republican National Conven...",21-06-1904,The Republican National Convention begins in C...,,"June 23, 1904","['June 23', '1904']","[convent, hall, chicago, juno, third, sad, hut..."
3,https://guides.loc.gov/chronicling-america-190...,https://www.loc.gov/resource/sn99063957/1904-0...,1,ROOSEVELT AND FAIRBANKS Republican National Co...,"June 23, 1904 : Theodore Roosevelt is nominate...",23-06-1904,Theodore Roosevelt is nominated for the Presid...,,"June 20, 1904","['June 20', '1904']","[roosevelt, fairbank, republican, nation, conv..."
4,https://guides.loc.gov/chronicling-america-190...,https://www.loc.gov/resource/sn85066387/1904-0...,1,ROOSEVELT WILL LEAD PARTY TO VICTORY; HIS RUNN...,"June 23, 1904 : Charles Fairbanks is named as ...",23-06-1904,Charles Fairbanks is named as his Vice Preside...,,The exact date of the major event being discus...,['June 13'],"[roosevelt, lead, parti, victori, run, mate, p..."
...,...,...,...,...,...,...,...,...,...,...,...
434,https://guides.loc.gov/chronicling-america-yoga,https://chroniclingamerica.loc.gov/lccn/sn8506...,1,A PAGE HOW YOUNG GIRLS STUDY THE FOR HINDOO ME...,MISSING,,,,No timeline found.,[],"[page, young, girl, studi, hindoo, method, res..."
435,https://guides.loc.gov/chronicling-america-yoga,https://www.loc.gov/resource/sn83016209/1961-0...,1,POsition Is Everything in Life to Yogi Practit...,MISSING,,,,2000 years,['2000'],"[posit, everyth, life, yogi, practition, tiwar..."
436,https://guides.loc.gov/chronicling-america-yoga,https://www.loc.gov/resource/sn86002403/1951-1...,1,¿Qué Es El Yoga? Yoga es una palabra sánscrita...,MISSING,,,,No timeline found.,[],"[es, el, yoga, yoga, es, una, palabra, sánscri..."
437,https://guides.loc.gov/chronicling-america-yoga,https://www.loc.gov/resource/sn82001257/1961-0...,1,Con la Práctica del Yoga se Puede Vivir Muchos...,MISSING,,,,"2,000 años",[],"[con, la, práctica, del, yoga, se, pued, vivir..."


In [8]:
import ast

In [9]:
ast.literal_eval(df['extracted_dates'][0])

['June 11']

In [10]:
time = []
for i in range(0, len(output_df)):
    topic_id = output_df['topic_id'][i]
    cur_time = ""
    for j in range(0, len(df)):
        if (topic_id) == df['topic_id'][j]:
            cur_time_list = ast.literal_eval(df['extracted_dates'][j])
            if (cur_time_list != []):
                cur_time = cur_time_list[0]
            else:
                cur_time = "No timeline found"
            break
    time.append(cur_time)

In [11]:
query

'https://guides.loc.gov/chronicling america yoga'

In [12]:
output_df['predicted_time'] = time

In [13]:
categories = output_df['predicted_time'].unique()

In [14]:
output_df_sorted = output_df.sort_values(by = 'score', ascending = False).reset_index().drop('index', axis = 1)

In [15]:
output_df_sorted

Unnamed: 0,topic_id,method,result_id,score,predicted_time
0,https://guides.loc.gov/chronicling-america-pin...,prf,https://www.loc.gov/resource/sn82016357/1902-0...,47.641696,No timeline found
1,https://guides.loc.gov/chronicling-america-pin...,prf_with_expansion,https://www.loc.gov/resource/sn82016357/1902-0...,47.641696,No timeline found
2,https://guides.loc.gov/chronicling-america-the...,prf_with_expansion,https://chroniclingamerica.loc.gov/lccn/sn8304...,42.867909,No timeline found
3,https://guides.loc.gov/chronicling-america-the...,prf,https://chroniclingamerica.loc.gov/lccn/sn8304...,42.867909,No timeline found
4,https://guides.loc.gov/chronicling-america-the...,prf_with_expansion,https://chroniclingamerica.loc.gov/lccn/sn9902...,42.256602,No timeline found
...,...,...,...,...,...
955,https://guides.loc.gov/chronicling-america-wwi...,baseline,https://www.loc.gov/resource/sn86099906/1903-0...,4.116640,No timeline found
956,https://guides.loc.gov/chronicling-america-wwi...,baseline,https://www.loc.gov/resource/sn88085620/1910-1...,3.862804,No timeline found
957,https://guides.loc.gov/chronicling-america-har...,baseline,https://www.loc.gov/resource/sn82014424/1881-1...,3.502106,No timeline found
958,https://guides.loc.gov/chronicling-america-yoga,baseline,https://www.loc.gov/resource/sn82014424/1881-1...,3.502106,No timeline found


In [16]:
categories_dict = {}
for i in (output_df_sorted['predicted_time'].value_counts()).keys():
    categories_dict[i] = 0

In [17]:
results = categories_dict.copy()
for i in range(0, len(output_df_sorted)):
    if output_df_sorted['predicted_time'][i] == "No timeline found":
        time = "No timeline found" 
    else:
        time = (output_df_sorted['predicted_time'][i])  
    time = str(time)
    if (categories_dict[time] < 5): #### THIS CAN BE CHANGED TO REFLECT THE NUMBER OF RESULTS WE WANT TO SEE
        if (results[time] == 0):
            results[time] = [output_df_sorted['result_id'][i]]
        else:
            results[time].append(output_df_sorted['result_id'][i])
        categories_dict[time] += 1

In [18]:
categories_dict

{'No timeline found': 5,
 '1902': 5,
 'June 21': 5,
 '1793': 5,
 '1334': 5,
 'July 1882': 5,
 '1870': 5,
 '1932': 5,
 '1876': 5,
 '1879': 5,
 '1917': 5,
 'March 31': 5,
 'June 11': 5,
 'May 30': 5,
 '1904': 5}

In [77]:
results

{'No timeline found': ['https://www.loc.gov/resource/sn82016357/1902-05-03/ed-1/?sp=7&q=PING-PONG+ping+pong+craze&r=-1.345,-0.063,3.69,1.674,0',
  'https://www.loc.gov/resource/sn82016357/1902-05-03/ed-1/?sp=7&q=PING-PONG+ping+pong+craze&r=-1.345,-0.063,3.69,1.674,0',
  'https://chroniclingamerica.loc.gov/lccn/sn83045462/1914-03-20/ed-1/seq-10/#words=LISA+Lisa+Mona+MONA',
  'https://chroniclingamerica.loc.gov/lccn/sn83045462/1914-03-20/ed-1/seq-10/#words=LISA+Lisa+Mona+MONA',
  'https://chroniclingamerica.loc.gov/lccn/sn99021999/1913-12-14/ed-1/seq-4/#words=Lisa+Mona'],
 '1902': ['https://www.loc.gov/resource/sn86063381/1921-04-05/ed-1/?sp=2&q=Curie+Marie+radium',
  'https://www.loc.gov/resource/sn86063381/1921-04-05/ed-1/?sp=2&q=Curie+Marie+radium',
  'https://www.loc.gov/resource/sn84026749/1921-03-13/ed-1/?sp=1&q=Curie+Marie',
  'https://www.loc.gov/resource/sn84026749/1921-03-13/ed-1/?sp=1&q=Curie+Marie',
  'https://www.loc.gov/resource/sn85066387/1911-11-08/ed-1/?sp=1&q=Curie+Mari