In [1]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
import numpy as np
from rank_bm25 import BM25Okapi
from sklearn.metrics import precision_score
from collections import Counter
import ast

# NLTK downloads: fetch the 'punkt_tab' / 'punkt' tokenizer data and the
# 'stopwords' corpus. Presumably these back word_tokenize() and
# stopwords.words('english') used in later cells — TODO confirm which of the
# two punkt packages the installed NLTK version actually needs.
nltk.download('punkt_tab')
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Tanvi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Tanvi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Tanvi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Load the labelled corpus. The path lives in one variable so the error
# message always names the file that was actually requested (the previous
# message referred to 'output.csv', which is not the file being read).
data_path = 'cleaned_prediction_dates.csv'
try:
    df = pd.read_csv(data_path)
except FileNotFoundError:
    print(f"Error: '{data_path}' not found. Please check the file path.")
    exit(1)

# Shared NLP resources used by preprocess_text().
ps = PorterStemmer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """
    Turn raw text into the token list used for BM25 indexing and querying.

    Steps: lowercase, tokenize with word_tokenize, drop English stop words,
    keep 4-digit numbers and ordinals such as "21st" unchanged, Porter-stem
    purely alphabetic tokens, and discard every other token.

    Args:
        text: Raw document or query text. Anything that is not a ``str``
            (None, NaN, numbers, lists, ...) is treated as missing.

    Returns:
        list[str]: the processed tokens; an empty list for missing input or
        when tokenization/stemming raises.
    """
    # Type check only: a str is never NaN, so this single test also covers
    # None/NaN. The previous `pd.isna(text) or ...` evaluated pd.isna first,
    # which returns an array for list-like input and then raises
    # "truth value of an array is ambiguous" instead of returning [].
    if not isinstance(text, str):
        return []
    try:
        kept_tokens = []
        for token in word_tokenize(text.lower()):
            if token in stop_words:
                continue
            # Years ("1904") and ordinals ("21st") are kept verbatim.
            if re.match(r'^\d{4}$', token) or re.match(r'^\d+(st|nd|rd|th)$', token):
                kept_tokens.append(token)
            elif token.isalpha():
                kept_tokens.append(ps.stem(token))
        return kept_tokens
    except Exception as e:
        # Deliberate best-effort: one bad document must not abort the whole
        # DataFrame.apply(); report it and index it as empty.
        print(f"Error processing text: {text[:50]}... Error: {e}")
        return []

In [3]:
# Tokenize/stem every document once; later cells reuse this column as the
# BM25 corpus.
processed_column = df['text'].apply(preprocess_text)
df['processed_text'] = processed_column

# Debug: Inspect data
print("\nData Overview:")
judgment_preview = df[['topic_id', 'result_id', 'rel']].head(10)
print(judgment_preview)
print("\nRel column distribution:")
rel_distribution = df['rel'].value_counts(dropna=False)
print(rel_distribution)


Data Overview:
                                            topic_id  \
0  https://guides.loc.gov/chronicling-america-190...   
1  https://guides.loc.gov/chronicling-america-190...   
2  https://guides.loc.gov/chronicling-america-190...   
3  https://guides.loc.gov/chronicling-america-190...   
4  https://guides.loc.gov/chronicling-america-190...   
5  https://guides.loc.gov/chronicling-america-190...   
6  https://guides.loc.gov/chronicling-america-190...   
7  https://guides.loc.gov/chronicling-america-190...   
8  https://guides.loc.gov/chronicling-america-190...   
9  https://guides.loc.gov/chronicling-america-190...   

                                           result_id  rel  
0  https://www.loc.gov/resource/sn87093407/1904-0...    1  
1  https://www.loc.gov/resource/sn99063957/1904-0...    1  
2  https://www.loc.gov/resource/sn85052116/1904-0...    1  
3  https://www.loc.gov/resource/sn99063957/1904-0...    1  
4  https://www.loc.gov/resource/sn85066387/1904-0...    1  
5  http

In [4]:
# Organize relevance judgments as {topic_id: [(result_id, rel), ...]},
# with result ids normalised to stripped strings and rel cast to int.
rel_judgments = {}
for topic_key, topic_rows in df.groupby('topic_id')[['result_id', 'rel']]:
    cleaned_ids = topic_rows['result_id'].astype(str).str.strip()
    labels = topic_rows['rel'].astype(int)
    rel_judgments[topic_key] = list(zip(cleaned_ids, labels))

print("\nRelevance Judgments:")
for topic, judgments in rel_judgments.items():
    relevant_count = [label for _, label in judgments].count(1)
    print(f"Topic: {topic}, Judgments: {len(judgments)}, Relevant: {relevant_count}")
    if judgments:
        print(f"Sample result_ids: {[pair[0] for pair in judgments[:2]]}")


Relevance Judgments:
Topic: https://guides.loc.gov/chronicling-america-16th-amendment, Judgments: 15, Relevant: 10
Sample result_ids: ['https://www.loc.gov/resource/sn83040198/1909-10-01/ed-1/?sp=5&q=sixteenth+amendment', 'https://www.loc.gov/resource/sn84020558/1912-03-21/ed-1/?sp=1&q=sixteeth+amendment+income+tax']
Topic: https://guides.loc.gov/chronicling-america-1904-presidential-election, Judgments: 15, Relevant: 9
Sample result_ids: ['https://www.loc.gov/resource/sn87093407/1904-06-21/ed-1/?sp=1&q=Republican+Convention+Theodore+Roosevelt&st=text&r=-0.06,0.171,0.632,0.632,0', 'https://www.loc.gov/resource/sn99063957/1904-06-21/ed-1/?sp=1&q=Republican+National+Convention+Vice+President+Fairbanks+Roosevelt+Delegations']
Topic: https://guides.loc.gov/chronicling-america-19th-amendment, Judgments: 12, Relevant: 5
Sample result_ids: ['https://www.loc.gov/resource/sn85058130/1890-03-27/ed-1/?sp=1&q=Discussing+suffrage', 'https://www.loc.gov/resource/sn85066387/1895-05-21/ed-1/?sp=4&q=W

In [26]:
# Initialize BM25 over the pre-tokenized documents. `corpus` and
# `result_ids` are parallel lists built from the same rows of `df`, so
# position i in one corresponds to position i in the other.
corpus = list(df['processed_text'])
result_ids = [raw_id.strip() for raw_id in df['result_id'].astype(str)]
bm25 = BM25Okapi(corpus)

def retrieve_bm25(query, top_k=10):
    """
    Rank the indexed documents against `query` with the module-level BM25 model.

    Args:
        query: Raw query text; it is run through preprocess_text() first.
        top_k: Maximum number of hits to return.

    Returns:
        A list of (result_id, score, document_tokens) tuples, best score
        first; an empty list when the query has no usable tokens.
    """
    query_tokens = preprocess_text(query)
    if len(query_tokens) == 0:
        return []

    scores = bm25.get_scores(query_tokens)
    # Ascending argsort reversed => descending by score.
    ranking = np.argsort(scores)[::-1]

    hits = []
    for doc_idx in ranking[:top_k]:
        hits.append((result_ids[doc_idx], scores[doc_idx], corpus[doc_idx]))
    return hits

def pseudo_relevance_feedback(query, initial_results, top_k=10, feedback_docs=5, terms_to_add=5):
    """
    Apply PRF by extracting terms from top feedback_docs and expanding the query.

    Args:
        query: The raw query text that produced `initial_results`.
        initial_results: Ranked hits as returned by retrieve_bm25(), i.e.
            (result_id, score, document_tokens) tuples.
        top_k: Number of hits to return from the second retrieval pass.
        feedback_docs: How many top-ranked hits are treated as relevant.
        terms_to_add: How many of the most frequent new terms to append.

    Returns:
        The hits from re-running retrieve_bm25() on the expanded query, or
        `initial_results` unchanged when it is empty.
    """
    if not initial_results:
        return initial_results

    # Count term frequencies across the feedback documents. Updating the
    # counter document by document keeps first-seen order, so ties in
    # most_common() resolve exactly as they would for one concatenated list.
    term_counts = Counter()
    for _, _, doc_tokens in initial_results[:feedback_docs]:
        term_counts.update(doc_tokens)

    # Exclude original query tokens to avoid redundancy (set => O(1) lookups).
    query_terms = set(preprocess_text(query))
    new_terms = [
        term for term, _ in term_counts.most_common()
        if term not in query_terms
    ][:terms_to_add]

    # NOTE(review): new_terms are already-processed corpus tokens; joining
    # them into a string means retrieve_bm25() runs preprocess_text() on them
    # a second time (stop-word filter + stemming). Confirm that is intended.
    expanded_query = query + ' ' + ' '.join(new_terms)
    # print(f"PRF Expanded Query: {expanded_query}")
    # Re-run retrieval
    return retrieve_bm25(expanded_query, top_k)

def _topic_slug(topic_id):
    """Reduce a topic id (bare slug or guide URL) to the slug used as an expansion key."""
    slug = str(topic_id).strip().rstrip('/')
    suffix = '/selected-articles'
    if slug.endswith(suffix):
        slug = slug[:-len(suffix)]
    # Keep only the last path segment of a URL; a bare slug is unaffected.
    slug = slug.rsplit('/', 1)[-1]
    prefix = 'chronicling-america-'
    if slug.startswith(prefix):
        slug = slug[len(prefix):]
    return slug


def expand_query(query, topic_id):
    """
    Explicit query expansion with topic-specific terms.

    The expansion table is keyed by short slugs such as 'mona-lisa', while
    the topic ids in this notebook are full guide URLs such as
    'https://guides.loc.gov/chronicling-america-electric-chair/'. The id is
    therefore normalised to its slug before the lookup; previously the raw
    URL was used as the key, so no expansion was ever applied.

    Args:
        query: The base query text.
        topic_id: A bare slug or a full guide URL identifying the topic.

    Returns:
        `query` followed by a space and the expansion terms, or `query`
        unchanged when the topic has no entry in the table.
    """
    expansions = {
        'chinese-exclusion-act': '1882 immigration bill',
        'mona-lisa': 'leonardo painting',
        'statue-of-liberty': '1886 france bartholdi',
        'mothers-day': '1910 wilson',
        'electric-chair': '1890 execution',
        'yoga': 'meditation exercise',
        'motorcycle-mania': 'bike race',
        'female-pilots': 'aviation women',
        'league-of-nations': '1919 wilson treaty',
        'ping-pong-craze': 'table tennis',
        'ouija-board': 'spiritualism board'
    }
    extra_terms = expansions.get(_topic_slug(topic_id), '')
    if not extra_terms:
        # Nothing to add: return the query as-is rather than with a dangling space.
        return query
    return query + ' ' + extra_terms

def evaluate_retrieval(query, retrieved, rel_judgments, k=10):
    """
    Compute precision@k and recall@k for one query and print a short trace.

    The topic id is rebuilt from the query by turning spaces back into
    hyphens, so `query` must be the unexpanded form produced by
    ``topic.replace('-', ' ')``.

    Args:
        query: Query text derived from the topic id as described above.
        retrieved: Ranked hits as (result_id, score, document_tokens) tuples.
        rel_judgments: Mapping of topic_id -> list of (result_id, rel) pairs;
            rel == 1 marks a relevant document.
        k: Rank cutoff.

    Returns:
        (precision, recall) as floats. (0.0, 0.0) when the topic has no
        relevant documents; precision is 0.0 when nothing was retrieved.
    """
    topic_id = query.replace(' ', '-')
    relevant = set(res_id for res_id, rel in rel_judgments.get(topic_id, []) if rel == 1)
    print(f"\nTopic: {topic_id}, Relevant result_ids: {relevant}")
    if not relevant:
        print(f"No relevant documents for {topic_id}")
        return 0.0, 0.0
    retrieved_k = [res_id for res_id, _, _ in retrieved[:k]]
    print(f"Retrieved result_ids: {retrieved_k}")

    # Precision: share of returned rows that are relevant. This equals
    # precision_score(y_true, all-ones, zero_division=0) without the sklearn call.
    hit_flags = [res_id in relevant for res_id in retrieved_k]
    precision = sum(hit_flags) / len(retrieved_k) if retrieved_k else 0.0

    # Recall: count each relevant document once. The ranked list can contain
    # the same result_id several times, and counting every occurrence could
    # push recall above 1.0.
    unique_hits = relevant.intersection(retrieved_k)
    recall = len(unique_hits) / len(relevant)
    print(f"Matches: {len(unique_hits)} out of {len(relevant)} relevant")
    return precision, recall

# Main evaluation: PRF with and without query expansion
# `topics` keeps first-appearance order; `results` accumulates one
# precision and one recall value per evaluate_model() call, per method.
topics = df['topic_id'].unique().tolist()
results = {
    method_name: {'precisions': [], 'recalls': []}
    for method_name in ('baseline', 'prf', 'prf_with_expansion')
}

# topic = topics[0]

def evaluate_model(topic):
    """
    Evaluate one topic with baseline BM25, PRF, and PRF on an expanded query.

    Each method's precision@10 / recall@10 is appended to the module-level
    `results` dict and echoed to stdout. Nothing is returned.

    NOTE(review): a later cell rebinds the global name `results` to the
    output of get_results(); calling this function after that cell has run
    would fail on the lookups below.
    """
    query = topic.replace('-', ' ')
    topic_id = topic
    print(f"\n\nEvaluating Topic: {topic}")

    def record(method, retrieved):
        # Score against the *unexpanded* query: evaluate_retrieval() rebuilds
        # the topic id from it.
        precision, recall = evaluate_retrieval(query, retrieved, rel_judgments, k=10)
        bucket = results[method]
        bucket['precisions'].append(precision)
        bucket['recalls'].append(recall)
        return precision, recall

    # 1. Baseline BM25
    print("\n--- Baseline BM25 ---")
    baseline_results = retrieve_bm25(query, top_k=10)
    p_base, r_base = record('baseline', baseline_results)
    print(f"Baseline Precision@10: {p_base:.3f}, Recall@10: {r_base:.3f}")

    # 2. PRF without query expansion
    print("\n--- PRF without Query Expansion ---")
    prf_results = pseudo_relevance_feedback(
        query, baseline_results, top_k=10, feedback_docs=5, terms_to_add=5
    )
    p_prf, r_prf = record('prf', prf_results)
    print(f"PRF Precision@10: {p_prf:.3f}, Recall@10: {r_prf:.3f}")

    # 3. PRF with query expansion
    print("\n--- PRF with Query Expansion ---")
    expanded_query = expand_query(query, topic_id)
    print(f"Initial Expanded Query: {expanded_query}")
    initial_results = retrieve_bm25(expanded_query, top_k=10)
    prf_exp_results = pseudo_relevance_feedback(
        expanded_query, initial_results, top_k=10, feedback_docs=5, terms_to_add=5
    )
    p_prf_exp, r_prf_exp = record('prf_with_expansion', prf_exp_results)
    print(f"PRF+Expansion Precision@10: {p_prf_exp:.3f}, Recall@10: {r_prf_exp:.3f}")

# # Summarize results
# print("\n\nFinal Results:")
# for method in results:
#     avg_p = np.mean(results[method]['precisions'])
#     avg_r = np.mean(results[method]['recalls'])
#     print(f"{method.capitalize()} - Avg Precision@10: {avg_p:.3f}, Avg Recall@10: {avg_r:.3f}")

In [28]:
def get_outputs(topic):
    """
    Run baseline, PRF and PRF-with-expansion retrieval for one topic.

    The query is the topic id with hyphens replaced by spaces. The combined
    hits of all three methods are written to 'retrieval_results_prf.csv'
    (overwritten on every call) and returned.

    Returns:
        DataFrame with columns topic_id, method, result_id, score — one row
        per hit per method, so the same result_id can appear several times.
    """
    query = topic.replace('-', ' ')

    def run(method):
        if method == 'baseline':
            return retrieve_bm25(query, top_k=10)
        if method == 'prf':
            return pseudo_relevance_feedback(query, retrieve_bm25(query, top_k=10), top_k=10)
        expanded_query = expand_query(query, topic)
        return pseudo_relevance_feedback(
            expanded_query, retrieve_bm25(expanded_query, top_k=10), top_k=10
        )

    rows = [
        {'topic_id': topic, 'method': method, 'result_id': result_id, 'score': score}
        for method in ('baseline', 'prf', 'prf_with_expansion')
        for result_id, score, _ in run(method)
    ]
    output_df = pd.DataFrame(rows)
    output_df.to_csv('retrieval_results_prf.csv', index=False)
    # print("\nResults saved to 'retrieval_results_prf.csv'")
    return output_df

In [7]:
# Retrieve with all three methods for the first topic; get_outputs() also
# rewrites 'retrieval_results_prf.csv' as a side effect.
output_df = get_outputs(topics[0])

PRF Expanded Query: https://guides.loc.gov/chronicling america 1904 presidential election state amend nation june vote
PRF Expanded Query: https://guides.loc.gov/chronicling america 1904 presidential election  state amend nation june vote

Results saved to 'retrieval_results_prf.csv'


In [20]:
# Display the combined retrieval table (rich DataFrame repr).
output_df

Unnamed: 0,topic_id,method,result_id,score
0,https://guides.loc.gov/chronicling-america-190...,baseline,https://www.loc.gov/resource/sn83030214/1920-0...,9.460762
1,https://guides.loc.gov/chronicling-america-190...,baseline,https://www.loc.gov/resource/sn86063381/1906-0...,6.895322
2,https://guides.loc.gov/chronicling-america-190...,baseline,https://www.loc.gov/resource/sn84020657/1925-0...,6.349662
3,https://guides.loc.gov/chronicling-america-190...,baseline,https://www.loc.gov/resource/sn84001718/1920-1...,5.772802
4,https://guides.loc.gov/chronicling-america-190...,baseline,https://www.loc.gov/resource/sn96060547/1919-0...,5.763385
5,https://guides.loc.gov/chronicling-america-190...,baseline,https://www.loc.gov/resource/sn86079068/1904-1...,5.56745
6,https://guides.loc.gov/chronicling-america-190...,baseline,https://www.loc.gov/resource/sn96076839/1920-0...,5.188134
7,https://guides.loc.gov/chronicling-america-190...,baseline,https://www.loc.gov/resource/sn96076839/1920-0...,5.188134
8,https://guides.loc.gov/chronicling-america-190...,baseline,https://www.loc.gov/resource/sn89066315/1919-0...,4.94954
9,https://guides.loc.gov/chronicling-america-190...,baseline,https://www.loc.gov/resource/sn83040198/1909-1...,4.732053


In [8]:
def get_categories(output_df, source_df=None):
    """
    Look up a time label for every row of `output_df`.

    For each row, the first row of the source frame with the same topic_id
    is found, its 'extracted_dates' cell (a string holding a Python list
    literal) is parsed, and the first date in that list is used.

    NOTE(review): the match is on topic_id only, so every result of a topic
    gets the same label. If a per-document date was intended, the lookup
    should probably key on result_id — confirm.

    Args:
        output_df: DataFrame with a 'topic_id' column.
        source_df: DataFrame with 'topic_id' and 'extracted_dates' columns.
            Defaults to the module-level `df`.

    Returns:
        list: one entry per row of `output_df`, in row order — the first
        extracted date, "No timeline found" when the parsed list is empty,
        or "" when the topic does not occur in the source frame.

    Raises:
        ValueError / SyntaxError: from ast.literal_eval when a needed
        'extracted_dates' cell is not a valid literal.
    """
    if source_df is None:
        source_df = df

    # One pass over the source frame instead of a full scan per output row.
    # Iterating values (not `frame[col][i]` labels) also keeps this working
    # when either frame's index is not 0..n-1.
    first_raw_dates = {}
    for topic_id, raw_dates in zip(source_df['topic_id'], source_df['extracted_dates']):
        first_raw_dates.setdefault(topic_id, raw_dates)

    # Parse lazily so only topics that are actually requested are evaluated.
    resolved = {}
    time = []
    for topic_id in output_df['topic_id']:
        if topic_id not in resolved:
            if topic_id in first_raw_dates:
                dates = ast.literal_eval(first_raw_dates[topic_id])
                resolved[topic_id] = dates[0] if dates else "No timeline found"
            else:
                resolved[topic_id] = ""
        time.append(resolved[topic_id])
    return time

In [9]:
# Attach the looked-up time label to every retrieved row (adds a column to
# output_df in place).
output_df['predicted_time'] = get_categories(output_df)

In [10]:
def get_results(output_df_sorted, categories_dict, num_per_topic = 5):
    """
    Group result ids by predicted time, keeping the first few per group.

    Rows are consumed in the frame's row order, so pass a frame already
    sorted by descending score to get the best hits per group.

    NOTE(review): the same result_id can occur in several rows (one per
    retrieval method) and is not de-duplicated here — confirm whether
    repeated ids in a group are wanted.

    Args:
        output_df_sorted: DataFrame with 'predicted_time' and 'result_id'
            columns.
        categories_dict: Mapping whose keys are the known time labels. Only
            the keys are used and the mapping is no longer modified, so the
            call can safely be repeated with the same dict.
        num_per_topic: Maximum number of result ids kept per time label.

    Returns:
        dict[str, list]: time label (as str) -> result ids in row order.
        Labels that received no rows map to an empty list; labels missing
        from `categories_dict` get their own group instead of a KeyError.
    """
    results = {str(category): [] for category in categories_dict}
    # Iterate values positionally; `frame[col][i]` is label-based and breaks
    # (or silently reorders) when the index is not 0..n-1.
    rows = zip(output_df_sorted['predicted_time'], output_df_sorted['result_id'])
    for predicted_time, result_id in rows:
        bucket = results.setdefault(str(predicted_time), [])
        if len(bucket) < num_per_topic:  # cap on results shown per time label
            bucket.append(result_id)
    return results

In [67]:
# NOTE(review): `output_df_sorted` and `categories_dict` are not assigned at
# top level in any cell visible here (only inside pipeline()), so this cell
# appears to rely on leftover kernel state. It also rebinds the global
# `results` that evaluate_model() appends to.
results = get_results(output_df_sorted, categories_dict)

In [68]:
# Display the grouped result ids returned by get_results().
results

{'June 11': ['https://www.loc.gov/resource/sn83030214/1920-08-19/ed-1/?sp=1&q=Suffrage+Ratification&r=0.19,0.286,0.837,0.392,0',
  'https://www.loc.gov/resource/sn83030214/1920-08-19/ed-1/?sp=1&q=Suffrage+Ratification&r=0.19,0.286,0.837,0.392,0',
  'https://www.loc.gov/resource/sn89066315/1919-01-16/ed-1/?sp=1&q=ratification+prohibition+amendment',
  'https://www.loc.gov/resource/sn89066315/1919-01-16/ed-1/?sp=1&q=ratification+prohibition+amendment',
  'https://www.loc.gov/resource/sn96060547/1919-01-18/ed-1/?sp=1&q=ratification+prohibition+amendment']}

In [69]:
# NOTE(review): `query` is only a local/parameter name in the cells visible
# here; this display appears to rely on a variable left over in the kernel.
query

'https://guides.loc.gov/chronicling america 1904 presidential election'

In [21]:
def print_results(results):
    """
    Print each group label followed by a colon, then its items one per line.

    Args:
        results: Mapping of str label -> iterable of items to print.
    """
    for label, items in results.items():
        print(label + ":")
        for item in items:
            print(item)

In [22]:
def pipeline(query):
    """
    Retrieve, label, group and print the results for one topic.

    Despite its name, `query` is passed straight to get_outputs() as the
    topic id (e.g. an element of `topics`).

    Steps: run the three retrieval methods, attach a predicted time label
    to every hit, sort all hits by descending score, keep the first hits
    per time label via get_results(), and print them. Nothing is returned.
    """
    output_df = get_outputs(query)
    if output_df.empty:
        # No hits means no columns either; the lookups below would raise KeyError.
        print("No results found.")
        return

    output_df['predicted_time'] = get_categories(output_df)
    output_df_sorted = (
        output_df
        .sort_values(by='score', ascending=False)
        .reset_index(drop=True)
    )

    # One zero-initialised counter per time label present in the results.
    categories_dict = dict.fromkeys(
        output_df_sorted['predicted_time'].value_counts().keys(), 0
    )

    results = get_results(output_df_sorted, categories_dict)

    print_results(results)

In [23]:
# Display the list of topic ids (guide URLs) available for pipeline().
topics

['https://guides.loc.gov/chronicling-america-1904-presidential-election',
 'https://guides.loc.gov/chronicling-america-early-women-aviation/selected-articles',
 'https://guides.loc.gov/chronicling-america-clara-h-barton',
 'https://guides.loc.gov/chronicling-america-boston-subway',
 'https://guides.loc.gov/chronicling-america-building-titanic/selected-articles',
 'https://guides.loc.gov/chronicling-america-cassius-marcellus-clay-jr',
 'https://guides.loc.gov/chronicling-america-chinese-exclusion-act',
 'https://guides.loc.gov/chronicling-america-indianapolis-500',
 'https://guides.loc.gov/chronicling-america-babe-ruth',
 'https://guides.loc.gov/chronicling-america-marie-curie',
 'https://guides.loc.gov/chronicling-america-darwin-theory-of-evolution',
 'https://guides.loc.gov/chronicling-america-eiffel-tower',
 'https://guides.loc.gov/chronicling-america-electric-chair/',
 'https://guides.loc.gov/chronicling-america-ellis-island',
 'https://guides.loc.gov/chronicling-america-american-fe

In [31]:
# Run the full retrieve -> label -> group -> print pipeline for the topic at
# index 5 of `topics`.
pipeline(topics[5])

June 21
https://www.loc.gov/resource/sn84021918/1960-09-09/ed-1/?sp=7&q=clay%2C+cassius+olympics&st=text&r=0.289,0.631,0.39,0.466,0
https://www.loc.gov/resource/sn84021918/1960-09-09/ed-1/?sp=7&q=clay%2C+cassius+olympics&st=text&r=0.289,0.631,0.39,0.466,0
https://www.loc.gov/resource/sn83045462/1963-06-19/ed-1/?sp=34&q=cassius+marcellus+clay
https://www.loc.gov/resource/sn83045462/1963-06-19/ed-1/?sp=34&q=cassius+marcellus+clay
https://www.loc.gov/resource/sn79000083/1963-03-23/ed-1/?sp=7&q=clay%2C+cassius+olympics&r=-0.138,0.178,0.563,0.359,0


{'No timeline found': ['https://www.loc.gov/resource/sn83030272/1897-12-15/ed-1/?sp=3&q=BOSTON+Boston+SUBWAY+subway',
  'https://www.loc.gov/resource/sn83030272/1897-12-15/ed-1/?sp=3&q=BOSTON+Boston+SUBWAY+subway',
  'https://www.loc.gov/resource/sn95079490/1897-10-23/ed-1/?sp=7&q=BOSTON+SUBWAY',
  'https://www.loc.gov/resource/sn95079490/1897-10-23/ed-1/?sp=7&q=BOSTON+SUBWAY',
  'https://www.loc.gov/resource/sn87060004/1905-01-19/ed-1/?sp=4&q=BOSTON+Boston+SUBWAY+subway']}