In [52]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
import numpy as np
from rank_bm25 import BM25Okapi
from sklearn.metrics import precision_score
from collections import Counter

# NLTK downloads
nltk.download('punkt_tab')
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/prudhvivuda/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/prudhvivuda/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/prudhvivuda/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [53]:
try:
    df = pd.read_csv('prudhvi_output.csv')
except FileNotFoundError:
    print("Error: 'prudhvi_output.csv' not found. Please check the file path.")
    exit(1)

ps = PorterStemmer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    if pd.isna(text) or not isinstance(text, str):
        return []
    try:
        tokens = word_tokenize(text.lower())
        processed_tokens = []
        for token in tokens:
            if token in stop_words:
                continue
            if re.match(r'^\d{4}$', token) or re.match(r'^\d+(st|nd|rd|th)$', token):
                processed_tokens.append(token)
            elif token.isalpha():
                processed_tokens.append(ps.stem(token))
        return processed_tokens
    except Exception as e:
        print(f"Error processing text: {text[:50]}... Error: {e}")
        return []

In [54]:
df['processed_text'] = df['text'].apply(preprocess_text)

# Debug: Inspect data
print("\nData Overview:")
print(df[['topic_id', 'result_id', 'rel']].head(10))
print("\nRel column distribution:")
print(df['rel'].value_counts(dropna=False))


Data Overview:
                                            topic_id  \
0  https://guides.loc.gov/chronicling-america-chi...   
1  https://guides.loc.gov/chronicling-america-chi...   
2  https://guides.loc.gov/chronicling-america-chi...   
3  https://guides.loc.gov/chronicling-america-chi...   
4  https://guides.loc.gov/chronicling-america-chi...   
5  https://guides.loc.gov/chronicling-america-chi...   
6  https://guides.loc.gov/chronicling-america-chi...   
7  https://guides.loc.gov/chronicling-america-chi...   
8  https://guides.loc.gov/chronicling-america-chi...   
9  https://guides.loc.gov/chronicling-america-chi...   

                                           result_id  rel  
0  https://www.loc.gov/resource/sn85053040/1882-0...    1  
1  https://www.loc.gov/resource/sn85066387/1901-1...    1  
2  https://www.loc.gov/resource/sn85066387/1901-1...    1  
3  https://www.loc.gov/resource/sn85066387/1901-1...    1  
4  https://www.loc.gov/resource/46032385/1901-12-...    1  
5  http

In [55]:
# Organize relevance judgments
rel_judgments = df.groupby('topic_id')[['result_id', 'rel']].apply(
    lambda x: list(zip(x['result_id'].astype(str).str.strip(), x['rel'].astype(int)))
).to_dict()

print("\nRelevance Judgments:")
for topic, judgments in rel_judgments.items():
    relevant_count = sum(1 for _, rel in judgments if rel == 1)
    print(f"Topic: {topic}, Judgments: {len(judgments)}, Relevant: {relevant_count}")
    if judgments:
        print(f"Sample result_ids: {[res_id for res_id, _ in judgments[:2]]}")


Relevance Judgments:
Topic: 
    https://guides.loc.gov/chronicling-america-motorcycle-mania
    , Judgments: 9, Relevant: 9
Sample result_ids: ['https://www.loc.gov/resource/sn85042591/1869-01-22/ed-1/?sp=1&q=motorcycle+motorcycles+vehicle+street+race&r=0.262,0.46,0.524,0.273,0', 'https://www.loc.gov/resource/sn82015104/1895-11-16/ed-1/?sp=1&q=motorcycle+motorcycles+vehicle+street+race']
Topic: 
    https://guides.loc.gov/chronicling-america-yoga
    , Judgments: 8, Relevant: 8
Sample result_ids: ['https://chroniclingamerica.loc.gov/lccn/sn84026749/1904-12-18/ed-1/seq-30/#words=Vedanta+Vivekananda+yogis+Swami+yogi+yoga+Yoga', 'https://chroniclingamerica.loc.gov/lccn/sn84026749/1908-05-17/ed-1/seq-37/#words=yoga+Yoga+YOGA+Yogi+Yogis']
Topic: 
   https://guides.loc.gov/chronicling-america-theft-mona-lisa
   , Judgments: 12, Relevant: 12
Sample result_ids: ['https://chroniclingamerica.loc.gov/lccn/sn83016810/1910-08-20/ed-1/seq-1/#words=Lisa+Mona', 'https://chroniclingamerica.loc.gov/lc

In [56]:
# Initialize BM25
corpus = df['processed_text'].tolist()
result_ids = df['result_id'].astype(str).str.strip().tolist()
bm25 = BM25Okapi(corpus)

def retrieve_bm25(query, top_k=10):
    query_tokens = preprocess_text(query)
    if not query_tokens:
        return []
    scores = bm25.get_scores(query_tokens)
    top_indices = np.argsort(scores)[::-1][:top_k]
    return [
        (result_ids[idx], scores[idx], corpus[idx])
        for idx in top_indices
    ]

def pseudo_relevance_feedback(query, initial_results, top_k=10, feedback_docs=5, terms_to_add=5):
    """
    Apply PRF by extracting terms from top feedback_docs and expanding the query.
    """
    if not initial_results:
        return initial_results
    # Get top feedback_docs
    feedback_indices = [idx for idx, _ in enumerate(initial_results[:feedback_docs])]
    # Collect all tokens from these documents
    feedback_tokens = []
    for _, _, doc_tokens in initial_results[:feedback_docs]:
        feedback_tokens.extend(doc_tokens)
    # Count term frequencies
    term_counts = Counter(feedback_tokens)
    # Exclude original query tokens to avoid redundancy
    query_tokens = preprocess_text(query)
    new_terms = [
        term for term, _ in term_counts.most_common()
        if term not in query_tokens
    ][:terms_to_add]
    # Expand query
    expanded_query = query + ' ' + ' '.join(new_terms)
    print(f"PRF Expanded Query: {expanded_query}")
    # Re-run retrieval
    return retrieve_bm25(expanded_query, top_k)

def expand_query(query, topic_id):
    """
    Explicit query expansion with topic-specific terms.
    """
    expansions = {
        'chinese-exclusion-act': '1882 immigration bill',
        'mona-lisa': 'leonardo painting',
        'statue-of-liberty': '1886 france bartholdi',
        'mothers-day': '1910 wilson',
        'electric-chair': '1890 execution',
        'yoga': 'meditation exercise',
        'motorcycle-mania': 'bike race',
        'female-pilots': 'aviation women',
        'league-of-nations': '1919 wilson treaty',
        'ping-pong-craze': 'table tennis',
        'ouija-board': 'spiritualism board'
    }
    return query + ' ' + expansions.get(topic_id, '')

def evaluate_retrieval(query, retrieved, rel_judgments, k=10):
    topic_id = query.replace(' ', '-')
    relevant = set(res_id for res_id, rel in rel_judgments.get(topic_id, []) if rel == 1)
    print(f"\nTopic: {topic_id}, Relevant result_ids: {relevant}")
    if not relevant:
        print(f"No relevant documents for {topic_id}")
        return 0.0, 0.0
    retrieved_k = [res_id for res_id, _, _ in retrieved[:k]]
    print(f"Retrieved result_ids: {retrieved_k}")
    y_true = [1 if res_id in relevant else 0 for res_id in retrieved_k]
    y_pred = [1] * len(retrieved_k)
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = sum(y_true) / len(relevant) if relevant else 0.0
    print(f"Matches: {sum(y_true)} out of {len(relevant)} relevant")
    return precision, recall

# Main evaluation: PRF with and without query expansion
topics = df['topic_id'].unique().tolist()
results = {
    'baseline': {'precisions': [], 'recalls': []},
    'prf': {'precisions': [], 'recalls': []},
    'prf_with_expansion': {'precisions': [], 'recalls': []}
}

for topic in topics:
    query = topic.replace('-', ' ')
    topic_id = topic
    print(f"\n\nEvaluating Topic: {topic}")

    # 1. Baseline BM25
    print("\n--- Baseline BM25 ---")
    baseline_results = retrieve_bm25(query, top_k=10)
    p_base, r_base = evaluate_retrieval(query, baseline_results, rel_judgments, k=10)
    results['baseline']['precisions'].append(p_base)
    results['baseline']['recalls'].append(r_base)
    print(f"Baseline Precision@10: {p_base:.3f}, Recall@10: {r_base:.3f}")

    # 2. PRF without query expansion
    print("\n--- PRF without Query Expansion ---")
    prf_results = pseudo_relevance_feedback(query, baseline_results, top_k=10, feedback_docs=5, terms_to_add=5)
    p_prf, r_prf = evaluate_retrieval(query, prf_results, rel_judgments, k=10)
    results['prf']['precisions'].append(p_prf)
    results['prf']['recalls'].append(r_prf)
    print(f"PRF Precision@10: {p_prf:.3f}, Recall@10: {r_prf:.3f}")

    # 3. PRF with query expansion
    print("\n--- PRF with Query Expansion ---")
    expanded_query = expand_query(query, topic_id)
    print(f"Initial Expanded Query: {expanded_query}")
    initial_results = retrieve_bm25(expanded_query, top_k=10)
    prf_exp_results = pseudo_relevance_feedback(expanded_query, initial_results, top_k=10, feedback_docs=5, terms_to_add=5)
    p_prf_exp, r_prf_exp = evaluate_retrieval(query, prf_exp_results, rel_judgments, k=10)
    results['prf_with_expansion']['precisions'].append(p_prf_exp)
    results['prf_with_expansion']['recalls'].append(r_prf_exp)
    print(f"PRF+Expansion Precision@10: {p_prf_exp:.3f}, Recall@10: {r_prf_exp:.3f}")

# Summarize results
print("\n\nFinal Results:")
for method in results:
    avg_p = np.mean(results[method]['precisions'])
    avg_r = np.mean(results[method]['recalls'])
    print(f"{method.capitalize()} - Avg Precision@10: {avg_p:.3f}, Avg Recall@10: {avg_r:.3f}")




Evaluating Topic: https://guides.loc.gov/chronicling-america-chinese-exclusion-act

--- Baseline BM25 ---

Topic: https://guides.loc.gov/chronicling-america-chinese-exclusion-act, Relevant result_ids: {'https://www.loc.gov/resource/46032385/1901-12-06/ed-1/?sp=1&q=act+Chinese+exclusion&r=-0.286,0.087,1.588,0.719,0', 'https://www.loc.gov/resource/sn86063381/1906-01-10/ed-1/?sp=1&q=act+Chinese+exclusion&r=-0.264,0.404,1.184,0.536,0', 'https://www.loc.gov/resource/sn85066387/1902-04-17/ed-1/?sp=1&q=CHINESE+Chinese+exclusion+EXCLUSION', 'https://www.loc.gov/resource/sn84024827/1905-06-30/ed-1/?sp=4&q=ACT+act+Chinese+CHINESE+exclusion+EXCLUSION&r=-0.163,0.262,1.084,0.491,0', 'https://www.loc.gov/resource/sn87065462/1905-06-21/ed-1/?sp=7&q=Act+Chinese+Exclusion&r=-0.236,0.666,0.858,0.389,0', 'https://www.loc.gov/resource/sn86076200/1901-08-03/ed-1/?sp=1&q=chinese+exclusion+act&r=0.053,0.18,0.768,0.348,0', 'https://www.loc.gov/resource/sn85066387/1901-11-26/ed-1/?sp=12&q=act+Chinese+exclusi

In [None]:
output = []
for topic in topics:
    query = topic.replace('-', ' ')
    for method in ['baseline', 'prf', 'prf_with_expansion']:
        if method == 'baseline':
            res = retrieve_bm25(query, top_k=10)
        elif method == 'prf':
            res = pseudo_relevance_feedback(query, retrieve_bm25(query, top_k=10), top_k=10)
        else:
            expanded_query = expand_query(query, topic)
            res = pseudo_relevance_feedback(expanded_query, retrieve_bm25(expanded_query, top_k=10), top_k=10)
        for result_id, score, _ in res:
            output.append({
                'topic_id': topic,
                'method': method,
                'result_id': result_id,
                'score': score
            })
output_df = pd.DataFrame(output)
output_df.to_csv('retrieval_results_prf.csv', index=False)
print("\nResults saved to 'retrieval_results_prf.csv'")

PRF Expanded Query: https://guides.loc.gov/chronicling america chinese exclusion act peopl labor petit question noe
PRF Expanded Query: https://guides.loc.gov/chronicling america chinese exclusion act  peopl labor petit question noe
PRF Expanded Query: https://guides.loc.gov/chronicling america eiffel tower feet pari structur year great
PRF Expanded Query: https://guides.loc.gov/chronicling america eiffel tower  feet pari structur year great
PRF Expanded Query: https://guides.loc.gov/chronicling america electric chair/ tho death would prison bo
PRF Expanded Query: https://guides.loc.gov/chronicling america electric chair/  tho death would prison bo
PRF Expanded Query: https://guides.loc.gov/chronicling america american female pilots miss presid scott women j
PRF Expanded Query: https://guides.loc.gov/chronicling america american female pilots  miss presid scott women j
PRF Expanded Query: https://guides.loc.gov/chronicling america league of nations shall council state execut world
PRF 