In [7]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re

# Make sure the NLTK resources used below (tokenizer data for word_tokenize
# and the stopword lists) are available locally; nltk.download reports
# "already up-to-date" when nothing needs fetching.
for resource in ('punkt_tab', 'punkt', 'stopwords'):
    nltk.download(resource)

# Retrieval data: the columns used below are topic_id, result_id, rel and text.
df = pd.read_csv('prudhvi_output.csv')

# Shared by preprocess_text: English stopword set and a Porter stemmer.
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

def preprocess_text(text):
    """Tokenize, filter and stem a raw document or query string.

    The text is lowercased and split with NLTK's ``word_tokenize``. English
    stopwords are dropped; of the remaining tokens only three kinds are kept:

    * four-digit numbers such as "1882" (kept unchanged),
    * ordinals such as "8th" or "12th" (kept unchanged),
    * purely alphabetic tokens (Porter-stemmed).

    Every other token (punctuation, other numbers, mixed alphanumerics) is
    discarded.

    Args:
        text: Raw text. Anything that is not a ``str`` (NaN, None, numbers,
            lists, ...) is treated as missing.

    Returns:
        list[str]: The processed tokens in their original order. An empty
        list for non-string input, or when tokenizing/stemming raises (the
        error is printed together with the first 50 characters of the text).
    """
    # Check the type first. NaN, None and pd.NA are not str, so this single
    # test covers missing values too. Calling pd.isna() before the type check
    # breaks on list-like input: it returns an array whose truth value is
    # ambiguous and raises ValueError.
    if not isinstance(text, str):
        return []
    try:
        processed_tokens = []
        for token in word_tokenize(text.lower()):
            # Skip stopwords (checked on the unstemmed, lowercased token)
            if token in stop_words:
                continue
            # Keep years (4-digit numbers), e.g., "1882"
            if re.match(r'^\d{4}$', token):
                processed_tokens.append(token)
            # Keep ordinals, e.g., "12th", "8th"
            elif re.match(r'^\d+(st|nd|rd|th)$', token):
                processed_tokens.append(token)
            # Keep alphabetic tokens, stemmed
            elif token.isalpha():
                processed_tokens.append(ps.stem(token))
        return processed_tokens
    except Exception as e:
        # Best effort: one bad document must not abort a whole-column apply.
        print(f"Error processing text: {text[:50]}... Error: {e}")
        return []

# Tokenize/stem every document; each row gets its own list of tokens.
df['processed_text'] = df['text'].map(preprocess_text)

# Relevance judgments: topic_id -> [(result_id, rel), ...]
rel_judgments = (
    df.groupby('topic_id')[['result_id', 'rel']]
    .apply(lambda group: list(zip(group['result_id'], group['rel'])))
    .to_dict()
)

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/prudhvivuda/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/prudhvivuda/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/prudhvivuda/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [16]:
# Inspect the DataFrame; processed_text holds the token lists produced by preprocess_text.
df

Unnamed: 0,topic_id,result_id,rel,text,processed_text
0,https://guides.loc.gov/chronicling-america-chi...,https://www.loc.gov/resource/sn85053040/1882-0...,1,\n The Chinese Bill.\n T...,"[chines, bill, bill, sign, presid, 8th, entitl..."
1,https://guides.loc.gov/chronicling-america-chi...,https://www.loc.gov/resource/sn85066387/1901-1...,1,\n CHINESE EXCLUSION CONVENTION OPE...,"[chines, exclus, convent, open, fight, defens,..."
2,https://guides.loc.gov/chronicling-america-chi...,https://www.loc.gov/resource/sn85066387/1901-1...,1,\n THE Chinese Exclusion Conventi...,"[chines, exclus, convent, complet, labor, yest..."
3,https://guides.loc.gov/chronicling-america-chi...,https://www.loc.gov/resource/sn85066387/1901-1...,1,\n COMMISSIONERS ABE NAMED.\n ...,"[commission, abe, name, chairman, geari, appoi..."
4,https://guides.loc.gov/chronicling-america-chi...,https://www.loc.gov/resource/46032385/1901-12-...,1,\n The Yellow Peril.\n ...,"[yellow, peril, necessari, present, congress, ..."
...,...,...,...,...,...
125,\n https://guides.loc.gov/chronicling-ameri...,\n https://chroniclingamerica.loc.g...,1,\n A PAGE HOW YOUNG GIRLS STUDY TH...,"[page, young, girl, studi, hindoo, method, res..."
126,\n https://guides.loc.gov/chronicling-ameri...,\n https://www.loc.gov/resource/sn8...,1,\n POsition Is Everything in Life t...,"[posit, everyth, life, yogi, practition, tiwar..."
127,\n https://guides.loc.gov/chronicling-ameri...,\n https://www.loc.gov/resource/sn8...,1,\n ¿Qué Es El Yoga? Yoga es una pal...,"[es, el, yoga, yoga, es, una, palabra, sánscri..."
128,\n https://guides.loc.gov/chronicling-ameri...,\n https://www.loc.gov/resource/sn8...,1,\n Con la Práctica del Yoga se Pued...,"[con, la, práctica, del, yoga, se, pued, vivir..."


In [10]:
from rank_bm25 import BM25Okapi
# Build the BM25 index from the token lists, one entry per DataFrame row
# and in row order.
corpus = list(df['processed_text'])
bm25 = BM25Okapi(corpus)

# Score the corpus against a sample query. The query goes through the same
# preprocessing as the documents, so `query` holds tokens here, not the raw
# string.
query = preprocess_text("Chinese exclusion act")
scores = bm25.get_scores(query)

In [11]:
# Show the raw BM25 scores returned by bm25.get_scores for the sample query.
scores

array([ 7.03346405, 10.59085899,  7.52595357, 12.07065187,  9.64555593,
        5.58150414, 10.50632099, 10.64023093, 10.43307407, 11.42167851,
       11.21743257, 10.0993923 ,  0.        ,  0.        ,  0.        ,
        1.63643724,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.58290796,  0.        ,  3.08288294,  1.12337456,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  2.10562083,  0.        ,  1.27002255,  0.        ,
        1.01685086,  0.83682834,  0.64462086,  1.03943229,  0.76631662,
        0.96735802,  0.        ,  0.        ,  0.        ,  0.  

In [None]:
import numpy as np

# result_id values in DataFrame row order. retrieve_bm25 indexes this list with
# positions taken from the BM25 score array, so it must stay aligned with the
# rows of `df` that were used to build the BM25 corpus.
result_ids = df['result_id'].tolist()

def retrieve_bm25(query, top_k=10):
    """Rank the indexed documents against ``query`` with BM25.

    Args:
        query: Raw query string. It is passed through ``preprocess_text``
            before being scored by the module-level ``bm25`` index.
        top_k: Maximum number of results to return. ``None`` returns every
            document; values <= 0 return an empty list.

    Returns:
        list[tuple]: ``(result_id, score, text)`` tuples ordered from the
        highest score to the lowest. Empty when ``top_k`` <= 0 or when the
        query has no tokens left after preprocessing. No score threshold is
        applied, so documents scoring 0 can appear when fewer than ``top_k``
        documents match the query.
    """
    # A negative top_k would otherwise slice from the end of the ranking
    # (e.g. [:-2] returns all but the last two results) instead of nothing.
    if top_k is not None and top_k <= 0:
        return []
    # Preprocess query
    query_tokens = preprocess_text(query)
    if not query_tokens:
        return []
    scores = bm25.get_scores(query_tokens)
    # argsort is ascending; reverse it so the best-scoring positions come first.
    top_indices = np.argsort(scores)[::-1][:top_k]
    return [
        (result_ids[idx], scores[idx], df['text'].iloc[idx])
        for idx in top_indices
    ]

# Demo: run one query through retrieve_bm25 and print the five best hits.
query = "Chinese exclusion act"
results = retrieve_bm25(query, top_k=5)
print(f"\nQuery: {query}")
# Each result is a (result_id, score, text) tuple, best score first.
for result_id, score, text in results:
    print(f"Result ID: {result_id}, Score: {score:.3f}")
    print(f"Text: {text[:100]}...")  # preview only the first 100 characters
    print("-" * 80)


Query: Chinese exclusion act
Result ID: https://www.loc.gov/resource/sn85066387/1901-11-26/ed-1/?sp=12&q=act+Chinese+exclusion&r=-0.867,0.229,2.735,1.238,0, Score: 12.071
Text: 
              COMMISSIONERS ABE NAMED.
              Chairman Geary Appoints Men to
              C...
--------------------------------------------------------------------------------
Result ID: https://www.loc.gov/resource/sn86063381/1906-01-10/ed-1/?sp=1&q=act+Chinese+exclusion&r=-0.264,0.404,1.184,0.536,0, Score: 11.422
Text: 
              EXCLUSION LAW IS VALID
              COURT OF APPEALS DECLARES
              CONGRESS...
--------------------------------------------------------------------------------
Result ID: https://www.loc.gov/resource/sn86076200/1901-08-03/ed-1/?sp=1&q=chinese+exclusion+act&r=0.053,0.18,0.768,0.348,0, Score: 11.217
Text: 
              CHINESE EXCLUSION.
              Miners' I nlon Petition* tor
              enactment...
--------------------------------------------------------