In [4]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

nltk.download('punkt_tab')
nltk.download('punkt')
nltk.download('stopwords')

# Load data
df = pd.read_csv('prudhvi_output.csv')

# Initialize stemmer and stopwords
ps = PorterStemmer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Handle NaN or non-string inputs
    if pd.isna(text) or not isinstance(text, str):
        return []
    try:
        # Tokenize and lowercase
        tokens = word_tokenize(text.lower())
        # Keep alphabetic tokens (stemmed), numbers (unchanged), and specific alphanumeric (e.g., "12th")
        processed_tokens = []
        for token in tokens:
            # Skip stopwords
            if token in stop_words:
                continue
            # Keep years (4-digit numbers), e.g., "1882"
            if re.match(r'^\d{4}$', token):
                processed_tokens.append(token)
            # Keep ordinals (e.g., "12th", "8th") or section numbers
            elif re.match(r'^\d+(st|nd|rd|th)$', token):
                processed_tokens.append(token)
            # Keep alphabetic tokens, stemmed
            elif token.isalpha():
                processed_tokens.append(ps.stem(token))
            # Optionally keep money amounts (e.g., "$500") - uncomment if needed
            # elif re.match(r'^\$\d+$', token):
            #     processed_tokens.append(token)
        return processed_tokens
    except Exception as e:
        print(f"Error processing text: {text[:50]}... Error: {e}")
        return []

# Apply preprocessing
df['processed_text'] = df['text'].apply(preprocess_text)

# Organize relevance judgments
rel_judgments = df.groupby('topic_id')[['result_id', 'rel']].apply(lambda x: list(zip(x['result_id'], x['rel']))).to_dict()

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/prudhvivuda/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/prudhvivuda/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/prudhvivuda/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Error processing text: 
            The Chinese Bill.
            The bil... Error: name 're' is not defined
Error processing text: 
            CHINESE EXCLUSION CONVENTION OPENS
  ... Error: name 're' is not defined
Error processing text: 
              THE Chinese Exclusion Convention
  ... Error: name 're' is not defined
Error processing text: 
              COMMISSIONERS ABE NAMED.
          ... Error: name 're' is not defined
Error processing text: 
              The Yellow Peril.
              It ... Error: name 're' is not defined
Error processing text: 
              The Senate failed to substitute' th... Error: name 're' is not defined
Error processing text: 
              President Roosevelt is reported
   ... Error: name 're' is not defined
Error processing text: 
              THE CHINESE
              EXCLUSION... Error: name 're' is not defined
Error processing text: 
              CHINESE SCORES
              EXCLUS... Error: name 're' is not defined
Error processing te

In [None]:
from rank_bm25 import BM25Okapi
corpus = df['processed_text'].tolist()
bm25 = BM25Okapi(corpus)
query = preprocess_text("Chinese exclusion act 1882")
scores = bm25.get_scores(query)