In [2]:
import re
from pathlib import Path
import pdfplumber
from langchain.text_splitter import RecursiveCharacterTextSplitter

Cleaning data to remove headers, footers and noise

In [3]:

def clean_policy_text(text: str) -> str:
    """
    Clean SNAP & Medicaid policy documents by removing headers, footers, and noise.
    Works for both CRS SNAP report and CBPP Medicaid report.
    """
    # Remove common headers/footers
    text = re.sub(r"Policy Basics\s*–.*?(SNAP|Medicaid)", "", text, flags=re.IGNORECASE)
    text = re.sub(r"Supplemental Nutrition Assistance Program.*?(Eligibility and Benefits)?", "", text, flags=re.IGNORECASE)
    text = re.sub(r"Congressional Research Service", "", text, flags=re.IGNORECASE)
    text = re.sub(r"Center on Budget and Policy Priorities\s*\|\s*cbpp\.org", "", text, flags=re.IGNORECASE)
    
    # Remove CRS-style page markers like 'CRS-22'
    text = re.sub(r"CRS-\d+", "", text)
    
    # Remove isolated page numbers
    text = re.sub(r"^\s*\d+\s*$", "", text, flags=re.MULTILINE)
    
    # Normalize spacing
    text = re.sub(r"\n+", "\n", text)        # collapse multiple newlines
    text = re.sub(r"\s{2,}", " ", text)      # collapse multiple spaces
    
    return text.strip()


Extracting text from clean text

In [12]:
def extract_clean_pdf(pdf_path, cleaner):
    pages = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if text:
                cleaned = cleaner(text)
                pages.append(cleaned)
    return "\n".join(pages)

In [5]:
medicaid_path = "data/Medicaid.pdf"
snap_path = "data/SNAP.pdf"

In [13]:
# Extract + clean
medicaid_text = extract_clean_pdf(medicaid_path, clean_policy_text)
snap_text = extract_clean_pdf(snap_path, clean_policy_text)

print("Medicaid text length:", len(medicaid_text))
print("SNAP text length:", len(snap_text))

Medicaid text length: 17596
SNAP text length: 80366


In [15]:
#Chunking

splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,    # ~500 characters per chunk
    chunk_overlap=50,  # overlap to preserve context
    length_function=len
)

medicaid_chunks = splitter.split_text(medicaid_text)
snap_chunks = splitter.split_text(snap_text)

print("Medicaid chunks:", len(medicaid_chunks))
print("SNAP chunks:", len(snap_chunks))

# Preview first chunk
print("\nSample Medicaid chunk:\n", medicaid_chunks[0][:300])
print("\nSample SNAP chunk:\n", snap_chunks[0][:300])

Medicaid chunks: 40
SNAP chunks: 179

Sample Medicaid chunk:
 Introduction to Medicaid
Created in 1965, Medicaid provides health coverage to low-income families and
individuals, including children, parents, pregnant people seniors, and people with
disabilities. It is funded jointly by the federal government and the states. Within
federal guidelines, each state

Sample SNAP chunk:
 (SNAP): A Primer on Eligibility and Benefits
Updated November 13, 2024
https://crsreports.congress.gov
R42505
SUMMARY
R42505
November 13, 2024
(SNAP): A Primer on Eligibility and Benefits
Randy Alison Aussenberg
The (SNAP), formerly called the Food Stamp Specialist in Nutrition
Program, is designed 


In [18]:
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss

  from .autonotebook import tqdm as notebook_tqdm


In [19]:
# 2) Combine your cleaned chunks from Step 2
#    (Assumes you already have: medicaid_chunks, snap_chunks)
corpus = medicaid_chunks + snap_chunks

# 3) Keep an ID → text mapping so we can show the chunk later
id2text = {i: chunk for i, chunk in enumerate(corpus)}

# 4) Load the embedding model (MiniLM). Fast and accurate on CPU.
model_name = "all-MiniLM-L6-v2"
embedder = SentenceTransformer(model_name)

# 5) Encode all chunks into embeddings (shape: [num_chunks, 384])
#    show_progress_bar=True gives a nice progress indicator.
embeddings = embedder.encode(
    corpus,
    convert_to_numpy=True,
    show_progress_bar=True
)

# 6) Normalize embeddings so dot product = cosine similarity
#    (FAISS IndexFlatIP does inner product; with unit vectors it equals cosine)
embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)

# 7) Build the FAISS index for Inner Product (cosine once normalized)
dim = embeddings.shape[1]                 # 384 for MiniLM
index = faiss.IndexFlatIP(dim)            # IP = inner product
index.add(embeddings)                     # add all vectors to the index

print("Embeddings shape:", embeddings.shape)  # e.g., (N, 384)
print("FAISS index size:", index.ntotal)      # should be N


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Batches: 100%|██████████| 7/7 [00:16<00:00,  2.35s/it]

Embeddings shape: (219, 384)
FAISS index size: 219





In [20]:
def search(query: str, k: int = 3):
    """Return top-k most similar chunks to the query (cosine similarity)."""
    # a) Encode the query to the same vector space
    q = embedder.encode([query], convert_to_numpy=True)
    # b) Normalize so inner product = cosine similarity
    q = q / np.linalg.norm(q, axis=1, keepdims=True)
    # c) Search FAISS for top-k neighbors
    scores, idxs = index.search(q, k)
    # d) Package results as list of dicts: rank, score, text
    results = [
        {"rank": i+1, "score": float(scores[0][i]), "text": id2text[idx]}
        for i, idx in enumerate(idxs[0])
    ]
    return results

In [23]:
# Quick smoke test
queries = [
    "What is the minimum monthly SNAP benefit for one- or two-person households in FY2025?",
    "Medicaid coverage for low-income adults",
]
for q in queries:
    print(f"\nTop hits for: {q}")
    hits = search(q, k=3)
    for h in hits:
        print(f"\n[{h['rank']}] score={h['score']:.3f}\n{h['text'][:300]}...")


Top hits for: What is the minimum monthly SNAP benefit for one- or two-person households in FY2025?

[1] score=0.911
and others take into account the general availability of SNAP benefits in deciding what level of
benefits to provide.
Minimum Benefit
Eligible one- or two-person households are guaranteed a minimum monthly benefit allotment
equal to 8% of the maximum benefit for a one-person household. This 2008 cha...

[2] score=0.811
and the District of Columbia. This means that if the benefit calculation for a one- or two-person
household yields a result of less than $23, that household is guaranteed to receive $23 a month. In
FY2019, 10% of SNAP households received the minimum benefit.40
Issuance of Benefits
Benefit issuance i...

[3] score=0.800
and children, and shelters for the homeless.
If a household includes an elderly or disabled member, the household is entitled to different
SNAP deduction rules as well as some different financial eligibility rules (discussed in the next
sec

In [24]:
from transformers import pipeline

# Load Hugging Face summarizer
summarizer = pipeline("summarization", model="google/flan-t5-base", tokenizer="google/flan-t5-base")

def summarize_text(text, max_len=150, min_len=40):
    """Summarize retrieved chunks into a concise answer."""
    try:
        summary = summarizer(text, max_length=max_len, min_length=min_len, do_sample=False)
        return summary[0]['summary_text']
    except Exception as e:
        return f"Error during summarization: {e}"


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Device set to use cuda:0


In [27]:
def query_pipeline(query, k=3):
    # Step 1: embed + search
    q_emb = embedder.encode([query], convert_to_numpy=True).astype("float32")
    q_emb = q_emb / np.linalg.norm(q_emb, axis=1, keepdims=True)
    scores, ids = index.search(q_emb, k)

    # Step 2: get top chunks
    retrieved = [id2text[i] for i in ids[0]]

    # Step 3: join chunks into one context
    combined_text = " ".join(retrieved)

    # Step 4: summarize into a concise answer
    answer = summarize_text(combined_text)
    return answer

# Example query
print(query_pipeline("What percentage of SNAP federal spending in FY2023 was spent on benefits?"))


In FY2023, an average of 42.2 million individuals in 22.3 million households participated in SNAP each month. Monthly benefits averaged $211.65 per person and $400.15 per household.
