In [1]:
import pdfplumber
from langchain.text_splitter import RecursiveCharacterTextSplitter
import re

### Clean the text from headers and footers

In [2]:
def clean_text(text):
    """Strip page furniture from extracted PDF text and normalize whitespace.

    Steps, in order:
      1. Remove the "Policy Basics – SNAP" running header (case-insensitive).
      2. Remove lines that consist only of digits (page numbers).
      3. Collapse blank / whitespace-only lines and trim whitespace that
         touches a line break, leaving exactly one newline.
      4. Squeeze runs of spaces/tabs inside a line down to a single space.

    Single newlines are preserved, so the line structure of the page survives.

    Args:
        text: Raw text to clean (e.g. the text of one PDF page).

    Returns:
        The cleaned text with leading and trailing whitespace removed.
    """
    # Remove the SNAP header
    text = re.sub(r"Policy Basics\s*–\s*SNAP", "", text, flags=re.IGNORECASE)

    # Remove isolated page numbers (lines that are just digits)
    text = re.sub(r"^\s*\d+\s*$", "", text, flags=re.MULTILINE)

    # Normalize spacing.
    # Every line break, plus any whitespace around it (trailing spaces,
    # following blank lines, leading indentation), becomes one newline.
    text = re.sub(r"[^\S\n]*\n\s*", "\n", text)
    # Squeeze remaining runs of *horizontal* whitespace only. `[^\S\n]` is
    # "whitespace except newline"; plain `\s{2,}` would also swallow a
    # newline next to a space and silently merge two lines into one.
    text = re.sub(r"[^\S\n]{2,}", " ", text)

    return text.strip()

### Extract text from the PDF

In [3]:
def extract_clean_pdf(pdf_path, cleaner):
    """Extract the text of every page of a PDF, clean it, and join the pages.

    Args:
        pdf_path: Path of the PDF file, passed to ``pdfplumber.open``.
        cleaner: Callable applied to each page's extracted text; its return
            value is what ends up in the result.

    Returns:
        The cleaned page texts joined with a newline. Pages for which
        ``extract_text()`` returns a falsy value are skipped.
    """
    with pdfplumber.open(pdf_path) as document:
        # Lazy generator: each page is extracted right before it is cleaned.
        raw_pages = (page.extract_text() for page in document.pages)
        cleaned_pages = [cleaner(raw) for raw in raw_pages if raw]
    return "\n".join(cleaned_pages)

In [4]:
# Locations of the two source PDFs
medicaid_path, snap_path = "Data/MEDICAID.pdf", "Data/SNAP.pdf"

# Run extraction and cleaning on each document
medicaid_text = extract_clean_pdf(pdf_path=medicaid_path, cleaner=clean_text)
snap_text = extract_clean_pdf(pdf_path=snap_path, cleaner=clean_text)

# Report the size (in characters) of each cleaned document
print("Medicaid text length:", len(medicaid_text))
print("SNAP text length:", len(snap_text))

Medicaid text length: 30220
SNAP text length: 26921


### Chunking the Text

In [5]:
# Split both documents into overlapping character chunks:
# 500 characters per chunk, 50 characters of overlap, measured with len().
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50, length_function=len)

medicaid_chunks, snap_chunks = (
    splitter.split_text(document) for document in (medicaid_text, snap_text)
)

# How many chunks did each document produce?
print(f"Medicaid chunks: {len(medicaid_chunks)}")
print(f"SNAP chunks: {len(snap_chunks)}")

# Show the first 300 characters of the first chunk of each document
print(medicaid_chunks[0][:300])
print(snap_chunks[0][:300])

Medicaid chunks: 67
SNAP chunks: 62
Medicaid and CHIP Overview
September 2024
Thisinformationisintendedonlyfortheuseofentitiesandindividualscertified
to serve as Navigators, certified application counselors, or non-Navigator
personnel in a Federally-facilitated Marketplace. The terms “Federally-
facilitated Marketplace” and “FFM,” as 
X`
The Supplemental Nutrition Assistance Program
(SNAP)
SNAP, formerly known as the Food Stamp Program, is the nation’s most effective
anti-hunger program. In an average month in 2024, SNAP helped an average of more
than 41 million low-income people in the United States afford a nutritionally
adequa


### Build embeddings

In [6]:
from sentence_transformers import SentenceTransformer
import numpy as np

# One corpus: Medicaid chunks first, then SNAP chunks
corpus = [*medicaid_chunks, *snap_chunks]

# Position in the corpus -> chunk text, used to show hits later
id2text = dict(enumerate(corpus))

# Sentence-embedding model (MiniLM)
model_name = "all-MiniLM-L6-v2"
embedder = SentenceTransformer(model_name)

# One embedding row per chunk
embeddings = embedder.encode(corpus, convert_to_numpy=True, show_progress_bar=True)

# Scale every row to unit length so that the inner product used by
# FAISS IndexFlatIP equals cosine similarity.
row_norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
embeddings = embeddings / row_norms





Batches:   0%|          | 0/5 [00:00<?, ?it/s]

### Build FAISS Index

In [7]:
import faiss

# Inner-product index; with unit-length vectors the score is cosine similarity.
dim = embeddings.shape[-1]        # embedding width (second axis of the matrix)
index = faiss.IndexFlatIP(dim)
index.add(embeddings)             # register every chunk vector

# Sanity check: the index should hold one vector per embedding row
print("Embeddings shape:", embeddings.shape)
print("FAISS index size:", index.ntotal)

Embeddings shape: (129, 384)
FAISS index size: 129


### Define the retrieval function and test it

In [8]:
def search(query: str, k: int = 3):
    """Return the top-k chunks most similar to ``query`` (cosine similarity).

    Args:
        query: Query text; encoded with the module-level ``embedder``.
        k: Maximum number of hits. Values larger than the number of indexed
            vectors are clamped to that number; values <= 0 yield no hits.

    Returns:
        A list of dicts with keys ``"rank"`` (1-based position in the FAISS
        result), ``"score"`` (inner product of the normalized vectors, as a
        Python float) and ``"text"`` (the chunk looked up in ``id2text``).
    """
    # Asking FAISS for more neighbours than it holds makes it pad the label
    # array with -1, which is not a key of id2text -> clamp up front.
    k = min(k, index.ntotal)
    if k <= 0:
        return []

    # Encode the query into the same vector space; FAISS expects float32.
    q = embedder.encode([query], convert_to_numpy=True).astype("float32")
    # Normalize so inner product = cosine similarity
    q = q / np.linalg.norm(q, axis=1, keepdims=True)

    # Search FAISS for top-k neighbours (row 0 = our single query)
    scores, idxs = index.search(q, k)

    # Package results; skip any -1 padding label defensively.
    return [
        {"rank": rank, "score": float(score), "text": id2text[int(idx)]}
        for rank, (score, idx) in enumerate(zip(scores[0], idxs[0]), start=1)
        if idx >= 0
    ]

# Smoke test: run two sample questions through search() and show the top 3 hits
queries = ["Who is eligible for SNAP?", "Medicaid coverage for low-income adults"]

for q in queries:
    print(f"\nTop hits for: {q}")
    hits = search(q, k=3)
    for h in hits:
        # Rank, score to 3 decimals, then the first 300 characters of the chunk
        print(f"\n[{h['rank']}] score={h['score']:.3f}\n{h['text'][:300]}...")



Top hits for: Who is eligible for SNAP?

[1] score=0.705
adults (60 years and older) and people with disabilities living on fixed incomes, and other individuals and
households with low incomes. Nearly 62 percent of SNAP participants are in families with children, and
nearly 37 percent are in households with older adults or people with disabilities. After ...

[2] score=0.687
with a child under age 18, an adult age 60 or older, or an individual who is disabled. Children under age 18
constitute 40 percent of all SNAP participants. Nearly 62 percent of SNAP participants are in families with
children; nearly 37 percent are in households with older adults or disabled people....

[3] score=0.678
SNAP and instead receive capped block grants for nutrition assistance.
Who Is Eligible for SNAP?
SNAP is broadly available to households with low incomes. SNAP eligibility rules and benefit levels are, for
the most part, set at the federal level and uniform across the nation, though states have flex

### Summarization Pipeline

In [9]:
from transformers import pipeline

# Hugging Face summarization pipeline; model and tokenizer are both flan-t5-base
summarizer = pipeline(
    task="summarization",
    model="google/flan-t5-base",
    tokenizer="google/flan-t5-base",
)

def summarize_text(text, max_len=200, min_len=40):
    """Condense ``text`` with the module-level ``summarizer`` pipeline.

    Args:
        text: Text to summarize.
        max_len: Passed to the pipeline as ``max_length``.
        min_len: Passed to the pipeline as ``min_length``.

    Returns:
        The ``summary_text`` of the first pipeline result. If anything raises,
        the exception is not propagated; a string starting with
        "Error during summarization:" is returned instead.
    """
    generation_options = {"max_length": max_len, "min_length": min_len, "do_sample": False}
    try:
        outputs = summarizer(text, **generation_options)
        first_result = outputs[0]
        return first_result['summary_text']
    except Exception as err:
        return f"Error during summarization: {err}"

Device set to use cuda:0


### Retrieval and Summarization in a single function

In [15]:
def query_pipeline(query, k=3):
    """Retrieve the chunks closest to ``query`` and summarize them.

    Args:
        query: Question or statement to look up.
        k: Number of nearest chunks to request from the FAISS index.

    Returns:
        Whatever ``summarize_text`` returns for the retrieved chunks followed
        by the query (a summary string, or its error string).
    """
    # Step 1: embed + search (unit-length float32 vector -> cosine scores)
    q_emb = embedder.encode([query], convert_to_numpy=True).astype("float32")
    q_emb = q_emb / np.linalg.norm(q_emb, axis=1, keepdims=True)
    _scores, ids = index.search(q_emb, k)

    # Step 2: get top chunks. FAISS pads the id row with -1 when it has
    # fewer than k vectors; those are not keys of id2text, so skip them.
    retrieved = [id2text[int(i)] for i in ids[0] if i >= 0]

    # Step 3: join chunks into one context
    combined_text = " ".join(retrieved)

    # Step 4: summarize into a concise answer. The separator keeps the last
    # word of the context from fusing with the first word of the query.
    return summarize_text(f"{combined_text} {query}")

In [17]:
# Example: run a knowledge-check question through retrieval + summarization
sep_deadline_question = "Individuals who receive a Medicaid or CHIP denial outside of OE are eligible for an SEP based on the denial of Medicaid or CHIP eligibility ONLY if they applied for coverage during OE or during another previous SEP window and were denied Medicaid or CHIP coverage after OE or their original SEP window ended (but they may still qualify for the Medicaid Unwinding SEP). How long do these consumers have to sign up for a Marketplace plan after they receive their denial?"
print(query_pipeline(sep_deadline_question))

A. 30 days B. 45 days C. 60 days D. 90 days Knowledge Check #3 Answer Answer: B. False If they originally applied at the Marketplace during OE or during a SEP agency for a final eligibility determination;


In [19]:
def query_pipeline2(query, k=3):
    """Retrieve the chunks closest to ``query`` without summarizing them.

    Args:
        query: Question or statement to look up.
        k: Number of nearest chunks to request from the FAISS index.
            Values <= 0 yield an empty list.

    Returns:
        A list of chunk texts in the order FAISS returned them; shorter than
        ``k`` when the index holds fewer than ``k`` vectors.
    """
    if k <= 0:
        return []

    # Step 1: embed + search (unit-length float32 vector -> cosine scores)
    q_emb = embedder.encode([query], convert_to_numpy=True).astype("float32")
    q_emb = q_emb / np.linalg.norm(q_emb, axis=1, keepdims=True)
    _scores, ids = index.search(q_emb, k)

    # Step 2: get top chunks. FAISS pads the id row with -1 when it has
    # fewer than k vectors; those are not keys of id2text, so skip them.
    return [id2text[int(i)] for i in ids[0] if i >= 0]

In [21]:
# Example: show the raw retrieved chunks for a true/false knowledge-check question
retroactive_coverage_question = "Consumers who qualify for an SEP to purchase a Marketplace plan due to a post-OE (or post-SEP) Medicaid/CHIP denial are not eligible to receive Marketplace coverage back to the effective date they would have received based on the date of their original Marketplace application. True or False?"
print(query_pipeline2(retroactive_coverage_question))

['This includes post-enrollment period Medicaid or CHIP denials.\nKnowledge Check #4\nConsumers who qualify for an SEP to purchase a Marketplace plan due to a\npost-OE (or post-SEP) Medicaid/CHIP denial are not eligible to receive\nMarketplace coverage back to the effective date they would have received\nbased on the date of their original Marketplace application.\nA. True\nB. False\nKnowledge Check #4 Answer\nAnswer: B. False\nIf they originally applied at the Marketplace during OE or during a SEP', 'window, consumers who qualify for an SEP to purchase a Marketplace\nplan due to a post-OE (or post-SEP) Medicaid/CHIP denial may be\neligible to receive a retroactive coverage effective date back to the\neffective date they would have received based on the date of their\noriginal Marketplace application.\nKnowledge Check #5\nIn order to receive coverage retroactively, what must the\nconsumer do?\nA. Nothing\nB. Call the Marketplace Call Center\nC. Pay any outstanding premiums\nD. Both B a