In [4]:
import json
import os
from dotenv import load_dotenv

load_dotenv()

# Read the mock policy corpus. The encoding is given explicitly so the result
# does not depend on the platform's default text encoding.
with open("../data/rag_mock_data.json", encoding="utf-8") as f:
    docs = json.load(f)

# The top-level JSON items are whole documents (each carries its own
# "sections" list), so the count is reported as documents.
print("Loaded documents:", len(docs))
docs[0]

Loaded sections: 2


{'document_id': 'housing_policy_2025',
 'title': 'University Residences Guidelines and Policies',
 'domain': 'housing',
 'url': 'https://www.housing.purdue.edu/campus-living/resources/guidelines-policies/ur-guidelines.html',
 'effective_date': '2025-12-15',
 'sections': [{'section_title': 'Emergency Procedures',
   'subsections': [{'section_title': 'Accidents',
     'text': 'Contact the nearest staff member or the main office for assistance. Staff will arrange ambulance transportation and other needed help. Dial 9-1-1 from any campus telephone for immediate contact with the Purdue Police.'},
    {'section_title': 'Bomb Threats',
     'text': 'An intermittent signal on the fire alarm system indicates a bomb threat has been received. If the signal becomes continuous, the building must be totally evacuated.'},
    {'section_title': 'Fire or Explosion',
     'text': 'Activate the nearest fire alarm to cause evacuation of the building. Report location of the fire if possible.'},
     'text'

In [5]:
import json

# Re-read the raw corpus under its own name so the flattening step below has
# an untouched input. Explicit encoding avoids platform-dependent decoding.
with open("../data/rag_mock_data.json", encoding="utf-8") as f:
    raw_docs = json.load(f)

def flatten_policy_json(raw_docs):
    """Flatten nested policy documents into one record per text-bearing entry.

    Each document may contain sections, and each section may carry its own
    ``text`` and/or a list of ``subsections`` with their own ``text``. Every
    non-empty text becomes one flat record that repeats the document-level
    fields.

    Args:
        raw_docs: Iterable of document dicts with the keys ``document_id``,
            ``title``, ``url``, ``domain``, ``effective_date`` and
            ``sections``. Missing keys are read as ``None``.

    Returns:
        List of dicts with the keys ``doc_id``, ``doc_title``, ``url``,
        ``domain``, ``effective_date``, ``section_title``,
        ``subsection_title`` (``None`` for section-level text) and ``text``,
        in document order.
    """
    flat = []

    for doc in raw_docs:
        # Document-level fields copied onto every record of this document.
        base = {
            "doc_id": doc.get("document_id"),
            "doc_title": doc.get("title"),
            "url": doc.get("url"),
            "domain": doc.get("domain"),
            "effective_date": doc.get("effective_date"),
        }

        # `or []` also covers an explicit JSON null, which `.get(key, [])`
        # would hand back as None and then fail to iterate.
        for sec in doc.get("sections") or []:
            sec_title = sec.get("section_title")

            # Case 1: section has direct text
            sec_text = sec.get("text")
            if sec_text:
                flat.append({
                    **base,
                    "section_title": sec_title,
                    "subsection_title": None,
                    "text": sec_text,
                })

            # Case 2: section has subsections (only one level is flattened)
            for sub in sec.get("subsections") or []:
                sub_text = sub.get("text")
                if sub_text:
                    flat.append({
                        **base,
                        "section_title": sec_title,
                        "subsection_title": sub.get("section_title"),
                        "text": sub_text,
                    })

    return flat

# Flatten the nested documents into one record per text-bearing entry.
# NOTE: this rebinds `docs`; from here on it is the flat list of records,
# not the nested JSON that an earlier cell stored under the same name.
docs = flatten_policy_json(raw_docs)
print("Flat entries:", len(docs))
# Show the first flat record as the cell output.
docs[0]

Flat entries: 105


{'doc_id': 'housing_policy_2025',
 'doc_title': 'University Residences Guidelines and Policies',
 'url': 'https://www.housing.purdue.edu/campus-living/resources/guidelines-policies/ur-guidelines.html',
 'domain': 'housing',
 'effective_date': '2025-12-15',
 'section_title': 'Emergency Procedures',
 'subsection_title': 'Accidents',
 'text': 'Contact the nearest staff member or the main office for assistance. Staff will arrange ambulance transportation and other needed help. Dial 9-1-1 from any campus telephone for immediate contact with the Purdue Police.'}

In [6]:
def chunk_text(text, max_words=140, overlap=30):
    """Split ``text`` into overlapping windows of whitespace-separated words.

    Args:
        text: String to split; words are whatever ``str.split()`` yields.
        max_words: Maximum number of words per chunk. Must be positive.
        overlap: Number of words shared between consecutive chunks. Must
            satisfy ``0 <= overlap < max_words`` so the window always
            advances.

    Returns:
        List of chunk strings, each with its words re-joined by single
        spaces. Empty or whitespace-only text yields ``[]``.

    Raises:
        ValueError: If ``max_words`` is not positive or ``overlap`` is
            outside ``[0, max_words)``. Without this check the window would
            never advance and the loop would not terminate.
    """
    if max_words <= 0:
        raise ValueError("max_words must be a positive integer")
    if not 0 <= overlap < max_words:
        raise ValueError("overlap must satisfy 0 <= overlap < max_words")

    words = text.split()
    step = max_words - overlap
    chunks = []
    start = 0
    while start < len(words):
        end = start + max_words
        chunks.append(" ".join(words[start:end]))
        # Stop once this window reached the end of the text. Sliding further
        # would emit a tail chunk made only of words the previous chunk
        # already contains (pure overlap, no new content).
        if end >= len(words):
            break
        start += step
    return chunks

all_chunks = []

for doc in docs:
    chunks = chunk_text(doc["text"])
    subsection = doc["subsection_title"]

    # Prefix shared by every chunk id of this entry: the section title alone
    # for section-level text, otherwise "section::subsection".
    if subsection is None:
        sec_key = doc["section_title"]
    else:
        sec_key = f"{doc['section_title']}::{subsection}"

    for i, chunk in enumerate(chunks):
        # Each chunk gets its own metadata dict: the entry's descriptive
        # fields plus the chunk's position within the entry.
        metadata = {
            field: doc[field]
            for field in (
                "doc_id",
                "doc_title",
                "url",
                "domain",
                "effective_date",
                "section_title",
                "subsection_title",
            )
        }
        metadata["chunk_index"] = i

        record = {
            "id": f"{doc['doc_id']}::{sec_key}::{i}",
            "text": chunk,
            "metadata": metadata,
        }
        all_chunks.append(record)

print("Total chunks:", len(all_chunks))
all_chunks[0]

Total chunks: 106


{'id': 'housing_policy_2025::Emergency Procedures::Accidents::0',
 'text': 'Contact the nearest staff member or the main office for assistance. Staff will arrange ambulance transportation and other needed help. Dial 9-1-1 from any campus telephone for immediate contact with the Purdue Police.',
 'metadata': {'doc_id': 'housing_policy_2025',
  'doc_title': 'University Residences Guidelines and Policies',
  'url': 'https://www.housing.purdue.edu/campus-living/resources/guidelines-policies/ur-guidelines.html',
  'domain': 'housing',
  'effective_date': '2025-12-15',
  'section_title': 'Emergency Procedures',
  'subsection_title': 'Accidents',
  'chunk_index': 0}}

In [7]:
import os
from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec

load_dotenv()

# Fail early with a clear message when the key is missing, using the same
# wording as the key check in the embedding/upsert cell.
api_key = os.getenv("PINECONE_API_KEY")
if not api_key:
    raise RuntimeError("Missing PINECONE_API_KEY in .env")

pc = Pinecone(api_key=api_key)

# Use the same fallback name as the embedding/upsert cell, so an unset
# PINECONE_INDEX_NAME never reaches create_index() as name=None and both
# cells always address the same index.
index_name = os.getenv("PINECONE_INDEX_NAME", "purdue-policy-index-v3")

# Create the index only if it does not exist yet, so the cell can be re-run.
existing = [i["name"] for i in pc.list_indexes()]
if index_name not in existing:
    pc.create_index(
        name=index_name,
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )

index = pc.Index(index_name)
print("Index ready:", index_name)

Index ready: boilercheck-rag


In [10]:
# Vector embedding + upsert block (Pinecone-safe metadata)
import os
import time
import hashlib
from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer

# Load variables from a .env file into the process environment.
load_dotenv()

# Index name comes from the environment; the literal is the fallback used
# when PINECONE_INDEX_NAME is not set.
INDEX_NAME = os.getenv("PINECONE_INDEX_NAME", "purdue-policy-index-v3")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
# Stop here with a clear message rather than failing later inside the client.
if not PINECONE_API_KEY:
    raise RuntimeError("Missing PINECONE_API_KEY in .env")

EMBED_MODEL_NAME = "all-MiniLM-L6-v2"  # name passed to SentenceTransformer below
DIM = 384  # index dimension used at creation; presumably the model's embedding size (confirm if the model changes)
REGION = "us-east-1"  # region passed to ServerlessSpec when the index is created
BATCH_SIZE = 100  # number of chunks sent per upsert call

def make_chunk_id(doc_id: str, section_title: str, subsection_title: str, chunk_index: int) -> str:
    """Build a deterministic vector id for one chunk.

    The four arguments are joined with ``|`` and hashed with SHA-1; the id is
    the document id, a double underscore, and the hex digest. The same inputs
    always produce the same id.

    Args:
        doc_id: Document identifier; also used verbatim as the id prefix.
        section_title: Section title of the chunk.
        subsection_title: Subsection title; callers pass ``""`` when the
            chunk has no subsection.
        chunk_index: Position of the chunk within its section/subsection.

    Returns:
        ``"<doc_id>__<40-character hex digest>"``.
    """
    parts = (doc_id, section_title, subsection_title, chunk_index)
    fingerprint = "|".join(map(str, parts))
    digest = hashlib.sha1(fingerprint.encode("utf-8")).hexdigest()
    return f"{doc_id}__{digest}"

def batched(lst, batch_size):
    """Yield consecutive slices of ``lst`` holding at most ``batch_size`` items.

    Slices come out in order; the last one may be shorter. An empty ``lst``
    yields nothing.
    """
    starts = range(0, len(lst), batch_size)
    yield from (lst[start:start + batch_size] for start in starts)

# This cell consumes `all_chunks` produced by the chunking cell; stop with an
# explicit message when it has not been run in this kernel.
if "all_chunks" not in globals():
    raise RuntimeError("Expected `all_chunks` to exist. Run your chunking cell first.")

print("Chunks to embed:", len(all_chunks))

# Init Pinecone + create index if needed
pc = Pinecone(api_key=PINECONE_API_KEY)

# Names of the indexes that already exist, so re-running the cell does not
# try to create the same index twice.
existing = [i["name"] for i in pc.list_indexes()]
if INDEX_NAME not in existing:
    pc.create_index(
        name=INDEX_NAME,
        dimension=DIM,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region=REGION),
    )
    print("Created index:", INDEX_NAME)

# Handle used for the upsert/stats calls below and for queries in later cells.
index = pc.Index(INDEX_NAME)
print("Index ready:", INDEX_NAME)

# Load embedding model on CPU
# `model` is also reused by later cells to embed queries.
model = SentenceTransformer(EMBED_MODEL_NAME, device="cpu")
print("Embedding model loaded:", EMBED_MODEL_NAME)

total = 0
t0 = time.time()

# Metadata fields that must be strings: a None/null value is replaced by ""
# because the index rejects null metadata values.
string_fields = (
    "subsection_title",
    "section_title",
    "doc_id",
    "doc_title",
    "url",
    "domain",
    "effective_date",
)

for batch in batched(all_chunks, BATCH_SIZE):
    # Embed the whole batch in one call instead of one call per chunk; the
    # result has one row per input text, in the same order.
    embeddings = model.encode([c["text"] for c in batch]).tolist()

    vectors = []

    for c, emb in zip(batch, embeddings):
        md = dict(c["metadata"])  # copy, so the records in all_chunks stay untouched

        # Ensure Pinecone-safe metadata types (no None/null)
        for field in string_fields:
            md[field] = md.get(field) or ""

        # Ensure chunk_index is a number
        md["chunk_index"] = int(md.get("chunk_index", 0))

        # Deterministic id: re-running the cell overwrites the same vectors
        # instead of adding duplicates.
        vec_id = make_chunk_id(md["doc_id"], md["section_title"], md["subsection_title"], md["chunk_index"])

        # Add a readable key for citations
        source_key = md["section_title"] if md["subsection_title"] == "" else f"{md['section_title']} > {md['subsection_title']}"

        vectors.append({
            "id": vec_id,
            "values": emb,
            "metadata": {
                **md,
                "text": c["text"],          # chunk text stored alongside the vector for retrieval
                "source_key": source_key,   # "Section" or "Section > Subsection"
            },
        })

    index.upsert(vectors=vectors)
    total += len(vectors)
    print(f"Upserted so far: {total} (elapsed {time.time() - t0:.1f}s)")

print("Done. Total upserted:", total)
print("Index stats:", index.describe_index_stats())

Chunks to embed: 106
Index ready: boilercheck-rag


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Embedding model loaded: all-MiniLM-L6-v2
Upserted so far: 100 (elapsed 1.7s)
Upserted so far: 106 (elapsed 2.0s)
Done. Total upserted: 106
Index stats: {'_response_info': {'raw_headers': {'connection': 'keep-alive',
                                    'content-length': '185',
                                    'content-type': 'application/json',
                                    'date': 'Tue, 24 Feb 2026 00:35:18 GMT',
                                    'grpc-status': '0',
                                    'server': 'envoy',
                                    'x-envoy-upstream-service-time': '38',
                                    'x-pinecone-request-latency-ms': '37',
                                    'x-pinecone-response-duration-ms': '48'}},
 'dimension': 384,
 'index_fullness': 0.0,
 'memoryFullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'__default__': {'vector_count': 106}},
 'storageFullness': 0.0,
 'total_vector_count': 106,
 'vector_type': 'dense'}


In [11]:
query = "Are portable refrigerators allowed in University Residences rooms?"

# Embed the question with the same model used for the chunks, then fetch the
# five nearest chunks together with their stored metadata.
q_emb = model.encode(query).tolist()
res = index.query(vector=q_emb, top_k=5, include_metadata=True)

for i, m in enumerate(res["matches"], start=1):
    md = m["metadata"]

    # Show at most 350 characters, and add the ellipsis only when something
    # was actually cut off (same rule as format_context's truncation).
    text = md.get("text", "")
    preview = text if len(text) <= 350 else text[:350] + " ..."

    print("\n---", i)
    print("Score:", round(m["score"], 4))
    print("Source:", md.get("doc_title", ""))
    print("Where:", md.get("source_key", ""))
    print("Text:", preview)
    print("URL:", md.get("url", ""))


--- 1
Score: 0.715
Source: University Residences Guidelines and Policies
Where: Safety > Items Prohibited
Text: Items prohibited from University Residences buildings and grounds due to safety or fire hazard concerns include but are not limited to: electric/battery or gasoline motorized cycles, scooters, hoverboards or similar equipment; electric blankets or electric bed warmers of any kind; grow lights; halogen lamps; soldering tools; sun lamps; paintball gu ...
URL: https://www.housing.purdue.edu/campus-living/resources/guidelines-policies/ur-guidelines.html

--- 2
Score: 0.6791
Source: University Residences Guidelines and Policies
Where: Student Conduct Policies > Room Appliances and Furnishings
Text: Allowable cooking appliance use in rooms is limited to: coffee/tea makers with enclosed heating elements, portable microwave ovens (1000 watts or less), and portable refrigerators (UL approved, maximum 4.0 cu ft). Only one refrigerator and one microwave are permitted per room, and they

In [17]:
from openai import OpenAI
import os
from dotenv import load_dotenv

load_dotenv()

# Fail early when the OpenRouter key is missing. Passing api_key=None would
# let the OpenAI client fall back to the OPENAI_API_KEY environment variable,
# which would then be sent to the OpenRouter endpoint.
openrouter_api_key = os.getenv("OPENROUTER_API_KEY")
if not openrouter_api_key:
    raise RuntimeError("Missing OPENROUTER_API_KEY in .env")

# OpenAI-compatible client pointed at OpenRouter.
client = OpenAI(
    api_key=openrouter_api_key,
    base_url="https://openrouter.ai/api/v1",
    default_headers={
        "HTTP-Referer": "http://localhost:3000",  # any URL is fine
        "X-Title": "Purdue Policy RAG MVP",
    },
)

def format_context(matches, max_chars=900):
    """Render retrieved matches as numbered SOURCE blocks for the LLM prompt.

    Args:
        matches: Sequence of match objects; each must support
            ``m["metadata"]`` returning a mapping. The metadata keys read are
            ``doc_title``, ``section_title``, ``subsection_title``, ``url``
            and ``text``; any of them may be absent.
        max_chars: Maximum number of characters of each chunk's text to
            include; longer text is cut and suffixed with ``"..."``.

    Returns:
        One string with a block per match (numbered from 1), blocks joined by
        a newline. Each block lists the title, section path, URL and a
        ``CitationID`` of the form ``[<url>#<section path>]``, followed by
        the text. An empty ``matches`` yields ``""``.
    """
    blocks = []
    for i, m in enumerate(matches, start=1):
        md = m["metadata"]

        # Read every field tolerantly: a match without a section title gets
        # an empty section path instead of raising KeyError.
        section_path = md.get("section_title") or ""
        if md.get("subsection_title"):
            section_path += f" > {md['subsection_title']}"

        title = md.get("doc_title", "")
        url = md.get("url", "")

        text = md.get("text") or ""
        if len(text) > max_chars:
            text = text[:max_chars] + "..."

        # The layout (leading blank line, field order, trailing newline) is
        # part of the prompt and is kept exactly.
        blocks.append(
            f"\nSOURCE {i}\n"
            f"Title: {title}\n"
            f"Section: {section_path}\n"
            f"URL: {url}\n"
            f"CitationID: [{url}#{section_path}]\n"
            f"\n"
            f"{text}\n"
        )
    return "\n".join(blocks)

def kimi_rag_answer(query, top_k=6):
    """Answer ``query`` from retrieved policy chunks and print the reply.

    The question is embedded with the notebook's ``model``, the ``top_k``
    nearest chunks are fetched from ``index``, rendered with
    ``format_context`` and sent in a single user message through ``client``.
    The model's reply is printed; nothing is returned.

    Args:
        query: The user's question.
        top_k: Number of chunks to retrieve as sources.
    """
    # Retrieval: nearest chunks to the embedded question.
    query_vector = model.encode(query).tolist()
    hits = index.query(vector=query_vector, top_k=top_k, include_metadata=True)
    context = format_context(hits["matches"])

    # Prompt: instructions, the rendered sources, then the question.
    prompt = f"""
You are a Purdue policy assistant.

Use ONLY the provided sources.

CITATION RULES:
- Cite using the CitationID provided in each source.
- Always include the full site citation like:
  [https://...#Section Name]

If the answer is not in the sources, say you cannot confirm.

SOURCES:
{context}

QUESTION:
{query}
"""

    # Generation: one user message, low temperature.
    messages = [{"role": "user", "content": prompt}]
    completion = client.chat.completions.create(
        model="openrouter/free",
        messages=messages,
        temperature=0.2,
    )

    answer = completion.choices[0].message.content
    print(answer)

# Smoke test: run one question through retrieval and generation.
# kimi_rag_answer prints the model's reply itself.
kimi_rag_answer("Are portable refrigerators allowed in University Residences rooms?")

Yes, portable refrigerators (UL approved, maximum 4.0 cu ft) are allowed in University Residences rooms [https://www.housing.purdue.edu/campus-living/resources/guidelines-policies/ur-guidelines.html#Student Conduct Policies > Room Appliances and Furnishings].
