In [159]:
# imports
from pathlib import Path
import json
import re
import numpy as np
import torch
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [160]:
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
GEN_MODEL = "google/flan-t5-base"
COLLECTION = "campus_dummy"
TOP_K = 3

In [161]:
# load models
embedder = SentenceTransformer(EMBED_MODEL)

tok = AutoTokenizer.from_pretrained(GEN_MODEL)
gen  = AutoModelForSeq2SeqLM.from_pretrained(GEN_MODEL)

In [162]:
# set up simple in-memory store (no external DB). We'll compute embeddings
# for all docs and perform cosine similarity search with NumPy.
index_ready = False

In [163]:
# input dummy data
docs, ids, metadatas = [], [], []
with open("dummy_corpus.jsonl", "r") as f:
    for i, line in enumerate(f):
        obj = json.loads(line)
        docs.append(obj["text"])
        ids.append(obj["id"])
        metadatas.append({"title": obj["title"], "url": obj["url"]})

In [164]:
# compute the embeddings (torch tensors to avoid numpy dependency inside Torch)
embs = embedder.encode(
    docs,
    normalize_embeddings=True,
    convert_to_numpy=False,
    convert_to_tensor=True
)
# ensure embeddings are on CPU for similarity and indexing
embs = embs.detach().cpu()
# mark index as ready
index_ready = True

In [165]:
# style guide and few-shot example
STYLE_GUIDE = """\
You are Campus Assistant. Answer in one short sentence, present tense, and conversational tone.
Do not hedge (avoid "it seems", "likely"). Do not include citations in the sentence.
If a date exists, write it as "January 12" (Month Day). If semester context appears in sources, include it.
If the question begins with 'why', state one clear reason from the sources.
Example:
Q: When is the Add/Drop deadline?
A: The add/drop deadline is January 12 for the Spring semester.
"""


FEW_SHOT = """\
Q: Where do I find advising?
A: You can meet with advising in the Academic Advising Center during posted hours.
"""

In [166]:
# regex patterns for rule-based answer generation
DATE_PAT = re.compile(r"\b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|"
                      r"Jul(?:y)?|Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?|"
                      r"Dec(?:ember)?)\s+\d{1,2}\b", re.IGNORECASE)
SEMESTER_PAT = re.compile(r"\b(Spring|Summer|Fall|Winter)\b", re.IGNORECASE)
ADD_DROP_PAT = re.compile(r"(add/?drop|drop/?add)", re.IGNORECASE)
ADVISING_PAT = re.compile(r"\b(advis(e|ing)|advisor|advisors?)\b", re.IGNORECASE)

In [167]:
# helpers

# retrieve top-k relevant documents
def retrieve(query: str, k: int = TOP_K):
    q_emb = embedder.encode(
        [query],
        normalize_embeddings=True,
        convert_to_numpy=False,
        convert_to_tensor=True
    )[0].detach().cpu()
    # cosine similarity since embeddings are normalized
    sims = (embs @ q_emb).detach().cpu()
    k = min(k, sims.shape[0])
    values, indices = torch.topk(sims, k)
    hits = []
    for rank, (score, i) in enumerate(zip(values.tolist(), indices.tolist())):
        hits.append({
            "id": ids[int(i)],
            "text": docs[int(i)],
            "metadata": metadatas[int(i)],
            "score": float(score)
        })
    return hits
# build prompt from question and contexts (black-box)
def build_prompt(question: str, contexts: list[dict], max_ctx_chars: int = 1200) -> str:
    ctx = []
    used = 0
    for i, c in enumerate(contexts):
        t = f"[{i+1}] {c['text']}".strip()
        if used + len(t) > max_ctx_chars:
            break
        ctx.append(t)
        used += len(t)

    ctx_block = "\n\n".join(ctx)
    prompt = (
        STYLE_GUIDE + "\n" +
        FEW_SHOT + "\n" +
        "Answer ONLY using the information in these sources. If unsure, say: "
        "\"I’m not sure based on the sources provided.\""
        "\n\n"
        f"Question: {question}\n\n"
        f"Sources:\n{ctx_block}\n\n"
        "Answer:"
    )
    return prompt

def confidence_label(best_score: float) -> str:
    if best_score >= 0.60: return "High"
    if best_score >= 0.40: return "Medium"
    return "Low"

def try_rule_assist_advising(question: str, contexts: list[dict]):
    if not contexts:
        return None
    top = contexts[0]["text"]
    if ADVISING_PAT.search(question) or ADVISING_PAT.search(top):
        m = re.search(r"should (.+?)(?:\.|$)", top, re.IGNORECASE)
        if m:
            reason = m.group(1).strip()
            return f"You should meet with advising to {reason}."
        return "You should meet with advising to plan your classes before the registration window opens."
    return None

# generate answer from prompt
def generate_answer(question: str):
    contexts = retrieve(question, k=TOP_K)

    # compute best score first (for logging + confidence label)
    best_score = contexts[0]["score"] if contexts else 0.0
    print(f"[debug] best_score={best_score:.3f}")
    conf = confidence_label(best_score)

    # try rule-based answers first (so they're not blocked by low-confidence)
    rule_ans = try_rule_assist(question, contexts) or try_rule_assist_advising(question, contexts)
    if rule_ans:
        ans = rule_ans.strip()
        if not ans.endswith("."): ans += "."
        return {
            "question": question,
            "answer": ans,
            "retrieved": contexts,
            "confidence": conf,
            "citations": [{"title": c["metadata"]["title"], "url": c["metadata"]["url"]} for c in contexts]
        }

    # low-confidence guard (after rules)
    LOW_CONF_THRESHOLD = 0.35  # relaxed for small corpora
    if best_score < LOW_CONF_THRESHOLD:
        return {
            "question": question,
            "answer": "I’m not sure based on the sources provided. Please check the official registrar page.",
            "retrieved": contexts,
            "confidence": conf,
            "citations": [{"title": c["metadata"]["title"], "url": c["metadata"]["url"]} for c in contexts]
        }


    if rule_ans:
        ans = rule_ans.strip()
        if not ans.endswith("."): ans += "."
        return {
            "question": question,
            "answer": ans,
            "retrieved": contexts,
            "confidence": conf,
            "citations": [{"title": c["metadata"]["title"], "url": c["metadata"]["url"]} for c in contexts]
        }

    # model-backed answer
    device = "cuda" if torch.cuda.is_available() else "cpu" #gpu's on colab if needed
    gen.to(device)
    prompt = build_prompt(question, contexts)
    inputs = tok(prompt, return_tensors="pt", truncation=True, max_length=512).to(device)
    outputs = gen.generate(
        **inputs,
        max_new_tokens=512,
        num_beams=1,
        do_sample=False,
        length_penalty=0.1,
        early_stopping=True
    )
    answer = tok.decode(outputs[0], skip_special_tokens=True).strip()
    if not answer.endswith("."):
        answer += "."

    return {
        "question": question,
        "answer": answer,
        "retrieved": contexts,
        "confidence": conf,
        "citations": [{"title": c["metadata"]["title"], "url": c["metadata"]["url"]} for c in contexts]
    }

In [173]:
# so black-box baseline is query -> retrieve -> generate_answer
if __name__ == "__main__":
    q = "How do I know what classes to take?"
    out = generate_answer(q)
    print("Q:", out["question"])
    print("A:", out["answer"])
    print("\n-- Retrieved Contexts --")
    for i, c in enumerate(out["retrieved"], 1):
        print(f"[{i}] {c['metadata']['title']} | {c['metadata']['url']}")
        print("   ", c["text"])

[debug] best_score=0.335
Q: How do I know what classes to take?
A: You should meet with advising to meet advisors to plan classes before the registration window opens.

-- Retrieved Contexts --
[1] Advising FAQ | https://example.edu/advising
    Students should meet advisors to plan classes before the registration window opens.
[2] Financial Aid | https://example.edu/aid
    FAFSA priority filing date is November 1.
[3] Registrar Deadlines | https://example.edu/registrar
    Drop/add ends January 12 for the Spring term.
