### Prepare a held-out test dataset

In [None]:
from datasets import load_dataset, DatasetDict

# Fetch MedQuAD from the Hugging Face Hub and carve a reproducible (seed=42)
# 90% / 10% train / held-out test partition out of its "train" split.
medquad = load_dataset("medquad")["train"].train_test_split(test_size=0.1, seed=42)

# Re-wrap both partitions in a DatasetDict and persist them to disk so the
# later cells can reload exactly the same split.
dataset_dict = DatasetDict({split: medquad[split] for split in ("train", "test")})
dataset_dict.save_to_disk("medquad_dataset")


### Model Inference

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from datasets import load_from_disk

# Checkpoint to evaluate -- replace with your own model path or Hub repo id
model_name = "your-trained-model-name"

# Run on the GPU whenever CUDA reports one, otherwise stay on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Tokenizer and causal-LM weights are loaded from the same checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
model = model.to(device)

# Held-out split written by the dataset-preparation cell
test_set = load_from_disk("medquad_dataset")["test"]

def generate_answer(question, max_new_tokens=256):
    """Greedy-decode the model's answer to ``question``.

    Args:
        question: Prompt text sent to the model.
        max_new_tokens: Upper bound on the number of *generated* tokens. The
            prompt does not count against this budget, so long questions
            still leave room for an answer.

    Returns:
        The generated continuation only, with special tokens removed and
        surrounding whitespace stripped. The prompt is not echoed back.
    """
    # Calling the tokenizer (rather than .encode) also yields the attention
    # mask, which generate() needs to distinguish real tokens from padding.
    encoded = tokenizer(question, return_tensors="pt").to(device)
    prompt_len = encoded["input_ids"].shape[-1]

    outputs = model.generate(
        **encoded,
        max_new_tokens=max_new_tokens,
        do_sample=False,
    )

    # A causal LM returns prompt + continuation in one sequence; drop the
    # prompt so the question is not scored as part of the answer.
    answer_ids = outputs[0][prompt_len:]
    return tokenizer.decode(answer_ids, skip_special_tokens=True).strip()

import json

# Run the model over every held-out example in a single pass, pairing each
# generated answer with the reference answer from the same row.
qa_pairs = [(generate_answer(row["question"]), row["answer"]) for row in test_set]
predictions = [generated for generated, _ in qa_pairs]
references = [gold for _, gold in qa_pairs]

# Persist both lists so the metric and MLflow cells can reuse them without
# re-running generation.
serialized = json.dumps({"predictions": predictions, "references": references})
with open("test_predictions.json", "w") as f:
    f.write(serialized)


### Evaluation Metrics

In [None]:
import json
import evaluate

# Read back the generations written by the inference step
with open("test_predictions.json", "r") as f:
    data = json.load(f)
predictions, references = data["predictions"], data["references"]

# BLEU: the metric takes a list of references per prediction, so every single
# reference is wrapped in its own one-element list.
bleu = evaluate.load("bleu")
bleu_references = [[ref] for ref in references]
bleu_score = bleu.compute(predictions=predictions, references=bleu_references)

# ROUGE-L: restrict the computation to the one variant that gets reported
rouge = evaluate.load("rouge")
rouge_score = rouge.compute(
    predictions=predictions,
    references=references,
    rouge_types=["rougeL"],
)

print(f"BLEU Score: {bleu_score['bleu']:.4f}")
print(f"ROUGE-L Score: {rouge_score['rougeL']:.4f}")


### MLflow

In [None]:
import mlflow

# Every evaluation run is grouped under one named experiment
mlflow.set_experiment("Medical-Chatbot-Evaluation")

# Metric name -> value, taken from the scores computed in the metrics cell
run_metrics = {
    "BLEU": bleu_score["bleu"],
    "ROUGE-L": rouge_score["rougeL"],
}

with mlflow.start_run():
    for metric_name, metric_value in run_metrics.items():
        mlflow.log_metric(metric_name, metric_value)
    # Attach the raw generations so the run can be audited later
    mlflow.log_artifact("test_predictions.json")

## Human Judgement

In [None]:
# ---------------------------------------------------------
# Part 2 – Sanity Checks with Human Judgment & Explainable AI
# ---------------------------------------------------------
#
# Prereqs:
#   pip install datasets transformers torch nltk lime evaluate
# ---------------------------------------------------------

import json, random, torch
from datasets import load_from_disk
from transformers import AutoTokenizer, AutoModelForCausalLM
from lime.lime_text import LimeTextExplainer

# ---------- 1.  Load held‑out test set & model ----------

DATA_DIR = "medquad_dataset"            # split saved by the dataset-preparation step
MODEL_NAME = "your-trained-model-name"  # local path or HF repo id

# Held-out examples and the tokenizer matching the checkpoint
test_set = load_from_disk(DATA_DIR)["test"]
tok = AutoTokenizer.from_pretrained(MODEL_NAME)

# Load the weights, switch to inference mode, then move to the GPU when
# CUDA is available (CPU otherwise).
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
model = model.eval()
model = model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

# ---------- 2.  Generate answers on the full test set ----------

def answer(q: str, max_len: int = 256) -> str:
    """Greedy-decode the model's answer to question ``q``.

    Args:
        q: Question text used as the prompt.
        max_len: Maximum number of newly generated tokens. The prompt does
            not count against this budget.

    Returns:
        Only the generated continuation, with special tokens removed and
        surrounding whitespace stripped. The prompt is not echoed back.
    """
    # Calling the tokenizer (rather than .encode) also yields the attention mask
    enc = tok(q, return_tensors="pt").to(model.device)
    prompt_len = enc["input_ids"].shape[-1]

    out = model.generate(**enc, max_new_tokens=max_len, do_sample=False)

    # A causal LM returns prompt + continuation; slice off the prompt so the
    # question does not end up inside the stored "answer".
    return tok.decode(out[0][prompt_len:], skip_special_tokens=True).strip()

# Answer every held-out question once, keeping its reference alongside
qa_pairs = [(answer(row["question"]), row["answer"]) for row in test_set]
preds = [generated for generated, _ in qa_pairs]
refs = [gold for _, gold in qa_pairs]

# Persist both lists as pretty-printed JSON
payload = json.dumps({"predictions": preds, "references": refs}, indent=2)
with open("test_predictions.json", "w") as f:
    f.write(payload)

# ---------- 3.  Sample ~50 Q‑A pairs for human review ----------

SAMPLE_N = 50        # target number of Q-A pairs handed to reviewers
SAMPLE_SEED = 42     # fixed so the same pairs are drawn on every run

# Never ask for more rows than exist: random.sample raises ValueError then.
sample_size = min(SAMPLE_N, len(preds))

# A private Random instance keeps the draw reproducible without touching the
# global RNG state.
idxs = random.Random(SAMPLE_SEED).sample(range(len(preds)), sample_size)

# preds/refs were built by iterating test_set in order, so position i refers
# to the same example in all three. The "question" field must come from the
# dataset row itself -- refs[i] holds the reference *answer*.
sampled = [
    {
        "question": test_set[i]["question"],
        "answer": preds[i],
        "reference": refs[i],
    }
    for i in idxs
]

with open("human_review_samples.json", "w") as f:
    json.dump(sampled, f, indent=2)

# ---------- 4.  Local interpretability with LIME ----------

# LIME looks up class_names[label], where label is a column index of the
# probability matrix -- here a token id. get_vocab() maps token -> id and its
# key order is not guaranteed to follow the ids, so the list is built by
# placing each token at its own id.
token_to_id = tok.get_vocab()
n_classes = max(
    max(token_to_id.values(), default=-1) + 1,
    getattr(model.config, "vocab_size", 0) or 0,  # logits may be wider than the tokenizer vocab
)
token_names = [f"<id {i}>" for i in range(n_classes)]  # placeholder for ids with no token
for token, token_id in token_to_id.items():
    token_names[token_id] = token

explainer = LimeTextExplainer(class_names=token_names)

def proba_func(questions: list[str]):
    """Next-token probability distribution for each input text (LIME classifier_fn).

    Args:
        questions: Texts to score; LIME passes perturbed copies of the
            question being explained.

    Returns:
        A 2-D numpy array with one row per input text and one column per
        logit index. Each row is the softmax over the model's logits at the
        last input position. LIME indexes the result as ``result[:, label]``,
        so a plain Python list of rows is not sufficient.
    """
    rows = []
    # No gradients are needed for scoring. Without no_grad the softmax output
    # would require grad and the conversion to numpy would raise.
    with torch.no_grad():
        for q in questions:
            enc = tok(q, return_tensors="pt").to(model.device)
            # Only logits are used, so no `labels` (and no loss) are requested
            last_logits = model(**enc).logits[:, -1, :]
            # Cast to float32 so half-precision checkpoints still convert to numpy
            rows.append(torch.softmax(last_logits.float(), dim=-1).reshape(-1).cpu())

    if not rows:
        return torch.empty((0, 0)).numpy()
    return torch.stack(rows).numpy()

sample_q = "What are the early signs of stroke?"

# explain_instance defaults to labels=(1,), i.e. it would explain the
# probability of token id 1 regardless of what the model predicts.
# top_labels=1 makes LIME explain the most probable next token instead.
lime_exp = explainer.explain_instance(
    sample_q,
    proba_func,
    top_labels=1,
    num_features=10,
    num_samples=100,
)

# With top_labels set, the explained label must be read back from the result;
# as_list() would otherwise look up label 1, which was not explained.
top_label = lime_exp.available_labels()[0]

lime_exp.save_to_file("lime_explanation.html")  # open in browser
print(lime_exp.as_list(label=top_label))        # console preview


### Template-Based Tests

In [None]:
#### Make paraphrase sets
# -----------------------------------------------
# make_paraphrase_sets.py
# -----------------------------------------------
import json, re, random
from pathlib import Path
from itertools import islice

# Seed Python's global RNG. NOTE(review): no statement visible in this cell
# draws from `random`, so this currently has no effect on the output.
random.seed(42)

# 1️⃣  Canonical questions you care about
# Maps a stable set id -> the canonical wording of the question. Each entry
# becomes one record ("id", "canonical", "paraphrases") in the output file.
SEED_QUESTIONS: dict[str, str] = {
    "common_cold_tx":     "How is the common cold treated?",
    "diabetes_definition": "What is diabetes?",
    "asthma_symptoms":    "What are the symptoms of asthma?",
    "stroke_warning":     "What are the warning signs of a stroke?",
}

# 2️⃣  Simple paraphrase helpers
# Each template has exactly one `{}` slot, which paraphrase() fills with the
# lower-cased question after its leading question word has been stripped.
# NOTE(review): "Whats" has no apostrophe -- confirm whether the informal
# spelling is intentional.
OPENING_TEMPLATES = [
    "How do doctors usually {}?",
    "Can you explain {}?",
    "Whats the standard {}?",
    "Could you tell me {}?",
    "Please describe {}.",
]

# Verb -> synonyms that paraphrase() substitutes for it.
# NOTE(review): none of the SEED_QUESTIONS above contains "defined", so that
# entry never fires for the current seeds.
VERB_REPLACEMENTS = {
    "treated": ["managed", "handled", "cared for"],
    "defined": ["explained", "described"],
}

def paraphrase(sentence: str, n: int = 3) -> list[str]:
    """Build up to ``n`` lightweight paraphrases of ``sentence``.

    Two strategies are combined, in this order:
      * swap the first verb from ``VERB_REPLACEMENTS`` that occurs as a whole
        word for each of its synonyms;
      * drop the leading question word and slot the remainder into every
        entry of ``OPENING_TEMPLATES``.

    Args:
        sentence: Canonical question, with or without a trailing "?".
        n: Maximum number of paraphrases to return.

    Returns:
        At most ``n`` distinct paraphrases in a deterministic order
        (verb swaps first, then templates, each in declaration order). The
        input question itself is never included. A blank ``sentence``
        yields an empty list.

    Limitations:
        The templates are filled mechanically, so the results are not
        guaranteed to be grammatical.
    """
    text = sentence.strip()
    # Remember whether to restore the question mark on verb-swapped variants
    suffix = "?" if text.endswith("?") else ""
    stem = text.rstrip("?").rstrip()
    if not stem:
        return []

    candidates: list[str] = []

    # verb-level replacements -- only whole sentences are collected; the
    # individual words of the question must not leak into the result
    for verb, synonyms in VERB_REPLACEMENTS.items():
        pattern = re.compile(rf"\b{re.escape(verb)}\b")
        if pattern.search(stem):
            candidates.extend(f"{pattern.sub(syn, stem)}{suffix}" for syn in synonyms)
            break  # one replacement per Q

    # opening templates
    root = re.sub(r"^(what|how|can you|could you|please|whats)\s+", "", stem, flags=re.I)
    candidates.extend(tmpl.format(root.lower()) for tmpl in OPENING_TEMPLATES)

    # Order-preserving de-duplication. A set would make the selection depend
    # on string hashing, which varies between interpreter runs.
    original = f"{stem}{suffix}"
    unique = [cand for cand in dict.fromkeys(candidates) if cand != original]
    return list(islice(unique, n))

# 3️⃣  Build the structure
# One record per seed question: its id, canonical wording and four paraphrases
out = [
    {
        "id": qid,
        "canonical": canonical,
        "paraphrases": paraphrase(canonical, n=4),
    }
    for qid, canonical in SEED_QUESTIONS.items()
]

# 4️⃣  Write file
# Create the output folder if needed, then dump the records as indented JSON
out_path = Path("templates/paraphrase_sets.json")
out_path.parent.mkdir(exist_ok=True)
serialized = json.dumps(out, indent=2)
out_path.write_text(serialized)
print(f"✏️  Wrote {out_path} with {len(out)} template sets.")


Run `pytest -q`



### Slice of Interest

Dataset Tagging

In [None]:
# scripts/build_slice_cache.py
from datasets import load_from_disk
import json, numpy as np, evaluate

from pathlib import Path  # needed to create the output folder below

DATA_DIR = "medquad_dataset"            # path to train/test split
TEST_SPLIT = load_from_disk(DATA_DIR)["test"]
SLICE_BLEU_PATH = Path("templates/slice_bleu.json")

bleu = evaluate.load("bleu")

with open("test_predictions.json") as f:       # generated in earlier step
    preds_refs = json.load(f)

slice_preds = preds_refs["predictions"]
slice_refs = preds_refs["references"]

# zip() would silently drop trailing rows on a mismatch and mis-assign
# predictions to slices, so a stale predictions file is rejected up front.
if not (len(TEST_SPLIT) == len(slice_preds) == len(slice_refs)):
    raise ValueError(
        "test_predictions.json is out of sync with the test split: "
        f"{len(TEST_SPLIT)} examples, {len(slice_preds)} predictions, "
        f"{len(slice_refs)} references"
    )

# mapping slice_name -> {"pred": [predictions], "ref": [references]}
slice_scores = {}
for ex, pred, ref in zip(TEST_SPLIT, slice_preds, slice_refs):
    # NOTE(review): assumes every row has a string "question_type" column -- confirm
    key = ex["question_type"].replace(" ", "_")  # normalize spaces
    bucket = slice_scores.setdefault(key, {"pred": [], "ref": []})
    bucket["pred"].append(pred)
    bucket["ref"].append(ref)

# compute BLEU per slice (each reference wrapped in its own list, as the
# metric takes a list of references per prediction)
slice_bleu = {
    key: bleu.compute(
        predictions=d["pred"],
        references=[[r] for r in d["ref"]],
    )["bleu"]
    for key, d in slice_scores.items()
}

# dump for pytest -- create the folder first so this script also works when
# no earlier step has created templates/ yet
SLICE_BLEU_PATH.parent.mkdir(parents=True, exist_ok=True)
with open(SLICE_BLEU_PATH, "w") as f:
    json.dump(slice_bleu, f, indent=2)

print("Wrote templates/slice_bleu.json with", len(slice_bleu), "slices")


Run `pytest -q` with `test_slices`.

In [1]:
## A pytest suite covers all of the tests above; just make sure the JSON files have been generated before running it.