In [None]:
!pip install datasets sentence-transformers faiss-cpu langchain openai
!pip install sacremoses
import re

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer




In [None]:
from datasets import load_dataset

# Pull the training split of the synthetic neurology Q&A set from the Hugging Face Hub.
dataset = load_dataset(
    "KryptoniteCrown/synthetic-neurology-QA-dataset",
    split="train",
)

# Keep only the two fields the rest of the notebook reads, as plain dicts.
neurology_qa = [
    dict(question=row["question"], answer=row["answer"])
    for row in dataset
]

# Peek at the first record to confirm the schema.
print(neurology_qa[0])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


{'question': 'What are the key characteristics of deep dyslexia?', 'answer': 'Deep dyslexia is characterized by both semantic paralexia (paralexia) and phonological dyslexia. Individuals with this condition may read a word and substitute it with a semantically related word. It involves significant phonological processing impairments and is often caused by extensive brain damage, frequently in the left hemisphere.'}


In [None]:
# Pull the training split of the cardiology Q&A set from the Hugging Face Hub.
cardiology_dataset = load_dataset(
    "ilyassacha/cardiology_qa",
    split="train",
)

# Keep only the question/answer fields, as plain dicts.
cardiology_qa = [
    dict(question=row["question"], answer=row["answer"])
    for row in cardiology_dataset
]

# Report the corpus size and show the first record.
print("Cardiology Q&A samples:", len(cardiology_qa))
print(cardiology_qa[0])

Cardiology Q&A samples: 14885
{'question': 'My husband of 82 years is on multiple medications for heart disease and Parkensons. His feet are very swollen ...is this because he is retaining fluids and if so what can he do to relieve this. He is under the care of several doctors but they have not offered or explained or shed any light on the cause. margaretmidea@ icloud.com', 'answer': 'hi, thank-you for the brief history.to reduce the retention there should be some muscular activity. it will be active or passive. a physical therapist will play a key role. also keeping the legs elevated will be good for reducing some swelling. use lower limb stockings for compression effect. as due to lack of mobility in the muscular system this is happening. i wish you find a physical therapist who can take the responsibility further and help your husband to the best.  with the grace of god i wish you a speedy recoveryregardsjay in chatbot.'}


In [None]:
# Pull the training split of the dermatology Q&A set from the Hugging Face Hub.
dermatology_dataset = load_dataset(
    "Mreeb/Dermatology-Question-Answer-Dataset-For-Fine-Tuning",
    split="train",
)

# This dataset names its columns "prompt"/"response"; remap them to the
# question/answer schema used for the other two domains.
dermatology_qa = [
    dict(question=row["prompt"], answer=row["response"])
    for row in dermatology_dataset
]

# Report the corpus size and show the first record.
print("Dermatology Q&A samples:", len(dermatology_qa))
print(dermatology_qa[0])

Dermatology Q&A samples: 1460
{'question': 'What is psoriasis and what are its common symptoms?', 'answer': 'Psoriasis is a chronic autoimmune condition that results in the overproduction of skin cells. This overproduction leads to patches of thick, red skin covered with silvery scales. Common symptoms include red patches of skin covered with thick, silvery scales, small scaling spots (commonly seen in children), dry and cracked skin that may bleed, itching, burning, or soreness, thickened, pitted, or ridged nails, and swollen and stiff joints.'}


In [None]:
# Shared sentence-embedding model; build_faiss_index and fake_search both call `embedder.encode`.
embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

def build_faiss_index(docs, domain_name=""):
    """Embed every document's question and load the vectors into a flat L2 FAISS index.

    Args:
        docs: Sequence of dicts, each with a "question" key. Only that field is
            embedded; the rest of each dict is passed through untouched.
        domain_name: Label used in the vector-count message printed after indexing.

    Returns:
        Tuple ``(index, docs)``: the populated ``faiss.IndexFlatL2`` and the very
        same ``docs`` object that was passed in. Vectors are added in ``docs``
        order.
    """
    questions = []
    for doc in docs:
        questions.append(doc["question"])

    vectors = embedder.encode(questions, convert_to_numpy=True, show_progress_bar=True)

    # The index dimensionality is read from the second axis of the embedding matrix.
    flat_index = faiss.IndexFlatL2(vectors.shape[1])
    flat_index.add(vectors)

    # Print vector count for this domain
    print(f"[{domain_name}] Total vectors stored: {flat_index.ntotal}")

    return flat_index, docs

# Build one FAISS index per specialty; each call also prints that domain's vector count.
neurology_index, neurology_docs = build_faiss_index(neurology_qa, "Neurology")
cardio_index, cardio_docs = build_faiss_index(cardiology_qa, "Cardiology")
derma_index, derma_docs = build_faiss_index(dermatology_qa, "Dermatology")

# Registry of (index, docs) pairs keyed by the domain labels that route_pipeline returns.
# NOTE(review): key casing is mixed ("Neurology" vs. lowercase "cardiology"/"dermatology");
# dict lookups are case-sensitive, so callers must use these exact spellings.
vector_dbs = {
    "Neurology": (neurology_index, neurology_docs),
    "cardiology": (cardio_index, cardio_docs),
    "dermatology": (derma_index, derma_docs),
}

# Print a final summary (capitalize() only normalizes the label for display).
print("\nSummary of Vector DBs:")
for domain, (index, docs) in vector_dbs.items():
    print(f" - {domain.capitalize()}: {index.ntotal} vectors")


Batches:   0%|          | 0/46 [00:00<?, ?it/s]

[Neurology] Total vectors stored: 1452


Batches:   0%|          | 0/466 [00:00<?, ?it/s]

[Cardiology] Total vectors stored: 14885


Batches:   0%|          | 0/46 [00:00<?, ?it/s]

[Dermatology] Total vectors stored: 1460

Summary of Vector DBs:
 - Neurology: 1452 vectors
 - Cardiology: 14885 vectors
 - Dermatology: 1460 vectors


In [None]:
def route_pipeline(query: str):
    """Choose the domain vector DB for a query by counting keyword hits per domain.

    Matching is anchored to word starts rather than raw substrings, so short
    keywords no longer fire inside unrelated words ("age" in "manage", "arb" in
    "carbuncle", "ms" in "symptoms"):

    * keywords of up to 4 characters (mostly acronyms) must match a whole word,
      optionally followed by a plural "s"/"es";
    * longer keywords must start at a word boundary but may be followed by more
      letters, so inflected or compound forms still hit ("heart" in
      "heartbeat", "seizure" in "seizures").

    Each domain is scored by the number of distinct keywords found. The highest
    score wins; ties go to the earlier domain in the order cardiology,
    dermatology, Neurology. When nothing matches, "Neurology" is returned as
    the default.

    Args:
        query: Free-text user question.

    Returns:
        A one-element list holding the selected key of ``vector_dbs``:
        ``["cardiology"]``, ``["dermatology"]`` or ``["Neurology"]``.
        The selected label is also printed.
    """
    q = query.lower()

    cardiology_keywords = [
        # 1. Cardiovascular Diseases and Conditions
        "heart disease", "cardiovascular disease", "coronary artery disease", "cad",
        "myocardial infarction", "heart attack", "angina", "unstable angina",
        "stable angina", "heart failure", "congestive heart failure", "chf",
        "arrhythmia", "atrial fibrillation", "afib", "ventricular tachycardia",
        "vt", "bradycardia", "tachycardia", "cardiomyopathy", "hypertrophic cardiomyopathy",
        "dilated cardiomyopathy", "restrictive cardiomyopathy", "pericarditis",
        "endocarditis", "myocarditis", "valvular disease", "aortic stenosis",
        "mitral regurgitation", "aortic regurgitation", "mitral stenosis",
        "pulmonary hypertension", "systemic hypertension", "high blood pressure",
        "low blood pressure", "hypotension", "cor pulmonale", "cardiac arrest",
        "sudden cardiac death", "ischemic heart disease", "atherosclerosis",
        "hyperlipidemia", "dyslipidemia", "high cholesterol",

        # 2. Heart Anatomy and Physiology
        "heart", "aorta", "atrium", "ventricle", "mitral valve", "aortic valve",
        "tricuspid valve", "pulmonary valve", "coronary arteries", "left ventricle",
        "right ventricle", "left atrium", "right atrium", "septum", "endocardium",
        "myocardium", "pericardium", "sinus node", "sa node", "av node", "bundle of his",
        "purkinje fibers", "blood vessel", "artery", "vein", "capillary",

        # 3. Diagnostic Tests and Imaging
        "ecg", "ekg", "electrocardiogram", "echocardiogram", "echo", "stress test",
        "treadmill test", "angiogram", "cardiac catheterization", "coronary angiography",
        "ct angiography", "mri heart", "holter monitor", "event monitor",
        "cardiac mri", "cardiac ct", "nuclear stress test", "tilt table test",
        "blood pressure monitor", "troponin", "cardiac enzymes",

        # 4. Treatments, Procedures, and Medications
        "angioplasty", "stent", "coronary stent", "bypass surgery", "cabg",
        "pacemaker", "implantable cardioverter defibrillator", "icd",
        "ablation", "valve replacement", "valve repair", "heart transplant",
        "cardioversion", "defibrillation", "thrombolysis", "anticoagulant",
        "warfarin", "heparin", "aspirin", "beta blocker", "ace inhibitor",
        "arb", "statin", "calcium channel blocker", "diuretic", "nitrate",
        "vasodilator", "antiplatelet", "antihypertensive", "digoxin",
        "clopidogrel", "tpa", "nitroglycerin",

        # 5. Symptoms and Clinical Signs
        "chest pain", "palpitations", "shortness of breath", "dyspnea",
        "fatigue", "syncope", "fainting", "edema", "swelling", "cyanosis",
        "orthopnea", "paroxysmal nocturnal dyspnea", "leg swelling",
        "dizziness", "lightheadedness", "heart murmur",

        # 6. Risk Factors and Related Terms
        "smoking", "obesity", "diabetes", "high cholesterol", "hypercholesterolemia",
        "hypertension", "stress", "family history", "age", "male gender",
        "sedentary lifestyle", "poor diet", "alcohol", "metabolic syndrome",

        # 7. Subspecialties and Related Fields
        "interventional cardiology", "electrophysiology", "cardiac surgery",
        "preventive cardiology", "nuclear cardiology", "pediatric cardiology",
        "vascular medicine", "cardiac rehabilitation", "heart rhythm disorders",

        # 8. Acronyms and Common Abbreviations
        "cad", "chf", "afib", "vt", "ecg", "ekg", "cabg", "icd", "sa node",
        "av node", "lv", "rv", "bp", "hdl", "ldl", "tpa", "murmur",
    ]

    dermatology_keywords = [
        # 1. Common Skin Diseases and Conditions
        "acne", "rosacea", "eczema", "dermatitis", "atopic dermatitis", "contact dermatitis",
        "seborrheic dermatitis", "psoriasis", "urticaria", "hives", "alopecia", "hair loss",
        "vitiligo", "melasma", "hyperpigmentation", "hypopigmentation", "lupus",
        "scleroderma", "fungal infection", "tinea", "ringworm", "athlete's foot", "onychomycosis",
        "nail fungus", "warts", "verruca", "molluscum contagiosum", "impetigo", "cellulitis",
        "abscess", "boil", "carbuncle", "hidradenitis suppurativa", "scabies", "lice",
        "shingles", "herpes zoster", "cold sore", "herpes simplex", "chickenpox",
        "basal cell carcinoma", "squamous cell carcinoma", "melanoma", "skin cancer",
        "actinic keratosis", "sunburn", "contact allergy", "drug eruption", "blister",
        "rash", "itching", "pruritus", "hives", "dry skin", "seborrhea", "corn", "callus",
        "eczema herpeticum", "lichen planus", "pityriasis rosea", "dermatophytosis",

        # 2. Skin Anatomy and Structures
        "skin", "epidermis", "dermis", "subcutaneous tissue", "sweat gland", "sebaceous gland",
        "hair follicle", "nail", "melanocyte", "keratinocyte", "collagen", "elastin",
        "pores", "sebaceous glands", "oil gland",

        # 3. Diagnostic Tests and Procedures
        "skin biopsy", "dermoscopy", "woods lamp", "skin scraping", "patch test",
        "skin culture", "allergy test", "trichoscopy", "nail clipping test",
        "histopathology", "microscopy",

        # 4. Treatments, Procedures, and Medications
        "topical steroid", "corticosteroid", "antifungal cream", "antibiotic ointment",
        "moisturizer", "emollient", "retinoid", "benzoyl peroxide", "isotretinoin",
        "accutane", "phototherapy", "laser therapy", "cryotherapy", "electrocautery",
        "skin graft", "chemical peel", "microdermabrasion", "botox", "filler",
        "antihistamine", "antiviral cream", "steroid injection", "coal tar",
        "calcineurin inhibitor", "tacrolimus", "pimecrolimus", "zinc oxide",
        "salicylic acid", "sunscreen", "spf", "uv protection", "hydration cream",

        # 5. Symptoms and Clinical Signs
        "rash", "redness", "itching", "pruritus", "swelling", "blister", "pustule",
        "papule", "nodule", "plaque", "scaling", "flaking", "dryness", "peeling",
        "lesion", "ulcer", "erosion", "crust", "scab", "pigmentation", "spot",
        "mole", "nevus", "lump", "bump", "discoloration", "burn", "scar", "keloid",

        # 6. Cosmetic and Aesthetic Dermatology
        "acne scars", "hyperpigmentation", "chemical peel", "laser resurfacing",
        "anti-aging", "wrinkle", "botox", "filler", "skin rejuvenation",
        "microneedling", "dermal filler", "facial treatment", "skin whitening",
        "pigmentation removal", "tattoo removal",

        # 7. Subspecialties and Related Areas
        "cosmetic dermatology", "pediatric dermatology", "surgical dermatology",
        "immunodermatology", "trichology", "dermatopathology", "venereology",

        # 8. Acronyms and Common Abbreviations
        "spf", "bsa", "uv", "uvb", "uva", "nmsc", "bcc", "scc", "bpo",
    ]

    neurology_keywords = [
        # 1. Neurological Conditions
        "stroke", "migraine", "epilepsy", "seizure", "multiple sclerosis", "ms",
        "parkinson", "parkinson's", "alzheimer", "dementia", "huntington", "als",
        "lou gehrig", "neuropathy", "neuralgia", "radiculopathy", "meningitis",
        "encephalitis", "brain tumor", "glioma", "astrocytoma", "concussion",
        "traumatic brain injury", "tbi", "spinal cord injury", "dystonia", "tremor",
        "headache", "cluster headache", "vertigo", "ataxia", "neuropathic pain",
        "sciatica",

        # 2. Brain and Nervous System Anatomy
        "brain", "spinal cord", "neuron", "nerve", "synapse", "axon", "dendrite",
        "cortex", "cerebellum", "brainstem", "medulla", "thalamus", "hypothalamus",
        "basal ganglia", "frontal lobe", "temporal lobe", "parietal lobe",
        "occipital lobe", "cerebrum", "meninges", "peripheral nervous system",
        "central nervous system", "cns", "pns",

        # 3. Diagnostic Tests and Imaging
        "mri", "ct scan", "eeg", "emg", "nerve conduction study", "lumbar puncture",
        "spinal tap", "brain scan", "neuroimaging", "pet scan", "spect",
        "angiography", "evoked potentials",

        # 4. Treatments, Procedures, and Medications
        "deep brain stimulation", "dbs", "neurorehabilitation", "physiotherapy",
        "antiepileptic", "levodopa", "dopamine agonist", "anticonvulsant",
        "neurologist", "neurosurgery", "neurofeedback", "botulinum toxin",
        "tpa", "thrombolysis",

        # 5. Symptoms and Clinical Signs
        "numbness", "tingling", "weakness", "paralysis", "memory loss", "confusion",
        "tremor", "balance problems", "speech difficulty", "dizziness", "fainting",
        "cognitive decline", "vision changes", "muscle spasm", "coordination loss",

        # 6. Related Subfields
        "neuroscience", "neurobiology", "neuropsychology", "cognitive neuroscience",
        "behavioral neurology", "neuroanatomy", "neurophysiology", "neuropharmacology",

        # 7. Acronyms and Common Abbreviations
        "cns", "pns", "ms", "als", "tbi", "ad", "pd", "eeg", "emg", "dbs",
        "mri", "ct", "lp", "csf", "cva",
    ]

    # Keywords this short are mostly acronyms and must match as whole words.
    max_whole_word_len = 4

    def count_hits(keywords):
        """Count the distinct keywords from ``keywords`` that occur in the query."""
        hits = 0
        # set(): the lists repeat some entries; a repeated keyword must not count twice.
        for keyword in set(keywords):
            escaped = re.escape(keyword)
            if len(keyword) <= max_whole_word_len:
                pattern = rf"(?<!\w){escaped}(?:e?s)?(?!\w)"
            else:
                pattern = rf"(?<!\w){escaped}"
            if re.search(pattern, q):
                hits += 1
        return hits

    # Order doubles as the tie-break priority (same precedence as the old if/elif chain).
    domains = (
        ("cardiology", cardiology_keywords),
        ("dermatology", dermatology_keywords),
        ("Neurology", neurology_keywords),
    )

    best_domain, best_hits = "Neurology", 0
    for domain_name, keywords in domains:
        hits = count_hits(keywords)
        if hits > best_hits:  # strict '>' keeps the earlier domain on ties
            best_domain, best_hits = domain_name, hits

    print(best_domain)
    return [best_domain]


In [None]:
# HyDE generator: load the BioGPT tokenizer and causal LM by Hub id.
# NOTE(review): AutoTokenizer / AutoModelForCausalLM are not imported in any cell
# visible in this notebook — presumably a `from transformers import ...` is
# missing; confirm before relying on Restart & Run All.
hyde_model_name = "microsoft/BioGPT"
hyde_tokenizer = AutoTokenizer.from_pretrained(hyde_model_name)
hyde_model = AutoModelForCausalLM.from_pretrained(hyde_model_name)

def hyde_hypothesis(query, max_new_tokens=160):
    """Draft a hypothetical answer for ``query`` to use as the retrieval text (HyDE).

    The query is wrapped in a fixed instruction prompt, the module-level
    ``hyde_model`` samples a continuation, and only the tokens after the prompt
    are decoded. Progress and the generated text are printed.

    Args:
        query: The user's question.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The stripped generated text, or ``query`` itself when that text is empty.
    """
    prompt = "".join([
        "You are a senior medical expert. Given the following question, create a hypothetical but medically valid answer.\n",
        "Make it structured exactly like this:\n\n",
        "Causes:\n- <cause 1>\n- <cause 2>\n- <cause 3>\n\n",
        "Treatments:\n- <treatment 1>\n- <treatment 2>\n- <treatment 3>\n\n",
        "Follow-up:\n- <test or next step 1>\n- <test or next step 2>\n\n",
        "Summary:\n<2–3 sentence summary>\n\n",
        f"Question: {query}",
    ])

    print("\n🧬 HyDE Model started running...")
    encoded = hyde_tokenizer(prompt, return_tensors="pt")
    prompt_token_count = encoded["input_ids"].shape[1]

    generation = hyde_model.generate(
        **encoded,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_p=0.9,
        temperature=0.7,
        pad_token_id=hyde_tokenizer.eos_token_id,
    )

    # Drop the echoed prompt tokens so only the continuation is decoded.
    continuation = generation[0][prompt_token_count:]
    hypothesis = hyde_tokenizer.decode(continuation, skip_special_tokens=True).strip()

    print("\n=== 🧠 HyDE Hypothesis (Generated) ===")
    if hypothesis:
        print(hypothesis)
    else:
        print("⚠️ No valid output generated.")
    print("======================================\n")

    # An empty generation falls back to the raw query.
    if not hypothesis:
        return query
    return hypothesis

In [None]:
# Flan-T5 tokenizer and seq2seq model; llm_rerank and synthesize_answer both use this pair.
# NOTE(review): AutoTokenizer / AutoModelForSeq2SeqLM are not imported in any cell
# visible in this notebook — presumably a `from transformers import ...` is
# missing; confirm before relying on Restart & Run All.
reranker_model_name = "google/flan-t5-large"
reranker_tokenizer = AutoTokenizer.from_pretrained(reranker_model_name)
reranker_model = AutoModelForSeq2SeqLM.from_pretrained(reranker_model_name)

def _parse_relevance_score(score_text):
    """Return the first number found in ``score_text``, clamped to [0.0, 1.0].

    Returns 0.0 when the text contains no number at all.
    """
    match = re.search(r"-?\d*\.?\d+", score_text)
    if match is None:
        return 0.0
    # The prompt asks for 0.0-1.0; clamp anything the model emits outside that range.
    return min(1.0, max(0.0, float(match.group())))


def llm_rerank(query, candidate_answers):
    """Score each candidate answer with Flan-T5 and return them best-first.

    Every candidate is rated independently: the module-level ``reranker_model``
    is prompted to output a relevance number, which is parsed and clamped to
    [0.0, 1.0]. Replies with no number score 0.0. Progress and the resulting
    score table are printed.

    Args:
        query: The user's question.
        candidate_answers: Iterable of candidate answer strings.

    Returns:
        Tuple ``(results, top)``. ``results`` is a list of
        ``{"answer": str, "score": float}`` dicts sorted by descending score
        (equal scores keep their input order); ``top`` is ``results[0]``.
        For an empty candidate list the result is ``([], None)``.
    """
    results = []
    for ans in candidate_answers:
        prompt = (
            f"Question: {query}\n\nCandidate answer:\n{ans}\n\n"
            "Rate how well this answer addresses the question (0.0–1.0):\n"
            "- 1.0 = complete and correct\n"
            "- 0.5 = partially correct\n"
            "- 0.0 = irrelevant\n"
            "Respond ONLY with the number."
        )
        print("🔍 LLM Re-ranker running...")
        inputs = reranker_tokenizer(prompt, return_tensors="pt", truncation=True)
        outputs = reranker_model.generate(**inputs, max_new_tokens=8)
        score_text = reranker_tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
        results.append({"answer": ans, "score": _parse_relevance_score(score_text)})

    # Nothing to rank: avoid indexing into an empty list below.
    if not results:
        return results, None

    results.sort(key=lambda item: item["score"], reverse=True)
    print("\n=== 📊 Re-Ranker Scores ===")
    for r in results:
        print(f"Score: {r['score']:.2f} | {r['answer'][:90]}...")
    print("===========================\n")

    top = results[0]
    print(f"✅ Top answer selected (Score {top['score']:.2f})\n")
    return results, top



In [None]:
def synthesize_answer(query, passage):
    """Produce the final structured answer from the top-ranked passage.

    Builds a fixed-format instruction prompt around ``query`` and ``passage``
    and runs it through the module-level Flan-T5 pair
    (``reranker_tokenizer`` / ``reranker_model``).

    Args:
        query: The user's question.
        passage: Text of the passage the answer should be based on.

    Returns:
        The decoded, whitespace-stripped model output.
    """
    prompt_parts = [
        "You are a senior medical doctor. Based on the passage, give a structured, patient-friendly answer.\n\n",
        f"Question: {query}\n\nPassage:\n{passage}\n\n",
        "Answer (strict format):\n\n",
        "Causes:\n- ...\n\n",
        "Treatments:\n- ...\n\n",
        "Follow-up:\n- ...\n\n",
        "Summary:\n...",
    ]
    prompt = "".join(prompt_parts)

    encoded = reranker_tokenizer(prompt, return_tensors="pt", truncation=True)
    generated = reranker_model.generate(**encoded, max_new_tokens=400)
    decoded = reranker_tokenizer.decode(generated[0], skip_special_tokens=True)
    return decoded.strip()

In [None]:
def fake_search(_index, _docs, query_text, k):
    """Return the answers of the ``k`` documents nearest to ``query_text``.

    The name is kept for existing callers, but this now performs the real
    lookup: the query is embedded with the module-level ``embedder`` and
    searched against ``_index``. Previously the embedding was computed and then
    ignored, so the first ``k`` documents came back regardless of the query.

    Args:
        _index: FAISS index exposing ``search(vectors, k)``; row ``i`` must
            correspond to ``_docs[i]`` (as produced by ``build_faiss_index``).
        _docs: List of dicts with an "answer" key, aligned with ``_index``.
        query_text: Text to embed and search with.
        k: Maximum number of answers to return.

    Returns:
        List of up to ``k`` answer strings, nearest first. Empty when ``k`` is
        not positive or ``_docs`` is empty.
    """
    if k <= 0 or not _docs:
        return []

    query_emb = embedder.encode([query_text], convert_to_numpy=True)
    # FAISS expects a C-contiguous float32 matrix of shape (n_queries, dim).
    query_emb = np.ascontiguousarray(query_emb, dtype="float32")

    # Never ask for more neighbours than there are documents.
    _distances, neighbor_ids = _index.search(query_emb, min(k, len(_docs)))

    # FAISS pads missing neighbours with -1; skip anything outside the doc list.
    return [
        _docs[int(doc_id)]["answer"]
        for doc_id in neighbor_ids[0]
        if 0 <= doc_id < len(_docs)
    ]


In [None]:
def retrieve_answer(query, k=3, use_hyde=True):
    """Run the full pipeline for one question and return the ranked and final answers.

    Stages, in order: optional HyDE expansion of the query, keyword routing to
    one or more domain DBs, candidate retrieval from each selected DB,
    re-ranking of the candidates against the original query, and synthesis of a
    final answer from the best candidate. Progress is printed along the way.

    Args:
        query: The user's question.
        k: Number of candidates requested from each selected DB.
        use_hyde: When true, retrieval uses the HyDE hypothesis instead of the
            raw query. Routing and re-ranking always use the raw query.

    Returns:
        Dict with "ranked_results" (the list returned by ``llm_rerank``) and
        "final_answer" (a string). When nothing is retrieved, "ranked_results"
        is empty and "final_answer" is a fixed not-found message.
    """
    print("\n==============================")
    print(f"🩺 Query received: {query}")
    print("==============================")

    # Step 1: HyDE (optional) decides which text is used for retrieval.
    if use_hyde:
        query_text = hyde_hypothesis(query)
    else:
        query_text = query

    # Step 2: pick the domain DB(s) from the raw query.
    selected_dbs = route_pipeline(query)
    print(f"📚 Selected DBs: {selected_dbs}\n")

    # Step 3: gather candidates from every selected DB, in routing order.
    candidates = [
        answer
        for db_name in selected_dbs
        for answer in fake_search(*vector_dbs[db_name], query_text, k)
    ]

    if len(candidates) == 0:
        print("⚠️ No results retrieved.")
        return {"ranked_results": [], "final_answer": "No relevant medical information found."}

    # Step 4: re-rank against the original question.
    ranked, best = llm_rerank(query, candidates)

    # Step 5: synthesize the final answer from the winning passage.
    final = synthesize_answer(query, best["answer"])

    print("\n🧾 === FINAL SYNTHESIZED ANSWER ===")
    print(final)
    print("==================================\n")

    return dict(ranked_results=ranked, final_answer=final)

In [None]:
# Demo: dermatology-style question run through the full pipeline.
query = "What is the best treatment for psoriasis?"
results = retrieve_answer(query, k=3, use_hyde=True)

print("Ranked Results:\n")
for r in results["ranked_results"]:
    # Entries built by llm_rerank carry the relevance under the "score" key.
    print(f"Score: {r['score']:.2f} | Answer: {r['answer']}")

print("\nFinal Synthesized Answer:\n")
print(results["final_answer"])

HyDe Model started running...

=== HyDE Hypothesis (Generated) ===
You are a senior medical expert. Given the following question, create a hypothetical but medically valid answer. Make it structured in this exact format: Causes: - < cause 1 > - < cause 2 > - < cause 3 > Treatments: - < treatment 1 > - < treatment 2 > - < treatment 3 > Follow-up: - < test or next step 1 > - < test or next step 2 > Summary: < 2 3 sentence summary > Question: What is the best treatment for psoriasis?

dermatology
LLM Re-ranker started running...
LLM Re-ranker started running...
LLM Re-ranker started running...

=== Re-ranker Scores ===
Score: 1.00 | Psoriasis treatment aims to stop skin cells from growing so quickly and to remov...
Score: 1.00 | Treatment options for psoriasis depend on the severity and extent of the skin in...
Score: 1.00 | Psoriasis is a chronic skin disorder that speeds up the life cycle of skin cells...

Top-ranked answer selected (score 1.00) ✅

Ranked Results:

Score: 1.00 | Answer:

In [None]:
# Demo: cardiology-style question run through the full pipeline.
query = "I have been experiencing chest pain and irregular heartbeat, could this be related to my heart condition?"
results = retrieve_answer(query, k=3, use_hyde=True)

# Each ranked entry is a dict with "answer" and "score" keys, as built by llm_rerank.
print("\nRanked Results:\n")
for r in results["ranked_results"]:
    print(f"Score: {r['score']:.2f} | Answer: {r['answer']}\n")

print("\nFinal Synthesized Answer:\n")
print(results["final_answer"])



🩺 Query received: I have been experiencing chest pain and irregular heartbeat, could this be related to my heart condition?

🧬 HyDE Model started running...

=== 🧠 HyDE Hypothesis (Generated) ===
⚠️ No valid output generated.

cardiology
📚 Selected DBs: ['cardiology']

🔍 LLM Re-ranker running...
🔍 LLM Re-ranker running...
🔍 LLM Re-ranker running...

=== 📊 Re-Ranker Scores ===
Score: 1.00 | hi, thank-you for the brief history.to reduce the retention there should be some muscular ...
Score: 1.00 | degree understand your concerns went through your details. i suggest you not to worry much...
Score: 0.00 | The effects of dimethylformamide exposure on liver and kidney function in the elderly popu...

✅ Top answer selected (Score 1.00)


🧾 === FINAL SYNTHESIZED ANSWER ===
I have been experiencing chest pain and irregular heartbeat, could this be related to my heart condition?


Ranked Results:

Score: 1.00 | Answer: hi, thank-you for the brief history.to reduce the retention there should be 

In [None]:
# Demo: a second cardiology question (left ventricular hypertrophy prognosis).
query = "How does left ventricular hypertrophy influence the prognosis of hypertensive patients?"
results = retrieve_answer(query, k=3, use_hyde=True)

print("Ranked Results:\n")
for r in results["ranked_results"]:
    # Entries built by llm_rerank carry the relevance under the "score" key.
    print(f"Score: {r['score']:.2f} | Answer: {r['answer']}")

print("\nFinal Synthesized Answer:\n")
print(results["final_answer"])

In [None]:
# Demo: a paediatric GI question that matches none of the three indexed specialties,
# to show what the pipeline does with an out-of-domain query.
query = "I have a 15 months baby. She has diarrhea for 10 days doesn t have fever. She is on diet, rice patato, carrot banana, and than the stool is ok. But when I try to give her other food than the diarrhea turns back. What can I do. I give her pills for diarrhea."
results = retrieve_answer(query, k=3, use_hyde=True)

print("Ranked Results:\n")
for r in results["ranked_results"]:
    # Entries built by llm_rerank carry the relevance under the "score" key.
    print(f"Score: {r['score']:.2f} | Answer: {r['answer']}")

print("\nFinal Synthesized Answer:\n")
print(results["final_answer"])

HyDe Model started running...

=== HyDE Hypothesis (Generated) ===
You are a senior medical expert. Given the following question, create a hypothetical but medically valid answer. Make it structured in this exact format: Causes: - < cause 1 > - < cause 2 > - < cause 3 > Treatments: - < treatment 1 > - < treatment 2 > - < treatment 3 > Follow-up: - < test or next step 1 > - < test or next step 2 > Summary: < 2 3 sentence summary > Question: I have a 15 months baby. She has diarrhea for 10 days doesn t have fever. She is on diet, rice patato, carrot banana, and than the stool is ok. But when I try to give her other food than the diarrhea turns back. What can I do. I give her pills for diarrhea.

Neurology
LLM Re-ranker started running...
LLM Re-ranker started running...
LLM Re-ranker started running...

=== Re-ranker Scores ===
Score: 0.00 | When available, the recommended approach to eating difficulties in dementia pati...
Score: 0.00 | After the patient becomes aware of their difficult