In [2]:
import os
import xml.etree.ElementTree as ET
import pandas as pd

In [3]:
# Root folder of the QA dataset. A raw string keeps every backslash literal:
# in a normal string, sequences such as "\T", "\d" and "\M" are invalid escapes
# (a warning on recent Python versions), and a folder name starting with e.g.
# "n" or "t" would silently turn into a newline/tab.
base_dir = r"C:\Task2_Medical_QA_Chatbot\data\MedQA_QA_pair"

# Parallel lists: questions[i] is answered by answers[i].
questions, answers = [], []

In [4]:
# Walk every sub-folder of base_dir and collect one (question, answer) pair from
# each *.xml file into the parallel `questions` / `answers` lists.
#
# NOTE(review): only the FIRST <Question>/<Answer> element found in each file is
# used. If a file can hold several QA pairs, the rest are ignored -- confirm
# against the dataset layout.

# Start from empty lists so that re-running this cell does not append duplicates.
questions.clear()
answers.clear()

# (path, reason) for every XML file that could not be read or parsed.
skipped_files = []

for folder in os.listdir(base_dir):
    folder_path = os.path.join(base_dir, folder)
    if not os.path.isdir(folder_path):
        continue
    for file in os.listdir(folder_path):
        if not file.endswith(".xml"):
            continue
        file_path = os.path.join(folder_path, file)
        try:
            root = ET.parse(file_path).getroot()
        except (ET.ParseError, OSError) as exc:
            # Best-effort load: remember the failure instead of hiding it, but
            # let KeyboardInterrupt and programming errors propagate.
            skipped_files.append((file_path, repr(exc)))
            continue
        # findtext() yields None for a missing element and "" for an empty one,
        # so a file lacking either tag is simply skipped.
        question = (root.findtext(".//Question") or "").strip()
        answer = (root.findtext(".//Answer") or "").strip()
        # Strip before testing so whitespace-only text is not stored as "".
        if question and answer:
            questions.append(question)
            answers.append(answer)

if skipped_files:
    print(f"Skipped {len(skipped_files)} unreadable XML file(s); see `skipped_files`.")


In [5]:
# Assemble the harvested pairs into a two-column frame and preview it.
qa_columns = {'question': questions, 'answer': answers}
df = pd.DataFrame(qa_columns)

print("Total QA pairs:", df.shape[0])
df.head(n=5)

Total QA pairs: 5482


Unnamed: 0,question,answer
0,What is (are) Adult Acute Lymphoblastic Leukem...,Key Points\n - Adult acute ...
1,What is (are) Adult Acute Myeloid Leukemia ?,Key Points\n - Adult acute ...
2,What is (are) Chronic Lymphocytic Leukemia ?,Key Points\n - Chronic lymp...
3,What is (are) Chronic Myelogenous Leukemia ?,Key Points\n - Chronic myel...
4,What is (are) Hairy Cell Leukemia ?,Key Points\n - Hairy cell l...


In [6]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Load the sentence-embedding model, then embed every stored question once so
# that later queries only need to embed the user's text.
model = SentenceTransformer('all-MiniLM-L6-v2')
corpus_questions = df['question'].tolist()
question_embeddings = model.encode(
    corpus_questions,
    convert_to_tensor=True,
)

In [8]:
def _to_cpu_numpy(embeddings):
    """Return ``embeddings`` as a host-side NumPy array.

    Tensor-like objects (anything exposing ``detach``) are detached and moved
    to the CPU first; everything else is passed through ``np.asarray``.
    """
    if hasattr(embeddings, "detach"):
        embeddings = embeddings.detach().cpu().numpy()
    return np.asarray(embeddings)


def get_best_answer(user_question, top_k=1):
    """Retrieve the stored QA pairs whose question is most similar to the query.

    Args:
        user_question: Free-text question to look up.
        top_k: Maximum number of matches to return (default 1).

    Returns:
        A list of ``(question, answer, score)`` tuples ordered from most to
        least similar, where ``score`` is the cosine similarity between the
        query embedding and the stored question embedding.
    """
    # scikit-learn works on NumPy arrays; a tensor living on an accelerator
    # cannot be converted implicitly, so move both operands to the host first.
    user_embedding = _to_cpu_numpy(model.encode([user_question], convert_to_tensor=True))
    corpus_embeddings = _to_cpu_numpy(question_embeddings)

    # One query row -> take row 0 of the (1, n_questions) similarity matrix.
    scores = cosine_similarity(user_embedding, corpus_embeddings)[0]
    # argsort is ascending; reverse for best-first, then keep the top_k.
    top_idx = np.argsort(scores)[::-1][:top_k]
    return [(df['question'].iloc[i], df['answer'].iloc[i], scores[i]) for i in top_idx]


In [9]:
import re

In [10]:
# Keyword lexicons for the rule-based entity tagger, one list per label.
diseases = [
    "leukemia",
    "diabetes",
    "asthma",
    "cancer",
    "flu",
]
treatments = [
    "chemotherapy",
    "radiation",
    "immunotherapy",
    "surgery",
    "transplant",
]
symptoms = [
    "fever",
    "cough",
    "pain",
    "fatigue",
    "headache",
]


In [11]:
def basic_entity_recognition(text, lexicon=None):
    """Tag known keywords in ``text`` using whole-word, case-insensitive matching.

    Args:
        text: String to scan. ``None`` or an empty string yields no entities.
        lexicon: Optional mapping of entity label -> iterable of keywords.
            When omitted, the module-level ``diseases``, ``treatments`` and
            ``symptoms`` lists are used with the labels ``"DISEASE"``,
            ``"TREATMENT"`` and ``"SYMPTOM"``.

    Returns:
        A list of ``(keyword, label)`` tuples, grouped by label in lexicon
        order and, within a label, in keyword-list order. Each keyword is
        reported at most once, however often it occurs in ``text``.
    """
    if not text:
        return []
    if lexicon is None:
        # Looked up at call time so edits to the module-level lists take effect.
        lexicon = {
            "DISEASE": diseases,
            "TREATMENT": treatments,
            "SYMPTOM": symptoms,
        }

    found = []
    for label, words in lexicon.items():
        for word in words:
            # re.escape keeps keywords containing regex metacharacters literal.
            if re.search(rf"\b{re.escape(word)}\b", text, re.IGNORECASE):
                found.append((word, label))
    return found

In [12]:
# Sample query passed to get_best_answer() to demonstrate retrieval.
user_q = "How is leukemia treated?"

In [13]:
# Retrieve the single closest stored question and unpack its fields.
results = get_best_answer(user_q, top_k=1)
best_match = results[0]
match_q, ans, score = best_match

In [14]:
# Report the retrieved pair together with its similarity, rounded to 3 places.
report_rows = (
    ("Matched Question:", match_q),
    ("Answer:", ans),
    ("Similarity:", round(score, 3)),
)
for heading, value in report_rows:
    print(heading, value)

Matched Question: What is (are) Leukemia ?
Answer: Leukemia is cancer of the white blood cells. White blood cells help your body fight infection. Your blood cells form in your bone marrow. In leukemia, the bone marrow produces abnormal white blood cells. These cells crowd out the healthy blood cells, making it hard for blood to do its work.    There are different types of leukemia, including       - Acute lymphocytic leukemia    - Acute myeloid leukemia    - Chronic lymphocytic leukemia    - Chronic myeloid leukemia       Leukemia can develop quickly or slowly. Chronic leukemia grows slowly. In acute leukemia, the cells are very abnormal and their number increases rapidly. Adults can get either type; children with leukemia most often have an acute type. Some leukemias can often be cured. Other types are hard to cure, but you can often control them. Treatments may include chemotherapy, radiation and stem cell transplantation. Even if symptoms disappear, you might need therapy to prevent

In [15]:
# Tag the retrieved answer and list each detected keyword with its label.
entities = basic_entity_recognition(ans)

print("\nEntities:")
for pair in entities:
    ent, label = pair
    print(f"{ent} — {label}")


Entities:
leukemia — DISEASE
cancer — DISEASE
chemotherapy — TREATMENT
radiation — TREATMENT
