In [16]:
import pandas as pd

In [17]:
df=pd.read_csv("/content/Disease-Info SIH Dataset.csv")

In [18]:
df.head(3)

Unnamed: 0,Disease,Cause,Symptoms,Precautions,Treatment,Severity,References
0,Panic disorder,"Genetic predisposition, neurobiologic differen...","Recurrent panic attacks: sudden intense fear, ...",Avoid stimulants; screen for medical causes; e...,"CBT, SSRIs/SNRIs, short-term benzodiazepines, ...","Variable: mild to severe, disabling without tr...",https://www.nimh.nih.gov/health/statistics/pan...
1,Vocal cord polyp,"Vocal fold trauma/overuse, smoking, reflux, ch...","Hoarseness, breathy voice, vocal fatigue, redu...","Voice rest, avoid smoking/irritants, treat ref...","Voice therapy, treat reflux, surgical excision...",Mild–moderate but can impair voice quality,https://www.mayoclinic.org/diseases-conditions...
2,Turner syndrome,"Monosomy X (45,X) or mosaic variants","Short stature, webbed neck, lymphedema, ovaria...","Cardiac monitoring, growth/puberty follow-up, ...","Growth hormone, estrogen replacement, manage a...",Variable: mild to severe depending on anomalies,https://www.nhs.uk/conditions/turner-syndrome;...


In [19]:
df["Disease"]=df["Disease"].str.lower()
df.head(2)

Unnamed: 0,Disease,Cause,Symptoms,Precautions,Treatment,Severity,References
0,panic disorder,"Genetic predisposition, neurobiologic differen...","Recurrent panic attacks: sudden intense fear, ...",Avoid stimulants; screen for medical causes; e...,"CBT, SSRIs/SNRIs, short-term benzodiazepines, ...","Variable: mild to severe, disabling without tr...",https://www.nimh.nih.gov/health/statistics/pan...
1,vocal cord polyp,"Vocal fold trauma/overuse, smoking, reflux, ch...","Hoarseness, breathy voice, vocal fatigue, redu...","Voice rest, avoid smoking/irritants, treat ref...","Voice therapy, treat reflux, surgical excision...",Mild–moderate but can impair voice quality,https://www.mayoclinic.org/diseases-conditions...


# Database

#### Generating Documents

In [20]:
!pip install -q sentence-transformers faiss-cpu tqdm

In [21]:
#Creating documents for Vector Database. Contents of document are embedded and metadata is used for sorting and searching.
from langchain.docstore.document import Document

In [22]:
content_columns=["Cause","Symptoms","Precautions","Treatment"]
docs=[]
for idx, row in df.iterrows():
  for col in content_columns:
    content=str(row[col]).strip()
    doc=Document(
        page_content=content,
        metadata={
            "Disease": row.get("Disease"),
            "field": col,
            "Severity": row.get("Severity",""),
            "References": row.get("References","")
          }
        )
    docs.append(doc)

In [23]:
docs[0:2]

[Document(metadata={'Disease': 'panic disorder', 'field': 'Cause', 'Severity': 'Variable: mild to severe, disabling without treatment', 'References': 'https://www.nimh.nih.gov/health/statistics/panic-disorder; https://my.clevelandclinic.org/health/diseases/4451-panic-disorder'}, page_content='Genetic predisposition, neurobiologic differences, life stressors, comorbid psychiatric/medical conditions'),
 Document(metadata={'Disease': 'panic disorder', 'field': 'Symptoms', 'Severity': 'Variable: mild to severe, disabling without treatment', 'References': 'https://www.nimh.nih.gov/health/statistics/panic-disorder; https://my.clevelandclinic.org/health/diseases/4451-panic-disorder'}, page_content='Recurrent panic attacks: sudden intense fear, palpitations, chest pain, shortness of breath, dizziness, sweating, trembling, derealization')]

In [24]:
len(docs)

2696

#### Embeddings

In [25]:
!pip install langchain-community langchain



In [26]:
from langchain.vectorstores import FAISS #database
from langchain.embeddings import HuggingFaceEmbeddings #word to vector

In [27]:
embedding_model="sentence-transformers/all-MiniLM-L6-v2" #fast and simple. (Switch with qwen's own embeddings for better responses, but heavier and requires gpu usage)
embeddings=HuggingFaceEmbeddings(model_name=embedding_model) #load embeddings from the model

  embeddings=HuggingFaceEmbeddings(model_name=embedding_model) #load embeddings from the model
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [28]:
#create database
vector_db=FAISS.from_documents(docs,embeddings)

In [29]:
query="What are the symptoms, causes and treatments, precautions of Malaria"
result=vector_db.similarity_search(query,k=10)

In [30]:
result[0]

Document(id='c64cf768-d7c6-4e3f-9803-e0bbfadb237a', metadata={'Disease': 'malaria', 'field': 'Cause', 'Severity': 'Mild to severe; can be fatal if untreated', 'References': 'https://www.who.int/news-room/fact-sheets/detail/malaria'}, page_content='Infection by *Plasmodium* parasites transmitted by Anopheles mosquitoes')

In [31]:
data=""
for i in range(len(result)):
  data+=f"""{result[i].metadata}
  field info: {result[i].page_content}\n"""

In [32]:
print(data)

{'Disease': 'malaria', 'field': 'Cause', 'Severity': 'Mild to severe; can be fatal if untreated', 'References': 'https://www.who.int/news-room/fact-sheets/detail/malaria'}
  field info: Infection by *Plasmodium* parasites transmitted by Anopheles mosquitoes
{'Disease': 'dengue fever', 'field': 'Precautions', 'Severity': 'Usually mild; can become severe (hemorrhagic, shock) if untreated', 'References': 'https://www.who.int/news-room/fact-sheets/detail/dengue-and-severe-dengue'}
  field info: Prevent mosquito bites; reduce mosquito breeding; use repellents; early recognition of symptoms
{'Disease': 'parasitic disease', 'field': 'Symptoms', 'Severity': 'Mild to severe depending on parasite', 'References': 'https://www.who.int/news-room/fact-sheets/detail/soil-transmitted-helminth-infections'}
  field info: Fever, diarrhea, weight loss, abdominal pain, anemia depending on parasite
{'Disease': 'pulmonary eosinophilia', 'field': 'Precautions', 'Severity': 'Variable: mild (self‑limiting) to s

In [33]:
len(data)

3032

In [34]:
#search function
def search(query):
  result=vector_db.similarity_search(query,k=3)
  data=""
  for i in range(len(result)):
    data+=f"""{result[i].metadata}
    field info: {result[i].page_content}\n"""
  return data

# Qwen

In [35]:
!pip install transformers accelerate



In [36]:
!pip uninstall -y bitsandbytes
!pip install -U bitsandbytes
#for quantization

# pip uninstall -y bitsandbytes
# pip install --no-cache-dir bitsandbytes==0.43.3


Found existing installation: bitsandbytes 0.47.0
Uninstalling bitsandbytes-0.47.0:
  Successfully uninstalled bitsandbytes-0.47.0
Collecting bitsandbytes
  Using cached bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl.metadata (11 kB)
Using cached bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl (61.3 MB)
Installing collected packages: bitsandbytes
Successfully installed bitsandbytes-0.47.0


In [37]:
!pip install transformers_stream_generator
#to run qwen's custom code



In [38]:
from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
import torch
from transformers import BitsAndBytesConfig

In [39]:
model_name="Qwen/Qwen-7B"
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,   # enable 4-bit
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

model=AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    trust_remote_code=True,
    quantization_config=bnb_config
)
tokenizer=AutoTokenizer.from_pretrained(model_name)



Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

The repository Qwen/Qwen-7B contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co/Qwen/Qwen-7B .
 You can inspect the repository content at https://hf.co/Qwen/Qwen-7B.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


In [40]:
def response(query):
  context=search(query)
  prompt=f"""
You are a healthcare assistant. You will be given trusted medical context about a disease.
Using only the provided context, generate a response in the following strict format:


  "Causes": "bullet point 1", "bullet point 2", "...",
  "Symptoms": "bullet point 1", "bullet point 2", "...",
  "Treatment": "bullet point 1", "bullet point 2", "...",
  "Precautions": "bullet point 1", "bullet point 2", "...",
  "References": "URLs or source IDs"


Rules:
1. Do not invent information. If data is missing in the context, put ["Not available in the database."] for that field.
2. Do not add extra text and formatting.
3. The References field must always include at least one source link or retriever ID if any disease-specific info is provided.
4. Keep each bullet short, clear, and user-friendly.
5. Always end your response with:
    Note: This information is for educational purposes only. Please consult a qualified healthcare professional for medical advice.


Question: {query
           }

context: {context}

Answer:
"""
  inputs=tokenizer(prompt,return_tensors="pt").to(device)
  outputs=model.generate(**inputs,max_new_tokens=256, pad_token_id=tokenizer.eos_token_id,use_cache=False )
  res=tokenizer.decode(outputs[0],skip_special_tokens=True)
  return res.split("Answer:")[1]

In [41]:
# query="Tell me about dengu"
# res=response(query)
# res

In [42]:
# print(res.split("Answer:")[1])

# IndicTrans2

In [43]:
from transformers import AutoModelForSeq2SeqLM
from transformers import AutoTokenizer
import torch
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [44]:
regional_model_name="ai4bharat/indictrans2-en-indic-dist-200M"
english_model_name="ai4bharat/indictrans2-indic-en-dist-200M"
regional_model=AutoModelForSeq2SeqLM.from_pretrained(regional_model_name,trust_remote_code=True,torch_dtype=torch.float16).to(device)
english_model=AutoModelForSeq2SeqLM.from_pretrained(english_model_name,trust_remote_code=True,torch_dtype=torch.float16).to(device)
regional_tokenizer=AutoTokenizer.from_pretrained(regional_model_name,trust_remote_code=True)
english_tokenizer=AutoTokenizer.from_pretrained(english_model_name,trust_remote_code=True)

`torch_dtype` is deprecated! Use `dtype` instead!


In [45]:
!pip install IndicTransToolkit



In [46]:
from IndicTransToolkit.processor import IndicProcessor

#### Language detection

In [47]:
!pip install fasttext
!wget https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin

--2025-09-18 13:37:45--  https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 54.240.184.91, 54.240.184.75, 54.240.184.92, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|54.240.184.91|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 131266198 (125M) [application/octet-stream]
Saving to: ‘lid.176.bin.1’


2025-09-18 13:37:46 (145 MB/s) - ‘lid.176.bin.1’ saved [131266198/131266198]



In [48]:
!pip install "numpy<2.0"



In [49]:
import fasttext
ld_model=fasttext.load_model("lid.176.bin")

In [50]:
lang_map = {
    "as": "asm_Beng",   # Assamese
    "bn": "ben_Beng",   # Bengali
    "bo": "bod_Tibt",   # Bodo (Tibetan script sometimes used)
    "gu": "guj_Gujr",   # Gujarati
    "hi": "hin_Deva",   # Hindi
    "kn": "kan_Knda",   # Kannada
    "ks": "kas_Arab",   # Kashmiri (Arabic script)
    "kok": "kok_Deva",  # Konkani
    "mai": "mai_Deva",  # Maithili
    "ml": "mal_Mlym",   # Malayalam
    "mr": "mar_Deva",   # Marathi
    "ne": "npi_Deva",   # Nepali
    "or": "ory_Orya",   # Odia
    "pa": "pan_Guru",   # Punjabi
    "sa": "san_Deva",   # Sanskrit
    "sat": "sat_Olck",  # Santali (Ol Chiki script)
    "sd": "snd_Arab",   # Sindhi (Arabic script)
    "ta": "tam_Taml",   # Tamil
    "te": "tel_Telu",   # Telugu
    "ur": "urd_Arab",   # Urdu
    "en": "eng_Latn",   # English
}
def detect_language(text):
  labels,_=ld_model.predict(text,k=1)
  lang_code = labels[0].replace("__label__", "")  # e.g., "en", "hi", "bn"
  src_lang=lang_map.get(lang_code,"eng_Latn")
  return src_lang

In [51]:
text="Hii, this english language."
print(detect_language(text))

eng_Latn


#### Translate

In [52]:
def translate_to_english(text):
  ip = IndicProcessor(inference=True)
  src_lang=detect_language(text)
  if src_lang == "eng_Latn":
    return text
  else:
    batch = ip.preprocess_batch([text], src_lang=src_lang, tgt_lang="eng_Latn")
    inputs=english_tokenizer(batch, return_tensors="pt",padding=True, truncation=True).to(device)
    outputs=english_model.generate(**inputs, max_length=256,num_beams=5,use_cache=False)
    generated_tokens=english_tokenizer.batch_decode(outputs,skip_special_tokens=True,clean_up_tokenization_spaces=True)
    trans_text=ip.postprocess_batch(generated_tokens, lang="eng_Latn")
    return trans_text[0]

In [53]:
# text = "नमस्ते! मेरा नाम रोहित है और मैं इंदौर से हूँ। मैं इंजीनियरिंग की पढ़ाई कर रहा हूँ और आर्टिफिशियल इंटेलिजेंस में विशेष रुचि रखता हूँ। मुझे मशीन लर्निंग और डाटा साइंस प्रोजेक्ट्स पर काम करना बहुत पसंद है।"
# print(translate_to_english(text))


In [54]:
def translate_to_regional(text,response):
  ip = IndicProcessor(inference=True)
  tgt_lang=detect_language(text)
  if tgt_lang == "eng_Latn":
    return response
  else:
    batch = ip.preprocess_batch([response], src_lang="eng_Latn", tgt_lang=tgt_lang)
    inputs=regional_tokenizer(batch, return_tensors="pt",padding=True, truncation=True).to(device)
    outputs=regional_model.generate(**inputs, max_length=256,num_beams=5,use_cache=False)
    generated_tokens=regional_tokenizer.batch_decode(outputs,skip_special_tokens=True,clean_up_tokenization_spaces=True)
    trans_text=ip.postprocess_batch(generated_tokens, lang=tgt_lang)
    return trans_text[0]

In [55]:
# res='Hi! My name is Rohit and I am from Indore. I am studying engineering and have a special interest in Artificial Intelligence. I love working on machine learning and data science projects.'
# text = "नमस्ते! मेरा नाम रोहित है और मैं इंदौर से हूँ। मैं इंजीनियरिंग की पढ़ाई कर रहा हूँ और आर्टिफिशियल इंटेलिजेंस में विशेष रुचि रखता हूँ। मुझे मशीन लर्निंग और डाटा साइंस प्रोजेक्ट्स पर काम करना बहुत पसंद है।"
# print(translate_to_regional(text,res))

# FinaL Function

In [56]:
def final(text):
  prompt=translate_to_english(text)
  res = response(prompt)
  result=translate_to_regional(text,res)
  return result

In [57]:
# text="What are the symptoms of malaria?" #english
# print(final(text))

In [58]:
# text="ملیریا کی علامات کیا ہیں؟" #urdu
# print(final(text))

urdu text:
- وجوہات: ملیریا ایک پرجیویوں کی وجہ سے ہوتا ہے جو متاثرہ خاتون اینوفیلز مچھروں کے کاٹنے سے انسانوں میں منتقل ہوتا ہے۔-علامات:-ہلکی صورتوں میں، بخار، سر درد، سردی اور پٹھوں میں درد موجود ہو سکتا ہے۔-شدید صورتوں میں، بخار زیادہ ہو سکتا ہے، جس کی وجہ سے مایوسی، الجھاو، یا کوما ہو سکتا ہے۔-مزید برآں، کچھ لوگوں کو ملیریا کی شدید شکل کا تجربہ ہو سکتا ہے جسے "شدید ملیریا" کہا جاتا ہے، جو سانس کی تکلیف، گردے کی ناکامی اور یہاں تک کہ موت کا باعث بن سکتا ہے۔-شدید ملیریا کی علامات میں شامل ہو سکتے ہیں:-بخار، سردی، کھانسی، سانس لینے میں دشواری، اور سر درد۔-متلی، الٹی، پیٹ میں درد، اور اسہال بھی ہو سکتا ہے۔-شدت: ملیریا شدید ہو سکتا ہے اور خاص طور پر امیونائزڈ افراد میں زندگی کو خطرے میں ڈال سکتا ہے۔-حفاظتی اقدامات میں شامل ہیں۔ لمبی نیند کے کپڑے، نیند کے جال کا استعمال۔

In [59]:
# text="मलेरिया के लक्षण क्या हैं?" #hindi
# print(final(text))

Hindi text:
मलेरिया के लक्षणः 1. बुखारः यह सबसे आम लक्षण है और 40 डिग्री सेल्सियस (104 डिग्री फ़ारेनहाइट) तक भी बहुत अधिक हो सकता है।
2. ठंड लगना और कांपनाः ये तेज बुखार के कारण होता है।
3. सिरदर्दः यह एक आम लक्षण है।
4. मांसपेशियों में दर्द और दर्दः इसे अफ्रीका में'किकविट'कहा जाता है, और यह लाल रक्त कोशिकाओं के विनाश के कारण होता है।
5. थकानः यह एक और सामान्य लक्षण है।
6. मतली और उल्टीः ये हो सकते हैं।
7. दानेः हाथों और पैरों पर एक सपाट, लाल दाने दिखाई दे सकते हैं।
8. पेट दर्दः यह तब हो सकता है जब परजीवी यकृत या प्लीहा को प्रभावित करता है।
9. गंभीर मामलों में, एक रोगी को दौरे पड़ सकते हैं, कई अंगों में दर्द हो सकता है, और 10 कोमा हो सकता है। अन्य लक्षणों में पसीने आना, नींद आने से रोकने के लिए दवाएं शामिल हैं।

In [60]:
# text="ମଲେରିଆର ଲକ୍ଷଣ କ’ଣ କ’ଣ?" #odia
# print(final(text))

Odia text:
ମ୍ଯ଼ାଲେରିଆର ଲକ୍ଷଣଗୁଡ଼ିକ ରୋଗର ତୀବ୍ରତା ଉପରେ ନିର୍ଭର କରେ। ପରଜୀବୀ ଉପରେ ନିର୍ଭର କରି ସାମାନ୍ଯ଼ରୁ ଗୁରୁତରଃ ଜ୍ୱର, ଡାଇରିଆ, ଓଜନ ହ୍ରାସ, ପେଟ ଯନ୍ତ୍ରଣା, ପରଜୀବୀ ଉପରେ ନିର୍ଭର କରି ରକ୍ତହୀନତା ଗୁରୁତର ଏବଂ ଜୀବନ ପ୍ରତି ବିପଦଜନକ ହୋଇପାରେ, ବିଶେଷ କରି ରୋଗ ପ୍ରତିରୋଧକ ଶକ୍ତିରେଃ ଜ୍ୱର, ମୁଣ୍ଡବିନ୍ଧା, କାଶ, ନିଶ୍ୱାସ ପ୍ରଶ୍ୱାସ ନେବାରେ ଅସୁବିଧା, ମେନିଞ୍ଜାଇଟିସ୍ ଲକ୍ଷଣ (ବେକରେ କଠୋରତା, ଦ୍ୱନ୍ଦ୍ୱ) ଟିପ୍ପଣୀଃ ଏହି ସୂଚନା କେବଳ ଶିକ୍ଷାଗତ ଉଦ୍ଦେଶ୍ଯ଼ରେ। ଦଯ଼ାକରି ଡାକ୍ତରୀ ପରାମର୍ଶ ପାଇଁ ଜଣେ ଯୋଗ୍ଯ଼ ସ୍ୱାସ୍ଥ୍ଯ଼ସେବା ବୃତ୍ତିଗତଙ୍କ ପରାମର୍ଶ ନିଅନ୍ତୁ।

# Gradio

In [61]:
import gradio as gr

def gr_func(text, chat_history):
    res = final(text)
    chat_history.append((text, res))
    return "", chat_history

with gr.Blocks(css=".gradio-container {background-color: Grey}") as demo:

    chatbot = gr.Chatbot(label="QwenVeda")

    with gr.Row():
        text_input = gr.Textbox(
            label="Enter Query:",
            placeholder="Ask about any disease or medical condition...",
        )
        clear_btn = gr.ClearButton([text_input, chatbot], value="Clear Chat")

    text_input.submit(
        gr_func,
        inputs=[text_input, chatbot],
        outputs=[text_input, chatbot]
    )
demo.launch(debug=True)

  chatbot = gr.Chatbot(label="QwenVeda")


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://82afcfd44402da1bc1.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://82afcfd44402da1bc1.gradio.live


