In [6]:
import spacy

from scispacy.abbreviation import AbbreviationDetector

special_nlp = spacy.load("en_core_sci_sm")


In [None]:
# Add the abbreviation pipe to the spacy pipeline.
nlp.add_pipe("abbreviation_detector")

doc = nlp("Spinal and bulbar muscular atrophy (SBMA) is an \
           inherited motor neuron disease caused by the expansion \
           of a polyglutamine tract within the androgen receptor (AR). \
           SBMA can be caused by this easily.")

print("Abbreviation", "\t", "Definition")
for abrv in doc._.abbreviations:
	print(f"{abrv} \t ({abrv.start}, {abrv.end}) {abrv._.long_form}")

In [1]:
import spacy
import scispacy

from scispacy.linking import EntityLinker

special_nlp = spacy.load("en_ner_bc5cdr_md")

  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]


In [2]:
special_nlp.add_pipe("scispacy_linker", config={"resolve_abbreviations": True, "linker_name": "umls"})

doc = special_nlp("Spinal and bulbar muscular atrophy (SBMA) is an \
           inherited motor neuron disease caused by the expansion \
           of a polyglutamine tract within the androgen receptor (AR). \
           SBMA can be caused by this easily.")

entity = doc.ents[1]

print("Name: ", entity)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Name:  SBMA


In [36]:
doc = special_nlp('I have been diagnosed with diabetes and hypertension.')
entity = doc.ents
print(entity)

(diabetes, hypertension)


In [37]:
linker = special_nlp.get_pipe("scispacy_linker")
print(entity[0]._.kb_ents)
# for umls_ent in entity._.kb_ents:
# 	print(linker.kb.cui_to_entity[umls_ent[0]])

[('C0011847', 0.9754753708839417), ('C0011849', 0.9754753708839417), ('C0362046', 0.8810040354728699), ('C1263960', 0.8480122685432434), ('C0271650', 0.8144901990890503)]


In [None]:
for cui, score in entity[0]._.kb_ents:
    ent = linker.kb.cui_to_entity[cui]
    print(ent.types,ent.canonical_name, score)
    print('****************')

['T047'] Diabetes 0.9754753708839417
****************
['T047'] Diabetes Mellitus 0.9754753708839417
****************
['T047'] Prediabetes syndrome 0.8810040354728699
****************
['T047'] Diabetes with coma (disorder) 0.8480122685432434
****************
['T047'] Glucose Intolerance (disease) 0.8144901990890503
****************


: 

In [11]:
import json
import pandas as pd

with open('intent_dataset.json', mode = 'rb') as file:
    intents = json.load(file)
data = []
for label, examples in intents.items():
    for ex in examples:
        data.append((ex, label))

df = pd.DataFrame(data, columns=["text", "label"])
df.head()


Unnamed: 0,text,label
0,Why do I feel skin rash?,symptom_check
1,What could cause shortness of breath?,symptom_check
2,I have cough,symptom_check
3,I'm suffering from dizziness,symptom_check
4,Why do I feel nausea?,symptom_check


In [12]:
type(df.label)

pandas.core.series.Series

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['label_id'] = le.fit_transform(df.label)

train_texts, test_texts, train_labels, test_labels = train_test_split(
    df["text"].to_list(), df["label_id"].to_list(), test_size=0.2, random_state=42, stratify = df["label_id"]
)

In [14]:
train_texts

['How much paracetamol should I take?',
 'Best way to reduce stress?',
 'What are the side effects of antibiotics?',
 'Which fruits are good for skin glow?',
 'What should I do to maintain focus?',
 'Best diet for skin glow?',
 'Can you suggest ways to do yoga for back pain?',
 'How can I improve posture?',
 'I keep getting skin rash',
 'What should I eat to boost stamina?',
 'Best diet for diabetes?',
 'Why do I feel joint pain?',
 'Tips for improve sleep?',
 'What should I eat to boost stamina?',
 'Is vitamin D supplements safe for fever?',
 'What could cause fever?',
 'Can I use iron tablets for cold?',
 'What are the side effects of paracetamol?',
 'How much ibuprofen should I take?',
 'Can you suggest ways to reduce stress?',
 "I'm suffering from chest pain",
 'How much paracetamol should I take?',
 'How much paracetamol should I take?',
 'Is cough syrup safe for pain relief?',
 'How much iron tablets should I take?',
 'Is nausea serious?',
 'What could cause back pain?',
 'How mu

In [15]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

train_tokens = tokenizer(train_texts, truncation = True, padding = True)
test_tokens = tokenizer(test_texts, truncation = True, padding = True)

  from .autonotebook import tqdm as notebook_tqdm


In [16]:
train_tokens

{'input_ids': [[101, 2129, 2172, 11498, 3401, 15464, 4747, 2323, 1045, 2202, 1029, 102, 0], [101, 2190, 2126, 2000, 5547, 6911, 1029, 102, 0, 0, 0, 0, 0], [101, 2054, 2024, 1996, 2217, 3896, 1997, 24479, 1029, 102, 0, 0, 0], [101, 2029, 10962, 2024, 2204, 2005, 3096, 8652, 1029, 102, 0, 0, 0], [101, 2054, 2323, 1045, 2079, 2000, 5441, 3579, 1029, 102, 0, 0, 0], [101, 2190, 8738, 2005, 3096, 8652, 1029, 102, 0, 0, 0, 0, 0], [101, 2064, 2017, 6592, 3971, 2000, 2079, 13272, 2005, 2067, 3255, 1029, 102], [101, 2129, 2064, 1045, 5335, 16819, 1029, 102, 0, 0, 0, 0, 0], [101, 1045, 2562, 2893, 3096, 23438, 102, 0, 0, 0, 0, 0, 0], [101, 2054, 2323, 1045, 4521, 2000, 12992, 2358, 27651, 1029, 102, 0, 0], [101, 2190, 8738, 2005, 14671, 1029, 102, 0, 0, 0, 0, 0, 0], [101, 2339, 2079, 1045, 2514, 4101, 3255, 1029, 102, 0, 0, 0, 0], [101, 10247, 2005, 5335, 3637, 1029, 102, 0, 0, 0, 0, 0, 0], [101, 2054, 2323, 1045, 4521, 2000, 12992, 2358, 27651, 1029, 102, 0, 0], [101, 2003, 17663, 1040, 25654, 3

In [17]:
import torch
from torch.utils.data import Dataset, DataLoader

class IntentData(Dataset):
    def __init__(self, tokens, labels):
        self.tokens = tokens
        self.labels = labels
    def __len__(self):
        return len(self.labels)
    def __getitem__(self, index):
        item = {key: torch.tensor(label[index]) for key, label in self.tokens.items()}
        item['labels'] = torch.tensor(self.labels[index])
        return item

train_dataset = IntentData(train_tokens, train_labels)
test_dataset = IntentData(test_tokens, test_labels)

In [18]:
import accelerate, transformers, torch
print("torch:", torch.__version__)
print("transformers:", transformers.__version__)
print("accelerate:", accelerate.__version__)

torch: 2.8.0+cpu
transformers: 4.55.4
accelerate: 1.10.1


In [19]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments
import accelerate

model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=len(le.classes_))

training_args = TrainingArguments(
    output_dir="./mymodel",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=10,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    eval_strategy="epoch" 
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)
trainer.train()



Epoch,Training Loss,Validation Loss
1,0.0069,0.00516
2,0.0029,0.002251
3,0.0023,0.001797




TrainOutput(global_step=300, training_loss=0.114067205345879, metrics={'train_runtime': 137.2488, 'train_samples_per_second': 17.486, 'train_steps_per_second': 2.186, 'total_flos': 8072520019200.0, 'train_loss': 0.114067205345879, 'epoch': 3.0})

In [21]:
model.save_pretrained('./mymodel')
tokenizer.save_pretrained('./mymodel')

('./mymodel\\tokenizer_config.json',
 './mymodel\\special_tokens_map.json',
 './mymodel\\vocab.txt',
 './mymodel\\added_tokens.json')

In [22]:
test_sentence = "Can I take the pill tomorrow ?"
inputs = tokenizer(test_sentence, return_tensors="pt", truncation=True, padding=True)

outputs = model(**inputs)
outputs

SequenceClassifierOutput(loss=None, logits=tensor([[-2.1272, -1.6071,  5.0336, -2.3080]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [23]:
predictions = torch.argmax(outputs['logits'],dim=1)
predictions

tensor([2])

In [24]:
le.inverse_transform(predictions)[0]

'medication_query'

In [25]:
import pandas as pd
df =  pd.read_csv('data/dataset.csv')
df.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,
1,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
2,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
3,Fungal infection,itching,skin_rash,dischromic _patches,,,,,,,,,,,,,,
4,Fungal infection,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,


In [26]:
from collections import defaultdict

symptom_cond_dict_map = defaultdict(list)

for _, row in df.iterrows():
    disease = row['Disease'].strip().lower()
    for col in df.columns[1:]:
        symptom = row[col]
        if pd.notna(symptom):
            symptom_cond_dict_map[symptom.lower()].append(disease)

In [27]:
user_symptoms = ['itching']

In [28]:
from collections import Counter

cond_counter = Counter()
for sym in user_symptoms:
    if sym in symptom_cond_dict_map:
        cond_counter.update(symptom_cond_dict_map[sym])

top_conditions = cond_counter.most_common(3)

In [29]:
top_conditions

[('chronic cholestasis', 114), ('drug reaction', 114), ('jaundice', 114)]

In [30]:
precautions_df = pd.read_csv('data/symptom_precaution.csv')
precautions_df.head()

Unnamed: 0,Disease,Precaution_1,Precaution_2,Precaution_3,Precaution_4
0,Drug Reaction,stop irritation,consult nearest hospital,stop taking drug,follow up
1,Malaria,Consult nearest hospital,avoid oily food,avoid non veg food,keep mosquitos out
2,Allergy,apply calamine,cover area with bandage,,use ice to compress itching
3,Hypothyroidism,reduce stress,exercise,eat healthy,get proper sleep
4,Psoriasis,wash hands with warm soapy water,stop bleeding using pressure,consult doctor,salt baths


In [31]:
recommendations = {}

for _, row in precautions_df.iterrows():
    disease = row['Disease'].strip().lower().replace("-", " ").replace("_", " ")
    precautions = [str(row[col]).strip() for col in precautions_df.columns[1:]]
    recommendations[disease] = precautions

In [32]:
def detect_intent(user_text):
    inputs = tokenizer(user_text, return_tensors="pt", truncation=True, padding=True)
    outputs = model(**inputs)
    pred = torch.argmax(outputs.logits, dim=1).item()
    return le.inverse_transform([pred])[0]

In [33]:
recommendations.items()

dict_items([('drug reaction', ['stop irritation', 'consult nearest hospital', 'stop taking drug', 'follow up']), ('malaria', ['Consult nearest hospital', 'avoid oily food', 'avoid non veg food', 'keep mosquitos out']), ('allergy', ['apply calamine', 'cover area with bandage', 'nan', 'use ice to compress itching']), ('hypothyroidism', ['reduce stress', 'exercise', 'eat healthy', 'get proper sleep']), ('psoriasis', ['wash hands with warm soapy water', 'stop bleeding using pressure', 'consult doctor', 'salt baths']), ('gerd', ['avoid fatty spicy food', 'avoid lying down after eating', 'maintain healthy weight', 'exercise']), ('chronic cholestasis', ['cold baths', 'anti itch medicine', 'consult doctor', 'eat healthy']), ('hepatitis a', ['Consult nearest hospital', 'wash hands through', 'avoid fatty spicy food', 'medication']), ('osteoarthristis', ['acetaminophen', 'consult nearest hospital', 'follow up', 'salt baths']), ('(vertigo) paroymsal  positional vertigo', ['lie down', 'avoid sudden

In [34]:
def extract_symptoms(user_text):
    doc = special_nlp(user_text)
    extracted = [ent.text.lower() for ent in doc.ents]
    return extracted

In [35]:
def generate_precautions(symptoms, top_k=3):
    cond_counter = Counter()
    
    for sym in symptoms:
        variations = [sym, f" {sym}", sym.strip()]
        found = False
        for variation in variations:
            if variation in symptom_cond_dict_map:
                cond_counter.update(symptom_cond_dict_map[variation])
                found = True
                break
        
        if not found:
            print(f"Symptom '{sym}' not found in dictionary")
    
    top_conditions = cond_counter.most_common(top_k)
    plans = {}
    
    for disease, count in top_conditions:
        disease_key = disease.strip().lower().replace("-", " ").replace("_", " ")
        if disease_key in recommendations:
            plans[disease] = {
                'Matched_symptoms': count,
                'Precautions': recommendations[disease_key]
            }
        else:
            plans[disease] = {
                'Matched_symptoms': count,
                'Precautions': ['Consult a doctor or Search in ChatGPT/Google']
            }
    
    return plans

In [36]:
def wellness_assistant(user_text):
    intent = detect_intent(user_text)
    symptoms = extract_symptoms(user_text) if intent == 'symptom_check' else []
    plan = generate_precautions(symptoms) if symptoms else {}
    return intent, symptoms, plan

In [42]:
user_text = 'when i should take the medicine?'
print(wellness_assistant(user_text))

('medication_query', [], {})


In [40]:
def format_wellness_plan(symptoms, plan):
    if not plan:
        return "No clear recommendations. Please consult a doctor."

    response = f"Based on your reported symptoms ({', '.join(symptoms)}), here's a wellness plan:\n\n"
    for cond, details in plan.items():
        response += f"*Possible condition:* **{cond}** (matched symptoms: {details['Matched_symptoms']})\n"
        response += f"Recommendations:\n"
        for i, precaution in enumerate(details['Precautions'], 1):
            if precaution.lower() != 'nan':
                response += f"  {i}. {precaution}\n"
        response += "\n"
    response += "This is not a medical diagnosis. Please consult a doctor if symptoms persist."
    return response

In [None]:
user_text = "when should I take the medicine if I have headache and fever?"
intent, symptoms, plan = wellness_assistant(user_text)
print("Intent:", intent)
print("Symptoms:", symptoms)
print("\nWellness Plan:\n", format_wellness_plan(symptoms, plan))

🔍 Intent: medication_query
🩺 Symptoms: []

📋 Wellness Plan:
 No clear recommendations. Please consult a doctor.
