In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd
file_path='/content/drive/MyDrive/symptoms_specialists.csv'
df=pd.read_csv(file_path)
print(df.head())

   Code                         Name  \
0     1               Panic disorder   
1     2             Vocal cord polyp   
2     3              Turner syndrome   
3     4               Cryptorchidism   
4     5  Ethylene glycol poisoning-1   

                                            Symptoms  \
0  Palpitations, Sweating, Trembling, Shortness o...   
1           Hoarseness, Vocal Changes, Vocal Fatigue   
2  Short stature, Gonadal dysgenesis, Webbed neck...   
3  Absence or undescended testicle(s), empty scro...   
4  Nausea, vomiting, abdominal pain, General mala...   

                                          Treatments      Specialist  
0  Antidepressant medications, Cognitive Behavior...  a psychiatrist  
1       Voice Rest, Speech Therapy, Surgical Removal     audiologist  
2  Growth hormone therapy, Estrogen replacement t...    gynecologist  
3  Observation and monitoring (in cases of mild o...       urologist  
4  Supportive Measures, Gastric Decontamination, ...              a

In [4]:
import spacy
spacy.prefer_gpu()

True

In [5]:
import random
from spacy.training.example import Example
from spacy.util import minibatch
from spacy.training import Example

def convert_to_spacy_format(df):
    data = []
    for i, row in df.iterrows():
        text = row["Symptoms"]
        spans = []
        start = 0
        for sym in text.split(","):
            sym = sym.strip()
            if sym:
                start = text.find(sym, start)
                spans.append((start, start + len(sym)))
                start += len(sym)
        entities = [(start, end, "SYMPTOM") for start, end in spans]
        data.append((text, {"entities": entities}))
    return data

TRAIN_DATA = convert_to_spacy_format(df)

In [6]:
print(TRAIN_DATA)

[('Palpitations, Sweating, Trembling, Shortness of breath, Fear of losing control, Dizziness', {'entities': [(0, 12, 'SYMPTOM'), (14, 22, 'SYMPTOM'), (24, 33, 'SYMPTOM'), (35, 54, 'SYMPTOM'), (56, 78, 'SYMPTOM'), (80, 89, 'SYMPTOM')]}), ('Hoarseness, Vocal Changes, Vocal Fatigue', {'entities': [(0, 10, 'SYMPTOM'), (12, 25, 'SYMPTOM'), (27, 40, 'SYMPTOM')]}), ('Short stature, Gonadal dysgenesis, Webbed neck, Lymphedema', {'entities': [(0, 13, 'SYMPTOM'), (15, 33, 'SYMPTOM'), (35, 46, 'SYMPTOM'), (48, 58, 'SYMPTOM')]}), ('Absence or undescended testicle(s), empty scrotum, smaller or underdeveloped testicle(s), inguinal hernia, abnormal positioning of the testicle(s) (higher in the groin area)', {'entities': [(0, 34, 'SYMPTOM'), (36, 49, 'SYMPTOM'), (51, 88, 'SYMPTOM'), (90, 105, 'SYMPTOM'), (107, 173, 'SYMPTOM')]}), ('Nausea, vomiting, abdominal pain, General malaise, weakness, Increased thirst, frequent urination', {'entities': [(0, 6, 'SYMPTOM'), (8, 16, 'SYMPTOM'), (18, 32, 'SYMPTOM')

In [7]:
nlp = spacy.blank("en")
ner = nlp.add_pipe("ner")
ner.add_label("SYMPTOM")

optimizer = nlp.begin_training()
for itn in range(10):
    random.shuffle(TRAIN_DATA)
    losses = {}
    batches = minibatch(TRAIN_DATA, size=2)
    for batch in batches:
        for text, annotations in batch:
            doc = nlp.make_doc(text)
            example = Example.from_dict(doc, annotations)
            nlp.update([example], losses=losses)
    print(f"Iteration {itn}, Losses: {losses}")

spacy_model_path = "trained_spacy_ner"
nlp.to_disk(spacy_model_path)
print(f"Saved spaCy NER model at {spacy_model_path}")

Iteration 0, Losses: {'ner': np.float32(567.2257)}
Iteration 1, Losses: {'ner': np.float32(15.504209)}
Iteration 2, Losses: {'ner': np.float32(1.1450126)}
Iteration 3, Losses: {'ner': np.float32(8.5726856e-07)}
Iteration 4, Losses: {'ner': np.float32(1.6207263e-08)}
Iteration 5, Losses: {'ner': np.float32(4.9025886e-07)}
Iteration 6, Losses: {'ner': np.float32(5.1719984e-09)}
Iteration 7, Losses: {'ner': np.float32(4.073431e-09)}
Iteration 8, Losses: {'ner': np.float32(8.067036e-09)}
Iteration 9, Losses: {'ner': np.float32(2.4665543e-09)}
Saved spaCy NER model at trained_spacy_ner


In [8]:
import spacy

spacy_model_path = "trained_spacy_ner"
nlp = spacy.load(spacy_model_path)

text = "The patient complained of Palpitations, Sweating, and Dizziness during the night."

doc = nlp(text)

symptoms = [ent.text for ent in doc.ents if ent.label_ == "SYMPTOM"]

print("Extracted Symptoms:", symptoms)


Extracted Symptoms: ['The patient complained of Palpitations', 'Sweating', 'and Dizziness during the night.']


In [10]:
!pip install transformers --upgrade



In [16]:
import torch
print(torch.cuda.is_available())
print(torch.cuda.get_device_name(0))


True
Tesla T4


In [27]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
print(device)

cuda


In [28]:
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from torch.optim import AdamW

tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

specialists = df['Specialist'].unique()
treatments = df['Treatments'].unique()
specialist2id = {s: i for i, s in enumerate(specialists)}
treatment2id = {t: i for i, t in enumerate(treatments)}

df['specialist_id'] = df['Specialist'].map(specialist2id)
df['treatment_id'] = df['Treatments'].map(treatment2id)

class SymptomDataset(Dataset):
    def __init__(self, texts, specialist_labels, treatment_labels):
        self.encodings = tokenizer(texts.tolist(), truncation=True, padding=True, return_tensors="pt")
        self.specialist_labels = torch.tensor(specialist_labels.tolist())
        self.treatment_labels = torch.tensor(treatment_labels.tolist())

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['specialist_labels'] = self.specialist_labels[idx]
        item['treatment_labels'] = self.treatment_labels[idx]
        return item

    def __len__(self):
        return len(self.specialist_labels)

train_texts, val_texts, train_s_labels, val_s_labels, train_t_labels, val_t_labels = train_test_split(
    df['Symptoms'], df['specialist_id'], df['treatment_id'], test_size=0.1, random_state=42)

train_dataset = SymptomDataset(train_texts, train_s_labels, train_t_labels)
val_dataset = SymptomDataset(val_texts, val_s_labels, val_t_labels)

model = AutoModelForSequenceClassification.from_pretrained(
    "emilyalsentzer/Bio_ClinicalBERT", num_labels=len(specialists)
)

optimizer = AdamW(model.parameters(), lr=5e-5)

print(device)
model.train()
loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
for epoch in range(10):
    for batch in loader:
        outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], labels=batch['specialist_labels'])
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        print(device)
    print(f"Epoch {epoch} Loss: {loss.item()}")

bert_model_path = "trained_clinicalbert_specialist"
model.save_pretrained(bert_model_path)
tokenizer.save_pretrained(bert_model_path)


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--emilyalsentzer--Bio_ClinicalBERT/snapshots/d5892b39a4adaed74b92212a44081509db72f87b/config.json
Model config BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.51.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--emilyalsentzer--Bio_ClinicalBERT/snapshots/d5892b39a4adaed74b92212a44081509db72f87b/vocab.txt
loading file to

cuda


Safetensors PR exists


cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
Epoch 0 Loss: 3.6547634601593018
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
Epoch 1 Loss: 2.772618532180786
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
Epoch 2 Loss: 3.2563283443450928
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
cuda
Ep

Configuration saved in trained_clinicalbert_specialist/config.json


cuda
Epoch 9 Loss: 0.14230680465698242


Model weights saved in trained_clinicalbert_specialist/model.safetensors
tokenizer config file saved in trained_clinicalbert_specialist/tokenizer_config.json
Special tokens file saved in trained_clinicalbert_specialist/special_tokens_map.json


('trained_clinicalbert_specialist/tokenizer_config.json',
 'trained_clinicalbert_specialist/special_tokens_map.json',
 'trained_clinicalbert_specialist/vocab.txt',
 'trained_clinicalbert_specialist/added_tokens.json',
 'trained_clinicalbert_specialist/tokenizer.json')

In [32]:
import pandas as pd
import random

samples = [
    {
        "Symptoms": "Palpitations, sweating, trembling, shortness of breath",
        "Specialist": "a psychiatrist",
        "Treatments": "Antidepressants, Cognitive Behavioral Therapy"
    },
    {
        "Symptoms": "Joint pain, stiffness, swelling in the morning",
        "Specialist": "rheumatologist",
        "Treatments": "NSAIDs, DMARDs, physical therapy"
    },
    {
        "Symptoms": "Severe abdominal pain, blood in urine, nausea",
        "Specialist": "urologist",
        "Treatments": "Pain relief, fluid intake, lithotripsy"
    },
    {
        "Symptoms": "Hoarseness, vocal fatigue, vocal changes",
        "Specialist": "audiologist",
        "Treatments": "Voice rest, speech therapy, surgery"
    },
    {
        "Symptoms": "Frequent urination, thirst, weight loss",
        "Specialist": "endocrinologist",
        "Treatments": "Insulin therapy, diet, lifestyle changes"
    },
    {
        "Symptoms": "Webbed neck, short stature, gonadal dysgenesis",
        "Specialist": "gynecologist",
        "Treatments": "Growth hormone, estrogen therapy"
    },
    {
        "Symptoms": "Back pain, fragile bones, stooped posture",
        "Specialist": "orthopedic",
        "Treatments": "Bisphosphonates, calcium supplements"
    },
    {
        "Symptoms": "Fever, sore throat, body ache",
        "Specialist": "general physician",
        "Treatments": "Antipyretics, rest, hydration"
    },
    {
        "Symptoms": "Skin rash, itching, redness",
        "Specialist": "dermatologist",
        "Treatments": "Topical creams, antihistamines"
    },
    {
        "Symptoms": "Seizures, confusion, memory loss",
        "Specialist": "neurologist",
        "Treatments": "Anti-epileptic drugs, MRI scans"
    }
]
test_data = [random.choice(samples) for _ in range(100)]
test_df = pd.DataFrame(test_data)

In [37]:
correct = 0
for text, true_specialist in zip(test_df["Symptoms"], test_df["Specialist"]):
    result = predict_specialist_and_treatment(text)
    if result["Predicted Specialist"] == true_specialist:
        correct += 1
accuracy = correct / len(test_df)
print(f"Test Accuracy: {accuracy * 100:.2f}%")


Test Accuracy: 91.00%


In [47]:
nlp = spacy.load("trained_spacy_ner")
model = AutoModelForSequenceClassification.from_pretrained(bert_model_path)
tokenizer = AutoTokenizer.from_pretrained(bert_model_path)
model.eval()

id2specialist = {v: k for k, v in specialist2id.items()}
id2treatment = {v: k for k, v in treatment2id.items()}

def predict_specialist_and_treatment(text):

    doc = nlp(text)
    symptoms = [ent.text for ent in doc.ents if ent.label_ == "SYMPTOM"]
    if not symptoms:
        symptoms = [text]

    symptoms_text = ", ".join(symptoms)

    inputs = tokenizer(symptoms_text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        output = model(**inputs)

    pred_id = torch.argmax(output.logits, dim=1).item()
    predicted_specialist = id2specialist[pred_id]

    predicted_treatment = df[df["Specialist"] == predicted_specialist]["Treatments"].iloc[0]

    return {
        "Extracted Symptoms": symptoms,
        "Predicted Specialist": predicted_specialist,
        "Suggested Treatment": predicted_treatment
    }


result = predict_specialist_and_treatment("I have severe pain in urine area.")
print(result)


loading configuration file trained_clinicalbert_specialist/config.json
Model config BertConfig {
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17",
    "18": "LABEL_18",
    "19": "LABEL_19",
    "20": "LABEL_20",
    "21": "LABEL_21",
    "22": "LABEL_22",
    "23": "LABEL_23",
    "24": "LABEL_24",
    "25": "LABEL_25",
    "26": "LABEL_26",
    "27": "LABEL_27",
    "28": "LABEL_28",
    "29": "LABEL_29",
    "30": "LABEL_30",
    "31": "LABEL_3

{'Extracted Symptoms': ['I have severe pain in urine area.'], 'Predicted Specialist': 'a doctor', 'Suggested Treatment': 'Blood tests, Supportive Measures, Gastric Decontamination, Antidote Administration, Hemodialysis'}


In [49]:
nlp.to_disk("spacy_ner_model")

model.save_pretrained("clinicalbert_model")
tokenizer.save_pretrained("clinicalbert_model")


Configuration saved in clinicalbert_model/config.json
Model weights saved in clinicalbert_model/model.safetensors
tokenizer config file saved in clinicalbert_model/tokenizer_config.json
Special tokens file saved in clinicalbert_model/special_tokens_map.json


('clinicalbert_model/tokenizer_config.json',
 'clinicalbert_model/special_tokens_map.json',
 'clinicalbert_model/vocab.txt',
 'clinicalbert_model/added_tokens.json',
 'clinicalbert_model/tokenizer.json')

In [50]:
import shutil

shutil.make_archive("hybrid_model", 'zip', root_dir=".", base_dir="spacy_ner_model")
shutil.make_archive("clinicalbert_model", 'zip', root_dir=".", base_dir="clinicalbert_model")


'/content/clinicalbert_model.zip'

In [53]:
import os
os.listdir('/content')

['.config',
 'hybrid_model.zip',
 'clinicalbert_model',
 'trained_spacy_ner',
 'clinicalbert_model.zip',
 'drive',
 'trained_clinicalbert_specialist',
 'spacy_ner_model',
 'sample_data']

In [54]:
from google.colab import files

files.download("hybrid_model.zip")

files.download("clinicalbert_model.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>