In [2]:
import json
import torch
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

# Load the raw records. Only the "Symptoms" (list of strings) and "Disease"
# keys of each entry are read in this cell.
# JSON is UTF-8 by specification, so the encoding is pinned instead of being
# left to the platform default, which is not UTF-8 on every OS.
with open("dataset.json", encoding="utf-8") as f:
    data = json.load(f)

# One comma-separated symptom string per record, in dataset order.
symptoms_list = [", ".join(entry["Symptoms"]) for entry in data]
diseases = [entry["Disease"] for entry in data]

# Encode disease names as integer class ids for the classifier.
label_encoder = LabelEncoder()
disease_labels = label_encoder.fit_transform(diseases)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
class SymptomDataset(Dataset):
    """Torch dataset that tokenizes symptom strings on demand.

    Args:
        texts: Indexable collection of symptom strings.
        labels: Indexable collection of integer class ids aligned with ``texts``.
        tokenizer: Callable invoked with the text plus ``truncation``,
            ``padding``, ``max_length`` and ``return_tensors='pt'``; it must
            return a mapping with ``input_ids`` and ``attention_mask`` tensors.
        max_len: Length every sequence is padded / truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=64):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return its model inputs and label.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (batch axis removed)
            and ``labels`` as a ``torch.long`` tensor.
        """
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt',
        )
        # squeeze(0) removes only the leading batch axis. A bare squeeze()
        # would also drop the sequence axis whenever max_len == 1, producing
        # 0-d tensors that the DataLoader would collate to the wrong shape.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(label, dtype=torch.long)
        }

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Hold out 20% of the records for validation. The split is seeded so that a
# kernel restart followed by "Run All" reproduces the same partition.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    symptoms_list, disease_labels, test_size=0.2, random_state=42
)

train_dataset = SymptomDataset(train_texts, train_labels, tokenizer)
val_dataset = SymptomDataset(val_texts, val_labels, tokenizer)

# Only the training batches are shuffled; validation keeps dataset order.
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)


In [4]:
from torch.optim import AdamW

# Classification model with one output per encoded disease class.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(label_encoder.classes_),
)
optimizer = AdamW(model.parameters(), lr=2e-5)

# Use the GPU when one is visible, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Basic fine-tuning loop: 10 passes over the training batches, no evaluation.
for epoch in range(10):
    model.train()
    for batch in train_loader:
        ids = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        targets = batch['labels'].to(device)

        # The loss is read straight from the model output (labels are passed in).
        loss = model(ids, attention_mask=mask, labels=targets).loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    print(f"Epoch {epoch+1} completed.")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1 completed.
Epoch 2 completed.
Epoch 3 completed.
Epoch 4 completed.
Epoch 5 completed.
Epoch 6 completed.
Epoch 7 completed.
Epoch 8 completed.
Epoch 9 completed.
Epoch 10 completed.


In [5]:
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss

# Sentence-embedding model used for similarity lookup over symptom text.
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Retrieval returns whole dataset entries, so keep them under a dedicated name.
# Row i of the index below corresponds to full_records[i].
full_records = data

# Embed every symptom string and cast the vectors to float32 before indexing.
symptom_embeddings = embedder.encode(symptoms_list).astype(np.float32)

# Flat L2 index sized to the embedding width (second axis of the matrix).
index = faiss.IndexFlatL2(symptom_embeddings.shape[1])
index.add(symptom_embeddings)

In [6]:
def predict_disease(symptom_input):
    """Classify a symptom description with the fine-tuned model.

    Args:
        symptom_input: Symptom text handed to the tokenizer.

    Returns:
        The disease label decoded from the highest-scoring class.
    """
    model.eval()
    tokens = tokenizer(
        symptom_input,
        return_tensors="pt",
        truncation=True,
        padding='max_length',
        max_length=64,
    )
    ids = tokens['input_ids'].to(device)
    mask = tokens['attention_mask'].to(device)

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        logits = model(ids, attention_mask=mask).logits
    class_id = logits.argmax(dim=1).item()

    # Turn the integer class id back into the original disease name.
    return label_encoder.inverse_transform([class_id])[0]


In [7]:
def retrieve_full_info(symptom_input, k=3):
    """Return the dataset records whose symptoms are closest to the query.

    Args:
        symptom_input: Symptom text to embed and look up.
        k: Maximum number of neighbours to request from the FAISS index.

    Returns:
        A list of at most ``k`` entries from ``full_records``, in the order
        returned by the index search. The list is shorter than ``k`` when the
        index holds fewer than ``k`` vectors.
    """
    query = embedder.encode([symptom_input]).astype('float32')
    _, neighbor_ids = index.search(query, k)
    # FAISS pads the id row with -1 when fewer than k neighbours exist.
    # Indexing full_records with -1 would silently return the LAST record,
    # so padded slots are dropped instead of being looked up.
    return [full_records[i] for i in neighbor_ids[0] if i >= 0]


In [8]:
def full_pipeline(symptom_list):
    """Predict a disease and return the best matching dataset record.

    Args:
        symptom_list: Iterable of symptom strings. A bare string is also
            accepted and treated as already-joined symptom text.

    Returns:
        The first retrieved record whose "Disease" equals the prediction.
        Otherwise a dict ``{"Disease": <prediction>, "entry": <record>}`` where
        ``entry`` is the last retrieved record, or ``None`` when nothing was
        retrieved.
    """
    # ", ".join on a str would interleave ", " between its characters, so a
    # plain string is used as-is.
    if isinstance(symptom_list, str):
        input_text = symptom_list
    else:
        input_text = ", ".join(symptom_list)

    predicted_disease = predict_disease(input_text)
    retrieved_entries = retrieve_full_info(input_text, k=3)

    # Prefer a retrieved record that agrees with the classifier.
    for entry in retrieved_entries:
        if entry["Disease"] == predicted_disease:
            return entry

    # Fallback: no retrieved record matches the prediction. The original code
    # read the leaked loop variable here, which raised NameError when nothing
    # was retrieved; the same record (the last one) is now selected explicitly.
    # NOTE(review): the first (nearest) record may be the better fallback - confirm.
    fallback_entry = retrieved_entries[-1] if retrieved_entries else None
    return {"Disease": predicted_disease, "entry": fallback_entry}


In [9]:
# Example query: the whole symptom description is passed as one
# comma-separated string inside a single-element list.
symptoms = ["Runny nose, sore throat, sneezing"]
result = full_pipeline(symptoms)
# Pretty-print the returned record as indented JSON.
print(json.dumps(result, indent=2))

{
  "Disease": "Common Cold",
  "Symptoms": [
    "Runny nose",
    "sore throat",
    "sneezing"
  ],
  "Medicines": [
    "Paracetamol",
    "Antihistamines"
  ],
  "Brand Names": [
    "Crocin",
    "Cetirizine"
  ],
  "Dosages": [
    "500mg",
    "10mg"
  ],
  "Prices (INR)": [
    "15",
    "20"
  ]
}


In [30]:
# Persist the fine-tuned classifier and its tokenizer into the same directory.
model.save_pretrained("saved_model/bert_disease_classifier")
# Last expression of the cell: its return value is what the notebook displays.
tokenizer.save_pretrained("saved_model/bert_disease_classifier")


('saved_model/bert_disease_classifier/tokenizer_config.json',
 'saved_model/bert_disease_classifier/special_tokens_map.json',
 'saved_model/bert_disease_classifier/vocab.txt',
 'saved_model/bert_disease_classifier/added_tokens.json')

In [31]:
import faiss

# Write the FAISS index built over the symptom embeddings to disk.
# `index` must be a FAISS index object (IndexFlatL2, IndexIVF, etc.).
faiss.write_index(index, "saved_model/faiss_index.index")


In [32]:
import pickle

# Store the dataset records that FAISS row ids are mapped back to at retrieval time.
# NOTE(review): pickle files must only ever be loaded from a trusted source.
with open("saved_model/full_records.pkl", "wb") as f:
    pickle.dump(full_records, f)


In [34]:
# Store the fitted label encoder so predicted class ids can be decoded back
# to disease names outside this notebook.
# NOTE(review): pickle files must only ever be loaded from a trusted source.
with open("saved_model/label_encoder.pkl", "wb") as f:
    pickle.dump(label_encoder, f)
