In [1]:
# ✅ Step 1: Import all libraries
import os
import pickle
import re
import time

import nltk
import pandas as pd
import torch
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from sentence_transformers import SentenceTransformer, util

In [2]:

# ✅ Step 2: Download NLTK assets
# 'punkt' serves older NLTK releases; 'punkt_tab' is the resource that
# word_tokenize looks up on NLTK 3.9 and later. Fetching both keeps the
# tokenizer used further down working regardless of the installed version.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /Users/jeetshah/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jeetshah/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jeetshah/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# ✅ Step 3: Load dataset
# Read the full table, then keep the header names of every column except the
# first as the symptom vocabulary.
# NOTE(review): presumably the first column is the disease label — confirm.
df = pd.read_csv("../data/Final_Augmented_dataset_Diseases_and_Symptoms.csv")
symptom_list = df.columns.tolist()[1:]

In [4]:
# Sentence-embedding model shared by the rest of the notebook: it encodes both
# the symptom names and the n-gram phrases extracted from user text.
model = SentenceTransformer("all-MiniLM-L6-v2")

In [5]:
# Map each symptom name to its embedding vector.
#
# All names are encoded in one batched call instead of one model call per
# symptom. The empty-list guard avoids handing encode() nothing to stack.
# Each row is cloned so every dictionary value owns its own memory rather than
# being a view into the shared batch tensor (relevant when the dictionary is
# serialised entry by entry later on).
# NOTE(review): batched encoding may differ from one-at-a-time encoding at
# floating-point rounding level — confirm that is acceptable.
symptom_embeddings = {
    symptom: vector.clone()
    for symptom, vector in zip(
        symptom_list,
        model.encode(symptom_list, convert_to_tensor=True) if symptom_list else [],
    )
}

In [6]:
# English stopwords as a set for O(1) membership tests, and a single shared
# lemmatizer instance reused by every call below.
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def clean_and_tokenize(text):
    """Normalise free text into a list of lemmatised content tokens.

    Steps: lower-case the text, replace every character that is not an ASCII
    letter or whitespace with a space, tokenise with ``word_tokenize``, drop
    tokens found in ``stop_words``, and lemmatise what remains.

    Args:
        text: Input string.

    Returns:
        List of lemmatised tokens, in their original order.
    """
    text = text.lower()
    # Substitute a space (not the empty string) so that words joined only by
    # punctuation, e.g. "nausea/vomiting" or "fever,cough", stay separate
    # tokens instead of being fused into one unknown word.
    text = re.sub(r"[^a-zA-Z\s]", " ", text)
    tokens = word_tokenize(text)
    return [lemmatizer.lemmatize(w) for w in tokens if w not in stop_words]

In [7]:
def extract_symptoms_fast(text, top_k=10):
    """Match free text against the known symptom names by embedding similarity.

    The text is cleaned and tokenised with ``clean_and_tokenize``; every 1-, 2-
    and 3-token n-gram is embedded with ``model`` and compared against every
    vector in ``symptom_embeddings`` using cosine similarity. The ``top_k``
    highest (phrase, symptom) scores are then collapsed to one entry per
    symptom.

    Args:
        text: Free-text description to search for symptoms.
        top_k: Number of highest-scoring (phrase, symptom) pairs to inspect.
            Several pairs can name the same symptom, so the result may hold
            fewer than ``top_k`` symptoms. Values larger than the number of
            available scores are clamped.

    Returns:
        List of ``(symptom, score)`` tuples ordered from highest to lowest
        score, each symptom appearing once with its best score. Empty when the
        text yields no tokens, when there are no symptom embeddings, or when
        ``top_k`` is not positive.
    """
    tokens = clean_and_tokenize(text)
    phrases = [" ".join(gram) for n in range(1, 4) for gram in ngrams(tokens, n)]
    symptom_names = list(symptom_embeddings.keys())

    # Nothing to compare: avoid encoding / stacking empty inputs.
    if not phrases or not symptom_names:
        return []

    # Encode all phrases in batch
    input_vecs = model.encode(phrases, convert_to_tensor=True)

    # Stack all symptom vectors (row i corresponds to symptom_names[i])
    symptom_matrix = torch.stack([symptom_embeddings[s] for s in symptom_names])

    # Compute cosine similarity: rows are phrases, columns are symptoms
    sim_matrix = util.cos_sim(input_vecs, symptom_matrix)

    # Get top matches; clamp k because torch.topk rejects k > number of scores
    flat_scores = sim_matrix.flatten()
    k = min(top_k, flat_scores.numel())
    if k <= 0:
        return []
    top_scores, top_indices = torch.topk(flat_scores, k)

    n_symptoms = sim_matrix.shape[1]
    seen = set()
    results = []
    # topk returns scores in descending order, so the first time a symptom is
    # seen is also its best score.
    for score, flat_idx in zip(top_scores.tolist(), top_indices.tolist()):
        # Column of the flattened index identifies the symptom.
        symptom = symptom_names[flat_idx % n_symptoms]
        if symptom not in seen:
            seen.add(symptom)
            results.append((symptom, score))

    return results

In [8]:
# Quick check of the extractor on a sentence mentioning several symptoms.
user_input = "I feel feverish, have chest pain and cough and even nausea or vomiting"

matches = extract_symptoms_fast(user_input)

# One line per matched symptom, score shown to four decimals.
for sym, score in matches:
    print(f"→ {sym} ({score:.4f})")

→ nausea (1.0000)
→ cough (1.0000)
→ vomiting (1.0000)
→ sharp chest pain (0.8958)
→ burning chest pain (0.8704)
→ fever (0.8605)


In [9]:
# Time a single end-to-end extraction call.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed intervals; time.time() is wall-clock time and can jump if the system
# clock is adjusted.
user_input = "I feel tired and unable to sleep and bad cough"

start = time.perf_counter()
matches = extract_symptoms_fast(user_input)
end = time.perf_counter()

print(f"\n✅ Time taken: {end - start:.4f} seconds\n")
for sym, score in matches:
    print(f"→ {sym} ({score:.4f})")


✅ Time taken: 0.1203 seconds

→ cough (1.0000)
→ sleepiness (0.7388)
→ fatigue (0.7206)
→ insomnia (0.6901)


In [10]:
# Persist the symptom vocabulary and its embeddings.
MODEL_DIR = "../model"

# Create the directory if it doesn't exist
os.makedirs(MODEL_DIR, exist_ok=True)

# Save embeddings dictionary
# NOTE(review): the tensors are pickled as they are, on whatever device
# model.encode() placed them — confirm the environment that loads this file
# has that device available, or move the tensors to CPU before saving.
with open(os.path.join(MODEL_DIR, "symptom_embeddings.pkl"), "wb") as f:
    pickle.dump(symptom_embeddings, f)

# Save symptom list (optional, but good to have)
with open(os.path.join(MODEL_DIR, "symptom_list.pkl"), "wb") as f:
    pickle.dump(symptom_list, f)

print("✅ Saved symptom embeddings and list successfully.")

✅ Saved symptom embeddings and list successfully.
