In [3]:
import pandas as pd

# Load dataset
df = pd.read_csv("../data/Final_Augmented_dataset_Diseases_and_Symptoms.csv")

# Preview
print(df.shape)
df.head()

(246945, 378)


Unnamed: 0,diseases,anxiety and nervousness,depression,shortness of breath,depressive or psychotic symptoms,sharp chest pain,dizziness,insomnia,abnormal involuntary movements,chest tightness,...,stuttering or stammering,problems with orgasm,nose deformity,lump over jaw,sore in nose,hip weakness,back swelling,ankle stiffness or tightness,ankle weakness,neck weakness
0,panic disorder,1,0,1,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,panic disorder,0,0,1,1,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,panic disorder,1,1,1,1,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,panic disorder,1,0,0,1,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
4,panic disorder,1,1,0,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Symptoms are all columns except 'diseases'
symptom_list = list(df.columns[1:])
print(f"Total unique symptoms: {len(symptom_list)}")

Total unique symptoms: 377


In [5]:
from sentence_transformers import SentenceTransformer, util
import torch

# Load pretrained model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Create embeddings for each symptom and store in dictionary
symptom_embeddings = {
    symptom: model.encode(symptom, convert_to_tensor=True) for symptom in symptom_list
}

In [6]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

def clean_input(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = text.split()
    filtered = [w for w in tokens if w not in stop_words]
    return " ".join(filtered)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jeetshah/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
def get_top_symptoms(user_input, top_k=10):
    cleaned = clean_input(user_input)
    input_vec = model.encode(cleaned, convert_to_tensor=True)
    
    matches = [
        (symptom, float(util.cos_sim(input_vec, vec)))
        for symptom, vec in symptom_embeddings.items()
    ]
    return sorted(matches, key=lambda x: x[1], reverse=True)[:top_k]

In [10]:
def hybrid_symptom_match(user_input, symptom_list, model, symptom_embeddings, top_k=10):
    input_clean = clean_input(user_input)
    input_vec = model.encode(input_clean, convert_to_tensor=True)

    # Step 1: Semantic matching
    semantic_scores = [
        (sym, float(util.cos_sim(input_vec, vec)))
        for sym, vec in symptom_embeddings.items()
    ]
    top_semantic = sorted(semantic_scores, key=lambda x: x[1], reverse=True)[:top_k]

    # Step 2: Add exact keyword matches (bonus!)
    exact_matches = [sym for sym in symptom_list if sym in input_clean]
    for sym in exact_matches:
        if sym not in [x[0] for x in top_semantic]:
            top_semantic.append((sym, 1.0))  # force-add exact match with high score

    return sorted(top_semantic, key=lambda x: x[1], reverse=True)[:top_k]

In [11]:
print("Hybrid Matches:\n")
for symptom, score in hybrid_symptom_match(test_input, symptom_list, model, symptom_embeddings):
    print(f"→ {symptom}  ({score:.4f})")

Hybrid Matches:

→ fever  (1.0000)
→ shortness of breath  (0.6854)
→ sharp chest pain  (0.6162)
→ burning chest pain  (0.5920)
→ hurts to breath  (0.5681)
→ difficulty breathing  (0.5628)
→ dizziness  (0.5503)
→ congestion in chest  (0.5317)
→ abnormal breathing sounds  (0.5028)
→ irregular heartbeat  (0.4424)


In [14]:
test_input = "I've been feeling feverish and my chest pain. Also a bit dizzy lately and even cough."

print(f"Cleaned input: {clean_input(test_input)}\n")
print("Hybrid Matches:")

matches = hybrid_symptom_match(test_input, symptom_list, model, symptom_embeddings)

for symptom, score in matches:
    print(f"→ {symptom}  ({score:.4f})")

Cleaned input: ive feeling feverish chest pain also bit dizzy lately even cough

Hybrid Matches:
→ cough  (1.0000)
→ fever  (1.0000)
→ sharp chest pain  (0.6467)
→ congestion in chest  (0.5878)
→ burning chest pain  (0.5795)
→ dizziness  (0.5706)
→ shoulder cramps or spasms  (0.5019)
→ feeling ill  (0.4890)
→ irregular heartbeat  (0.4673)
→ feeling hot and cold  (0.4644)


In [15]:
import nltk
import re
from nltk.util import ngrams
from nltk.corpus import stopwords

# Download once
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Your symptom list (can be expanded)
symptom_list = [
    "fever", "chest pain", "dizziness", "headache", "shortness of breath",
    "fatigue", "cough", "sore throat", "nausea", "vomiting"
]

def clean_input(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = [word for word in text.split() if word not in stop_words]
    return tokens

def extract_ngrams(tokens, max_n=3):
    all_ngrams = []
    for n in range(1, max_n + 1):
        ngram_tuples = list(ngrams(tokens, n))
        ngram_phrases = [' '.join(gram) for gram in ngram_tuples]
        all_ngrams.extend(ngram_phrases)
    return all_ngrams

def match_ngrams_to_symptoms(user_input, symptom_list):
    tokens = clean_input(user_input)
    ngram_phrases = extract_ngrams(tokens)
    matched = [sym for sym in symptom_list if sym in ngram_phrases]
    return matched

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jeetshah/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [16]:
user_input = "I feel feverish with some chest pain and I'm dizzy"
matched = match_ngrams_to_symptoms(user_input, symptom_list)
print("Matched Symptoms:", matched)

Matched Symptoms: ['chest pain']


In [17]:
# STEP 1: Imports
import re
import nltk
import torch
import pickle
import numpy as np
from nltk.corpus import stopwords
from nltk.util import ngrams
from sentence_transformers import SentenceTransformer, util

# Download stopwords if needed
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# STEP 2: Utility functions
def clean_input(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = [word for word in text.split() if word not in stop_words]
    return tokens

def extract_ngrams(tokens, max_n=3):
    all_ngrams = []
    for n in range(1, max_n + 1):
        ngram_tuples = list(ngrams(tokens, n))
        ngram_phrases = [' '.join(gram) for gram in ngram_tuples]
        all_ngrams.extend(ngram_phrases)
    return all_ngrams

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jeetshah/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
# Load full symptom list and their embeddings
with open("../model/model_all_symptoms.pkl", "rb") as f:
    symptom_list = pickle.load(f)

with open("../model/model_symptom_embeddings.pkl", "rb") as f:
    symptom_embeddings = pickle.load(f)

# Load sentence transformer model
model = SentenceTransformer("all-MiniLM-L6-v2")

In [22]:
def hybrid_symptom_match(user_input, top_k=10):
    tokens = clean_input(user_input)
    ngram_phrases = extract_ngrams(tokens)

    # Step 1: Semantic match
    input_vec = model.encode(" ".join(tokens), convert_to_tensor=True).cpu()
    semantic_scores = [
        (sym, float(util.cos_sim(input_vec, emb)))
        for sym, emb in symptom_embeddings.items()
    ]
    top_matches = sorted(semantic_scores, key=lambda x: x[1], reverse=True)[:top_k]

    # Step 2: Exact match from n-grams
    matched_syms = [sym for sym, _ in top_matches]
    for sym in symptom_list:
        if sym in ngram_phrases and sym not in matched_syms:
            top_matches.append((sym, 0.95))  # Add fallback with high score

    return sorted(top_matches, key=lambda x: x[1], reverse=True)[:top_k]

In [24]:
user_input = "I feel feverish, have chest pain and have cough and even feel like nausea or vomiting"

results = hybrid_symptom_match(user_input)

for sym, score in results:
    print(f"→ {sym} ({score:.4f})")

→ cough (0.9500)
→ sharp chest pain (0.6292)
→ burning chest pain (0.6018)
→ nausea (0.5915)
→ congestion in chest (0.5680)
→ vomiting (0.5056)
→ feeling ill (0.4995)
→ shoulder cramps or spasms (0.4873)
→ sore throat (0.4836)
→ throat redness (0.4811)


In [29]:
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer, util
from nltk.util import ngrams
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import re

nltk.download('punkt')
nltk.download('stopwords')

# Load dataset
df = pd.read_csv("../data/Final_Augmented_dataset_Diseases_and_Symptoms.csv")

# Get list of unique symptoms
symptom_list = df.columns[1:].tolist()

# Load sentence transformer
model = SentenceTransformer('all-MiniLM-L6-v2')

# Precompute symptom embeddings
symptom_embeddings = {sym: model.encode(sym, convert_to_tensor=True) for sym in symptom_list}

# Clean and tokenize
stop_words = set(stopwords.words("english"))
def clean_and_tokenize(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    tokens = word_tokenize(text)
    return [word for word in tokens if word not in stop_words]

# Hybrid matcher with n-gram and semantic similarity
def extract_symptoms_semantic_phrases(user_input, top_k=10):
    tokens = clean_and_tokenize(user_input)
    print(tokens)
    phrases = []
    for n in range(1, 4):
        phrases += [" ".join(gram) for gram in ngrams(tokens, n)]
    
    seen = set()
    scored_matches = []

    for phrase in phrases:
        if phrase in seen:
            continue
        seen.add(phrase)

        input_vec = model.encode(phrase, convert_to_tensor=True)
        scores = [(sym, float(util.cos_sim(input_vec, emb))) for sym, emb in symptom_embeddings.items()]
        top_match = max(scores, key=lambda x: x[1])
        if top_match[1] > 0.6:
            scored_matches.append(top_match)

    scored_matches.sort(key=lambda x: x[1], reverse=True)
    return scored_matches[:top_k]

# Test


[nltk_data] Downloading package punkt to /Users/jeetshah/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jeetshah/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [30]:

test_input = "I feel feverish, have chest pain and have cough and even feel like nausea or vomiting and even shorten breathing"
results = extract_symptoms_semantic_phrases(test_input)
for symptom, score in results:
    print(f"{symptom} → {score:.4f}")

['feel', 'feverish', 'chest', 'pain', 'cough', 'even', 'feel', 'like', 'nausea', 'vomiting', 'even', 'shorten', 'breathing']
vomiting → 1.0000
cough → 1.0000
nausea → 1.0000
sharp chest pain → 0.8958
nausea → 0.8743
fever → 0.8605
nausea → 0.8432
nausea → 0.8271
nausea → 0.8216
vomiting → 0.8212


In [38]:
import pandas as pd
import torch
import re
import nltk
from nltk.util import ngrams
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sentence_transformers import SentenceTransformer, util

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Step 1: Load your dataset
df = pd.read_csv("../data/Final_Augmented_dataset_Diseases_and_Symptoms.csv")
symptom_list = df.columns[1:].tolist()

# Step 2: Load transformer model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Step 3: Precompute symptom embeddings
symptom_embeddings = {sym: model.encode(sym, convert_to_tensor=True) for sym in symptom_list}

# Step 4: Text cleaner
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def clean_and_tokenize(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return tokens

# Step 5: Matching logic with deduplication
def extract_symptoms_semantic_phrases(text, top_k=5):
    tokens = clean_and_tokenize(text)
    
    phrases = []
    for n in range(1, 4):
        phrases += [" ".join(gram) for gram in ngrams(tokens, n)]

    seen = set()
    matches = []

    for phrase in phrases:
        input_vec = model.encode(phrase, convert_to_tensor=True)
        scored = [(sym, float(util.cos_sim(input_vec, emb))) for sym, emb in symptom_embeddings.items()]
        best_symptom, best_score = max(scored, key=lambda x: x[1])

        if best_score > 0.6 and best_symptom not in seen:
            seen.add(best_symptom)
            matches.append((best_symptom, best_score))

    matches.sort(key=lambda x: x[1], reverse=True)
    return matches[:top_k]

[nltk_data] Downloading package punkt to /Users/jeetshah/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jeetshah/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jeetshah/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [39]:
test_input = "I feel tired and unable to sleep"
results = extract_symptoms_semantic_phrases(test_input, top_k=10)



→ fatigue (0.7206)
→ insomnia (0.6901)
→ sleepiness (0.6760)
→ feeling hot (0.6316)


In [41]:
import time

test_input = "I feel tired and unable to sleep"

start = time.time()
results = extract_symptoms_semantic_phrases(test_input, top_k=10)
end = time.time()

print(f"\n✅ Time taken: {end - start:.4f} seconds\n")
for symptom, score in results:
    print(f"→ {symptom} ({score:.4f})")


✅ Time taken: 2.0046 seconds

→ fatigue (0.7206)
→ insomnia (0.6901)
→ sleepiness (0.6760)
→ feeling hot (0.6316)
