In [9]:
# ✅ Load all necessary Python libraries

import pandas as pd  # For handling the CSV dataset
from sentence_transformers import SentenceTransformer, util  # For semantic similarity using embeddings
from collections import defaultdict, Counter  # For mapping symptoms to diseases and counting
from operator import itemgetter  # For sorting diseases by match score

In [11]:
# ✅ Load the CSV file you uploaded
df = pd.read_csv("../data/Final_Augmented_dataset_Diseases_and_Symptoms.csv")

# ✅ Display first few rows to understand structure
df.head()

Unnamed: 0,diseases,anxiety and nervousness,depression,shortness of breath,depressive or psychotic symptoms,sharp chest pain,dizziness,insomnia,abnormal involuntary movements,chest tightness,...,stuttering or stammering,problems with orgasm,nose deformity,lump over jaw,sore in nose,hip weakness,back swelling,ankle stiffness or tightness,ankle weakness,neck weakness
0,panic disorder,1,0,1,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,panic disorder,0,0,1,1,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,panic disorder,1,1,1,1,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,panic disorder,1,0,0,1,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
4,panic disorder,1,1,0,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# ✅ Extract all symptom column names (everything after the first column)
all_symptoms = df.columns.tolist()[1:]

# ✅ Create Disease → Symptoms map
disease_symptom_map = {}
for _, row in df.iterrows():
    disease = row['diseases']
    symptoms = [symptom for symptom in all_symptoms if row[symptom] == 1]
    disease_symptom_map[disease] = symptoms

# ✅ Create Symptom → Diseases map (reverse lookup)
from collections import defaultdict
symptom_disease_map = defaultdict(list)
for disease, symptoms in disease_symptom_map.items():
    for symptom in symptoms:
        symptom_disease_map[symptom].append(disease)

In [13]:
# ✅ Load the sentence-transformer model for semantic matching
from sentence_transformers import SentenceTransformer, util

# Load the MiniLM model (fast + accurate)
model = SentenceTransformer('all-MiniLM-L6-v2')

# ✅ Encode all symptoms to embeddings (just once)
symptom_embeddings = model.encode(all_symptoms, convert_to_tensor=True)

In [16]:
def extract_symptoms_from_input(user_input, all_symptoms, symptom_embeddings, top_n=5):
    """
    Returns top-N most similar symptoms from user input.
    Uses semantic similarity (cosine distance).
    """
    user_embedding = model.encode(user_input, convert_to_tensor=True)
    cosine_scores = util.cos_sim(user_embedding, symptom_embeddings)[0]

    # Sort all symptoms by similarity score
    matched_sorted = sorted(
        zip(all_symptoms, cosine_scores),
        key=lambda x: float(x[1]),
        reverse=True
    )

    # Return top N symptoms
    return [symptom for symptom, _ in matched_sorted[:top_n]]


In [19]:
extract_symptoms_from_input("I've been dizzy,tight chest and can’t breathe properly", all_symptoms, symptom_embeddings, top_n=10)

['difficulty breathing',
 'chest tightness',
 'congestion in chest',
 'dizziness',
 'sharp chest pain',
 'hurts to breath',
 'throat feels tight',
 'shortness of breath',
 'abnormal breathing sounds',
 'breathing fast']

In [20]:
# ✅ Suggest symptoms that commonly co-occur with confirmed ones
def suggest_related_symptoms(confirmed_symptoms, symptom_disease_map, disease_symptom_map, max_suggestions=6):
    related = Counter()

    for symptom in confirmed_symptoms:
        for disease in symptom_disease_map.get(symptom, []):
            for related_symptom in disease_symptom_map.get(disease, []):
                if related_symptom not in confirmed_symptoms:
                    related[related_symptom] += 1

    # Return top N related symptoms
    return [s for s, _ in related.most_common(max_suggestions)]

In [21]:
confirm = ['fever', 'sore throat']
suggest_related_symptoms(confirm, symptom_disease_map, disease_symptom_map)


['cough',
 'vomiting',
 'nasal congestion',
 'ear pain',
 'headache',
 'sharp abdominal pain']

In [26]:
# ✅ Rank diseases by match percentage with user's confirmed symptoms

def rank_diseases_by_symptoms(input_symptoms, disease_symptom_map, top_n=100):
    score = {}

    for disease, symptoms in disease_symptom_map.items():
        if not symptoms:
            continue
        matched = len(set(input_symptoms) & set(symptoms))
        score[disease] = matched / len(symptoms)

    # Sort by score descending
    ranked = sorted(score.items(), key=itemgetter(1), reverse=True)

    return [(d, round(s * 100, 2)) for d, s in ranked if s > 0][:top_n]

In [27]:
confirmed = ['fever', 'sore throat', 'cough']
rank_diseases_by_symptoms(confirmed, disease_symptom_map)

[('herpangina', 75.0),
 ('teething syndrome', 50.0),
 ('hyperhidrosis', 50.0),
 ('dengue fever', 50.0),
 ('pulmonary fibrosis', 50.0),
 ('otitis media', 50.0),
 ('cystic fibrosis', 50.0),
 ('hemophilia', 50.0),
 ('hyperosmotic hyperketotic state', 50.0),
 ('scarlet fever', 50.0),
 ('typhoid fever', 50.0),
 ("otitis externa (swimmer's ear)", 50.0),
 ('acute bronchitis', 50.0),
 ('acute respiratory distress syndrome (ards)', 50.0),
 ('conjunctivitis due to bacteria', 50.0),
 ('g6pd enzyme deficiency', 50.0),
 ('acute bronchospasm', 42.86),
 ('allergy to animals', 40.0),
 ('croup', 40.0),
 ('tonsillar hypertrophy', 40.0),
 ('lymphadenitis', 37.5),
 ('vocal cord polyp', 33.33),
 ('salivary gland disorder', 33.33),
 ('chronic sinusitis', 33.33),
 ('achalasia', 33.33),
 ('tietze syndrome', 33.33),
 ('conversion disorder', 33.33),
 ('pharyngitis', 33.33),
 ('wilson disease', 33.33),
 ('nose disorder', 33.33),
 ('muscular dystrophy', 33.33),
 ('oral mucosal lesion', 33.33),
 ('hypernatremia', 

In [30]:
disease_symptom_map["eczema"]

['cough',
 'acne or pimples',
 'itching of skin',
 'skin dryness, peeling, scaliness, or roughness',
 'skin irritation']

In [31]:
# Reload dataset (after unzipping if needed)
import pandas as pd

df = pd.read_csv("../data/Final_Augmented_dataset_Diseases_and_Symptoms.csv")

# View all rows labeled as "flu"
flu_rows = df[df['diseases'].str.lower() == "flu"]
print(flu_rows.shape)
flu_rows.head()

(679, 378)


Unnamed: 0,diseases,anxiety and nervousness,depression,shortness of breath,depressive or psychotic symptoms,sharp chest pain,dizziness,insomnia,abnormal involuntary movements,chest tightness,...,stuttering or stammering,problems with orgasm,nose deformity,lump over jaw,sore in nose,hip weakness,back swelling,ankle stiffness or tightness,ankle weakness,neck weakness
187525,flu,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
187526,flu,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
187527,flu,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
187528,flu,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
187529,flu,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [32]:
all_symptoms = df.columns.tolist()[1:]

# Find symptoms marked as 1 in any flu row
flu_symptoms = set()
for _, row in flu_rows.iterrows():
    flu_symptoms.update([sym for sym in all_symptoms if row[sym] == 1])

print(sorted(flu_symptoms))

['ache all over', 'chills', 'coryza', 'cough', 'diarrhea', 'fever', 'headache', 'nausea', 'sore throat', 'vomiting']


In [33]:
# ✅ Group all rows for the same disease to collect every unique symptom

disease_symptom_map = {}

for disease in df['diseases'].unique():
    all_rows = df[df['diseases'] == disease]
    symptoms = set()

    for _, row in all_rows.iterrows():
        present = [sym for sym in all_symptoms if row[sym] == 1]
        symptoms.update(present)

    disease_symptom_map[disease] = list(symptoms)

In [34]:
disease_symptom_map['flu']

['headache',
 'fever',
 'cough',
 'sore throat',
 'diarrhea',
 'ache all over',
 'nausea',
 'coryza',
 'chills',
 'vomiting']