In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
import pickle

# Step 1: Load dataset
df = pd.read_csv("../data/Final_Augmented_dataset_Diseases_and_Symptoms.csv")

# Step 2: Filter out diseases that occur only once
disease_counts = df["diseases"].value_counts()
valid_diseases = disease_counts[disease_counts >= 2].index
df = df[df["diseases"].isin(valid_diseases)].copy()

# Step 3: Encode labels
label_encoder = LabelEncoder()
df["diseases_encoded"] = label_encoder.fit_transform(df["diseases"])

# Step 4: Split data
X = df.drop(columns=["diseases", "diseases_encoded"])
y = df["diseases_encoded"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# Step 5: Train Decision Tree model
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

# Step 6: Evaluate
y_pred = dt_model.predict(X_test)
print("✅ Accuracy:", accuracy_score(y_test, y_pred))
print("✅ Classification Report:\n", classification_report(y_test, y_pred))

# Step 7: Save the model and label encoder
with open("decision_tree_model.pkl", "wb") as f:
    pickle.dump(dt_model, f)

with open("label_encoder.pkl", "wb") as f:
    pickle.dump(label_encoder, f)

print("🎉 Model and label encoder saved!")

✅ Accuracy: 0.8162839671161868
✅ Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.98        28
           1       0.86      0.96      0.91        81
           2       0.67      0.91      0.77        58
           3       0.67      1.00      0.80         4
           4       0.74      0.84      0.79        68
           5       0.67      1.00      0.80         6
           6       0.86      0.86      0.86         7
           7       0.50      0.88      0.64        17
           8       0.53      0.81      0.64        99
           9       0.59      0.76      0.67       182
          10       0.85      0.92      0.88       241
          11       0.61      0.77      0.68       243
          12       0.63      0.57      0.60       181
          13       0.67      1.00      0.80         2
          14       0.25      0.43      0.32        42
          15       0.93      0.91      0.92       182
          16       0.71 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


🎉 Model and label encoder saved!


In [16]:
# Load model and encoder
with open("decision_tree_model.pkl", "rb") as f:
    model = pickle.load(f)

with open("label_encoder.pkl", "rb") as f:
    encoder = pickle.load(f)

# Input vector from confirmed symptoms
input_symptoms = ['cough', 'fever','headache']
input_vector = np.zeros((1, len(symptom_columns)))
for sym in input_symptoms:
    if sym in symptom_columns:
        input_vector[0, symptom_columns.index(sym)] = 1

# Get probabilities for all classes
probs = model.predict_proba(input_vector)[0]

# Top N diseases
top_n = 5
top_indices = np.argsort(probs)[::-1][:top_n]
top_diseases = [(encoder.inverse_transform([i])[0], round(probs[i], 4)) for i in top_indices]

# Show result
for disease, prob in top_diseases:
    print(f"🩺 {disease} → {prob}")

🩺 aphthous ulcer → 0.5714
🩺 mononucleosis → 0.1429
🩺 meningitis → 0.1429
🩺 acute bronchitis → 0.1429
🩺 female infertility of unknown cause → 0.0




In [26]:
import pickle

# Load stored models
with open("../model/symptom_confidence_scores.pkl", "rb") as f:
    symptom_confidence = pickle.load(f)

with open("../model/disease_frequency.pkl", "rb") as f:
    disease_frequency = pickle.load(f)

# Scoring function
def rank_diseases(symptoms, top_n=5):
    results = []

    for disease, sym_scores in symptom_confidence.items():
        matched = [s for s in symptoms if s in sym_scores]
        if not matched:
            continue
        
        confidence = sum(sym_scores[s] for s in matched)
        overlap = len(matched) / len(symptoms)
        prevalence = disease_frequency.get(disease, 0)

        # Weighted score (adjust weights as needed)
        score = 0.6 * confidence + 0.2 * overlap + 0.3 * prevalence

        results.append({
            "disease": disease,
            "score": round(score, 4),
            "confidence": round(confidence, 4),
            "overlap": round(overlap, 4),
            "prevalence": round(prevalence, 4)
        })

    return sorted(results, key=lambda x: x["score"], reverse=True)[:top_n]

# Example test
confirmed_symptoms = ["cold", "cough", "sore throat"]
top_diseases = rank_diseases(confirmed_symptoms)

# Display results
for d in top_diseases:
    print(f"🦠 {d['disease'].title()}")
    print(f"   → Score: {d['score']}, Confidence: {d['confidence']}, Overlap: {d['overlap']}, Prevalence: {d['prevalence']}")

🦠 Allergy To Animals
   → Score: 1.1947, Confidence: 1.769, Overlap: 0.6667, Prevalence: 0
🦠 Salivary Gland Disorder
   → Score: 1.0915, Confidence: 1.597, Overlap: 0.6667, Prevalence: 0
🦠 Conjunctivitis Due To Bacteria
   → Score: 1.0177, Confidence: 1.474, Overlap: 0.6667, Prevalence: 0
🦠 Whooping Cough
   → Score: 0.9985, Confidence: 1.442, Overlap: 0.6667, Prevalence: 0
🦠 Mononucleosis
   → Score: 0.9385, Confidence: 1.342, Overlap: 0.6667, Prevalence: 0
