In [1]:
import pandas as pd

# Load the CSV file
df = pd.read_csv("data/symptoms.csv")

# Show the first 5 rows
df.head()

Unnamed: 0,Disease,Common disease group,Symptoms,Disease Group,Dosha Types
0,"Urethritis (infectious, noninfectious)",Urinary tract infections,Burning urination; pain during urination (dysu...,Lower urinary tract infection (UTI),Vata|Pitta|Kapha
1,Cystitis (acute bacterial cystitis),Urinary tract infections,Burning urination; frequent urination; urgency...,Lower UTI (bladder infection),Pitta|Vata|Kapha
2,Interstitial cystitis (nonbacterial),Urinary tract infections,Chronic pelvic pain; bladder pressure; frequen...,Nonbacterial bladder inflammation,Vata|Pitta
3,Pyelonephritis (acute bacterial),Urinary tract infections,High fever; chills; flank pain; nausea; vomiti...,Upper UTI (kidney infection),Pitta|Vata|Kapha
4,Chronic pyelonephritis,Urinary tract infections,Vague flank/abdominal pain; malaise; recurrent...,Chronic upper UTI,Pitta|Vata|Kapha


In [2]:
df["Disease Group"].value_counts()

Disease Group
Muscular dystrophy                                       16
Congenital heart disease                                 14
Infectious disease                                       10
Congenital myopathy                                      10
Cancer (Head and neck)                                   10
                                                         ..
Bacterial UTI (lower or upper)                            1
Bacterial UTI (lower/upper)                               1
female reproductive disorder (cyclical hormonal)          1
female reproductive disorder (hormonal mood disorder)     1
Simple lower UTI                                          1
Name: count, Length: 958, dtype: int64

In [3]:
df["Common disease group"].value_counts()

Common disease group
Urinary tract infections                 100
Muscular disorders                       100
Cardiomyopathies                         100
zoonotic diseases                        100
Liver disease                            100
Hematological diseases                   100
Cardiovascular diseases                  100
Cancer and neoplasms                     100
Eye diseases                             100
Ear diseases                             100
Mental health / Psychiatric disorders    100
Endocrine and Metabolic Diseases         100
Nutritional Deficiency Diseases          100
Reproductive system diseases             100
Tropical diseases                         84
tropical diseases                         16
Name: count, dtype: int64

In [4]:
df["Disease"].value_counts()

Disease
Urethritis (infectious, noninfectious)                                    1
Cystitis (acute bacterial cystitis)                                       1
Interstitial cystitis (nonbacterial)                                      1
Pyelonephritis (acute bacterial)                                          1
Chronic pyelonephritis                                                    1
                                                                         ..
Interstitial cystitis (bladder pain syndrome with reproductive impact)    1
Fitz-Hugh–Curtis syndrome                                                 1
Hematosalpinx                                                             1
Prostatic intraepithelial neoplasia (PIN)                                 1
Female genital tuberculosis (FGTB)                                        1
Name: count, Length: 1500, dtype: int64

In [5]:
df["Dosha Types"].value_counts()

Dosha Types
vata                446
pitta               325
vata|pitta           83
vata;pitta           81
kapha                76
Pitta                58
vata;kapha           55
vata|kapha           52
pitta;vata           47
pitta;kapha          30
kapha;vata           29
Kapha                25
pitta|vata|kapha     23
vata|pitta|kapha     17
kapha|vata           16
pitta|vata           13
Pitta;Kapha          13
kapha;pitta          12
Pitta|Vata           11
Vata|Pitta|Kapha     11
vata;pitta;kapha     10
Pitta|Vata|Kapha     10
Pitta|Kapha           7
vata|kapha|pitta      7
pitta|kapha|vata      6
kapha|pitta|vata      6
pitta;vata;kapha      5
Vata|Pitta            4
Pitta|Kapha|Vata      4
vata;kapha;pitta      4
pitta;kapha;vata      3
Kapha;Pitta           3
Kapha|Pitta|Vata      3
kapha|pitta           1
pitta|kapha           1
kapha|vata|pitta      1
Vata;Pitta            1
kapha;pitta;vata      1
Name: count, dtype: int64

In [6]:
import re
import pandas as pd

# Load your CSV
df = pd.read_csv("data/symptoms.csv")

# Canonical order we want to enforce in 2-dosha combos
ORDER = ["vata", "pitta", "kapha"]
ORDER_SET = set(ORDER)

def normalize_dosha(x):
    if pd.isna(x):
        return pd.NA

    s = str(x).strip().lower()

    # If it's any tridosha wording, normalize straight away
    # (covers 'tridosha', 'trisosha', 'tri dosha', etc.)
    if "tridosha" in s or "trisosha" in s or re.search(r"\btri\s*dosha\b", s):
        return "tridosha"

    # Replace separators ; , / + with '|', drop spaces, and strip odd chars
    s = re.sub(r"[;,+/]+", "|", s)     # unify to '|'
    s = s.replace(" ", "")
    s = re.sub(r"[^a-z|]", "", s)      # keep only letters and '|'

    # Split to parts, remove empties, keep only known doshas
    parts = [p for p in s.split("|") if p]
    parts = [p for p in parts if p in ORDER_SET]

    if not parts:
        return pd.NA

    # Deduplicate and enforce canonical order vata -> pitta -> kapha
    present = [d for d in ORDER if d in parts]

    # Any 3 doshas -> 'tridosha'
    if len(present) >= 3:
        return "tridosha"

    # 1 or 2 doshas -> join with '|'
    return "|".join(present)

# Apply normalization
df["Dosha_Clean"] = df["Dosha Types"].apply(normalize_dosha)

# Inspect results
print("Original unique dosha combos (sample 20):")
print(df["Dosha Types"].unique()[:20])

print("\nNormalized unique dosha combos:")
print(df["Dosha_Clean"].unique())

print("\nCounts after normalization:")
print(df["Dosha_Clean"].value_counts(dropna=False))

# (Optional) Check rows that became NA after cleaning (unexpected formats)
print("\nRows with unrecognized/empty dosha after cleaning:")
print(df[df["Dosha_Clean"].isna()].head())


Original unique dosha combos (sample 20):
['Vata|Pitta|Kapha' 'Pitta|Vata|Kapha' 'Vata|Pitta' 'Pitta|Kapha'
 'Pitta|Kapha|Vata' 'Pitta|Vata' 'Kapha|Pitta|Vata' 'kapha|pitta|vata'
 'pitta|kapha|vata' 'vata|pitta|kapha' 'pitta|vata|kapha' 'vata|pitta'
 'kapha|pitta' 'kapha|vata|pitta' 'pitta|vata' 'kapha|vata' 'vata'
 'vata|kapha' 'vata|kapha|pitta' 'kapha']

Normalized unique dosha combos:
['tridosha' 'vata|pitta' 'pitta|kapha' 'vata|kapha' 'vata' 'kapha' 'pitta']

Counts after normalization:
Dosha_Clean
vata           446
pitta          383
vata|pitta     240
vata|kapha     152
tridosha       111
kapha          101
pitta|kapha     67
Name: count, dtype: int64

Rows with unrecognized/empty dosha after cleaning:
Empty DataFrame
Columns: [Disease, Common disease group, Symptoms, Disease Group, Dosha Types, Dosha_Clean]
Index: []


In [7]:
# Step 1: Load and Explore Data

import pandas as pd

# Load the CSV file (update the path if needed)
df = pd.read_csv("data/symptoms.csv")

# Show the first 5 rows (to get a quick look at the data)
print(df.head())

# Show the number of rows and columns
print("Shape of data:", df.shape)

# Show column names
print("Columns:", df.columns.tolist())

# Check for missing values
print("\nMissing values in each column:\n", df.isnull().sum())

                                  Disease      Common disease group  \
0  Urethritis (infectious, noninfectious)  Urinary tract infections   
1     Cystitis (acute bacterial cystitis)  Urinary tract infections   
2    Interstitial cystitis (nonbacterial)  Urinary tract infections   
3        Pyelonephritis (acute bacterial)  Urinary tract infections   
4                  Chronic pyelonephritis  Urinary tract infections   

                                            Symptoms  \
0  Burning urination; pain during urination (dysu...   
1  Burning urination; frequent urination; urgency...   
2  Chronic pelvic pain; bladder pressure; frequen...   
3  High fever; chills; flank pain; nausea; vomiti...   
4  Vague flank/abdominal pain; malaise; recurrent...   

                         Disease Group       Dosha Types  
0  Lower urinary tract infection (UTI)  Vata|Pitta|Kapha  
1        Lower UTI (bladder infection)  Pitta|Vata|Kapha  
2    Nonbacterial bladder inflammation        Vata|Pitta  


In [8]:
# Step 2: Convert symptoms (text) into numbers using TF-IDF

from sklearn.feature_extraction.text import TfidfVectorizer

# Select the text column (symptoms)
X = df["Symptoms"]

# Create a TF-IDF Vectorizer
vectorizer = TfidfVectorizer()

# Fit the vectorizer and transform the symptoms
X_vectorized = vectorizer.fit_transform(X)

# Show the shape of the resulting matrix
print("TF-IDF matrix shape:", X_vectorized.shape)

TF-IDF matrix shape: (1500, 2717)


In [9]:
# Step 3: Training and evaluating different models

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Features (X) = Symptoms
X = df["Symptoms"]

# Labels (y) = Disease group (English name) for now
y = df["Common disease group"]

# Define the models we want to test
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=200),
    "Decision Tree": DecisionTreeClassifier()
}

# Train and evaluate each model
for name, clf in models.items():
    pipe = Pipeline([
        ("tfidf", TfidfVectorizer()),  # Step 1: Convert text to numbers
        ("clf", clf)                   # Step 2: Train model
    ])
    pipe.fit(X, y)                      # Train
    y_pred = pipe.predict(X)            # Predict on same data
    acc = accuracy_score(y, y_pred)     # Check accuracy
    print(f"{name}: {acc:.2%}")

Logistic Regression: 95.00%
Random Forest: 100.00%
Decision Tree: 100.00%


In [13]:
import re
import pandas as pd

# Load CSV
df = pd.read_csv("data/symptoms.csv")

# Canonical order
ORDER = ["vata", "pitta", "kapha"]
ORDER_SET = set(ORDER)

def normalize_dosha(x):
    if pd.isna(x):
        return pd.NA

    s = str(x).strip().lower()

    if "tridosha" in s or "trisosha" in s or re.search(r"\btri\s*dosha\b", s):
        return "tridosha"

    s = re.sub(r"[;,+/]+", "|", s)
    s = s.replace(" ", "")
    s = re.sub(r"[^a-z|]", "", s)

    parts = [p for p in s.split("|") if p]
    parts = [p for p in parts if p in ORDER_SET]

    if not parts:
        return pd.NA

    present = [d for d in ORDER if d in parts]
    if len(present) >= 3:
        return "tridosha"
    return "|".join(present)

# 🔹 Create cleaned column
df["Dosha_Clean"] = df["Dosha Types"].apply(normalize_dosha)

# ✅ Now train
X = df["Symptoms"]
y = df["Dosha_Clean"]


In [14]:
import re
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# -----------------------------
# 1) Load data
# -----------------------------
df = pd.read_csv("data/symptoms.csv")

# -----------------------------
# 2) Ensure Dosha_Clean exists
# -----------------------------
ORDER = ["vata", "pitta", "kapha"]
ORDER_SET = set(ORDER)

def normalize_dosha(x):
    if pd.isna(x):
        return pd.NA
    s = str(x).strip().lower()

    # tridosha variations
    if "tridosha" in s or "trisosha" in s or re.search(r"\btri\s*dosha\b", s):
        return "tridosha"

    # unify separators and strip non-letters/pipes
    s = re.sub(r"[;,+/]+", "|", s)
    s = s.replace(" ", "")
    s = re.sub(r"[^a-z|]", "", s)

    parts = [p for p in s.split("|") if p]
    parts = [p for p in parts if p in ORDER_SET]
    if not parts:
        return pd.NA

    present = [d for d in ORDER if d in parts]
    if len(present) >= 3:
        return "tridosha"
    return "|".join(present)

if "Dosha_Clean" not in df.columns:
    df["Dosha_Clean"] = df["Dosha Types"].apply(normalize_dosha)

# Drop rows with missing/empty labels or symptoms
df = df[~df["Dosha_Clean"].isna()].copy()
df = df[df["Symptoms"].astype(str).str.strip().ne("")].copy()

# -----------------------------
# 3) Features & labels
# -----------------------------
X = df["Symptoms"]
y = df["Dosha_Clean"]

# -----------------------------
# 4) Models to test
# -----------------------------
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
}

# -----------------------------
# 5) TRAIN accuracy (fit & predict on full data)
#    — matches your earlier evaluation style
# -----------------------------
print("=== Train (fit-on-all) accuracy ===")
for name, clf in models.items():
    pipe = Pipeline([
        ("tfidf", TfidfVectorizer()),
        ("clf", clf)
    ])
    pipe.fit(X, y)
    y_pred = pipe.predict(X)
    acc = accuracy_score(y, y_pred)
    print(f"{name}: {acc:.2%}")

# -----------------------------
# 6) Optional: Hold-out evaluation (recommended)
# -----------------------------
do_holdout = True
if do_holdout:
    print("\n=== Hold-out (train/test split) accuracy ===")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    for name, clf in models.items():
        pipe = Pipeline([
            ("tfidf", TfidfVectorizer()),
            ("clf", clf)
        ])
        pipe.fit(X_train, y_train)
        y_pred = pipe.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        print(f"{name}: {acc:.2%}")
        # Per-class metrics (comment out if too verbose)
        print(classification_report(y_test, y_pred))


=== Train (fit-on-all) accuracy ===
Logistic Regression: 80.87%
Random Forest: 99.60%
Decision Tree: 99.60%

=== Hold-out (train/test split) accuracy ===
Logistic Regression: 62.00%
              precision    recall  f1-score   support

       kapha       1.00      0.30      0.46        20
       pitta       0.63      0.74      0.68        77
 pitta|kapha       1.00      0.08      0.14        13
    tridosha       0.83      0.68      0.75        22
        vata       0.59      0.89      0.71        89
  vata|kapha       0.61      0.35      0.45        31
  vata|pitta       0.52      0.35      0.42        48

    accuracy                           0.62       300
   macro avg       0.74      0.49      0.52       300
weighted avg       0.65      0.62      0.59       300

Random Forest: 64.33%
              precision    recall  f1-score   support

       kapha       0.83      0.25      0.38        20
       pitta       0.64      0.79      0.71        77
 pitta|kapha       1.00      0.15   

In [17]:
# ---------- English Group Model ----------
import numpy as np
import joblib
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# Load your dataset
df = pd.read_csv("data/symptoms.csv")   # adjust path to your CSV

X_en = df["Symptoms"].astype(str)
y_en = df["Common disease group"]

pipeline_en = Pipeline([
    ("tfidf", TfidfVectorizer(
        stop_words="english",
        lowercase=True,
        ngram_range=(1, 2),
        max_features=50_000,
        dtype=np.float32
    )),
    ("clf", LogisticRegression(
        solver="saga",
        max_iter=300,
        C=2.0,
        n_jobs=-1
    ))
])

pipeline_en.fit(X_en, y_en)
joblib.dump(pipeline_en, "disease_prediction_model_en_group.pkl", compress=3)
print("Saved: disease_prediction_model_en_group.pkl")

Saved: disease_prediction_model_en_group.pkl


In [None]:
# ---------- Disease Model ----------
import numpy as np
import joblib
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

df = pd.read_csv("data/symptoms.csv")

X_disease = df["Symptom"].astype(str)
y_disease = df["Disease"]

pipeline_disease = Pipeline([
    ("tfidf", TfidfVectorizer(
        stop_words="english",
        lowercase=True,
        ngram_range=(1, 2),
        max_features=50_000,
        dtype=np.float32
    )),
    ("clf", LogisticRegression(
        solver="saga",
        max_iter=300,
        C=2.0,
        n_jobs=-1
    ))
])

pipeline_disease.fit(X_disease, y_disease)
joblib.dump(pipeline_disease, "disease_prediction_model_disease.pkl", compress=3)
print("Saved: ")


In [19]:
# ---------- Dosha Model (robust) ----------
import re
import joblib
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# -----------------------------
# 1) Load data
# -----------------------------
df = pd.read_csv("data/symptoms.csv")
df.rename(columns=lambda c: str(c).strip(), inplace=True)  # guard against stray spaces

# -----------------------------
# 2) Ensure Dosha_Clean exists
# -----------------------------
ORDER = ["vata", "pitta", "kapha"]
ORDER_SET = set(ORDER)

def normalize_dosha(x):
    if pd.isna(x):
        return pd.NA
    s = str(x).strip().lower()

    # handle tridosha variants
    if "tridosha" in s or "trisosha" in s or re.search(r"\btri\s*dosha\b", s):
        return "tridosha"

    # unify separators to '|', remove spaces, keep only letters and '|'
    s = re.sub(r"[;,+/]+", "|", s)
    s = s.replace(" ", "")
    s = re.sub(r"[^a-z|]", "", s)

    parts = [p for p in s.split("|") if p]
    parts = [p for p in parts if p in ORDER_SET]
    if not parts:
        return pd.NA

    present = [d for d in ORDER if d in parts]
    if len(present) >= 3:
        return "tridosha"
    return "|".join(present)

# create Dosha_Clean if missing
if "Dosha_Clean" not in df.columns:
    source_col = "Dosha Types" if "Dosha Types" in df.columns else None
    if source_col is None:
        raise KeyError("Neither 'Dosha_Clean' nor 'Dosha Types' column found in the CSV.")
    df["Dosha_Clean"] = df[source_col].apply(normalize_dosha)

# -----------------------------
# 3) Clean rows (no empty symptoms/labels)
# -----------------------------
df["Symptoms"] = df["Symptoms"].astype(str).fillna("").str.strip()
df = df[(df["Symptoms"] != "") & (~df["Dosha_Clean"].isna())].copy()

# Optional: peek at class balance
print("Dosha class distribution:\n", df["Dosha_Clean"].value_counts())

# -----------------------------
# 4) Features & labels
# -----------------------------
X_dosha = df["Symptoms"]
y_dosha = df["Dosha_Clean"]

# -----------------------------
# 5) Pipeline & training
# -----------------------------
pipeline_dosha = Pipeline([
    ("tfidf", TfidfVectorizer(
        stop_words="english",
        lowercase=True,
        ngram_range=(1, 2),
        max_features=50_000,
        dtype=np.float32
    )),
    ("clf", LogisticRegression(
        solver="saga",
        max_iter=2000,          # give it room to converge
        C=2.0,
        class_weight="balanced",# helpful if classes are imbalanced
        multi_class="auto",
        # NOTE: n_jobs is only used by 'liblinear'; 'saga' ignores it.
    ))
])

pipeline_dosha.fit(X_dosha, y_dosha)

# -----------------------------
# 6) Save model
# -----------------------------
out_path = "dosha_classification_model.pkl"
joblib.dump(pipeline_dosha, out_path, compress=3)
print(f"Saved: {out_path}")


Dosha class distribution:
 Dosha_Clean
vata           446
pitta          383
vata|pitta     240
vata|kapha     152
tridosha       111
kapha          101
pitta|kapha     67
Name: count, dtype: int64




Saved: dosha_classification_model.pkl


In [20]:
%pip install ipywidgets

Note: you may need to restart the kernel to use updated packages.


In [21]:
import ipywidgets as widgets
widgets.IntSlider()

IntSlider(value=0)

In [24]:
# Predict Disease Name from entered symptoms (with suggestions + fuzzy fallback)
import pandas as pd, difflib
import ipywidgets as widgets
from IPython.display import display, Markdown

# Load data
df = pd.read_csv("data/symptoms.csv")
df["Symptoms"] = df["Symptoms"].astype(str)

# Build unique symptom list and lookup
SYMPTOMS = sorted(set(df["Symptoms"].dropna().astype(str).tolist()))
SYMPTOM_LOOKUP = {s.strip().lower(): s for s in SYMPTOMS}

# --- Suggestion helper (prefix + substring + fuzzy) ---
def get_suggestions(query, k=12):
    q = (query or "").strip().lower()
    if not q:
        return []
    prefix = [s for s in SYMPTOMS if s.lower().startswith(q)]
    substr = [s for s in SYMPTOMS if q in s.lower() and s not in prefix]
    fuzzy  = difflib.get_close_matches(q, SYMPTOMS, n=k*2, cutoff=0.6)
    fuzzy  = [s for s in fuzzy if s not in prefix and s not in substr]
    out, seen = [], set()
    for s in prefix + substr + fuzzy:
        if s not in seen:
            out.append(s); seen.add(s)
        if len(out) >= k:
            break
    return out

def resolve_exact(text):
    return SYMPTOM_LOOKUP.get((text or "").strip().lower())

# --- Widgets ---
inp = widgets.Text(
    placeholder="Type symptoms (e.g., 'burning urination; frequency; urgency')",
    description="Symptoms:",
    layout=widgets.Layout(width="80%")
)

sugg_list = widgets.Select(
    options=[],
    rows=6,
    layout=widgets.Layout(width="80%")
)

pred_btn = widgets.Button(
    description="Predict Disease",
    button_style="success",
    layout=widgets.Layout(width="200px", height="50px")
)

out = widgets.Output()

# --- Events ---
def on_text_change(change):
    suggs = get_suggestions(change["new"], k=12)
    sugg_list.options = suggs

def on_select_change(change):
    if change["new"]:
        inp.value = change["new"]

def on_predict(_):
    out.clear_output()
    with out:
        user_text = (inp.value or "").strip()
        if not user_text:
            display(Markdown("> ⚠️ Please enter symptoms (or select from the suggestions)."))
            return

        # 1) Try exact match to a known 'Symptoms' string
        canon = resolve_exact(user_text)

        # 2) If no exact match, fuzzy-match to closest known symptom text
        matched_note = ""
        if not canon:
            best = difflib.get_close_matches(user_text, SYMPTOMS, n=1, cutoff=0.55)
            if best:
                canon = best[0]
                matched_note = f"\n> *No exact match found. Using closest known symptoms:* `{canon}`"
            else:
                display(Markdown("> ❌ No similar symptoms found in the CSV. Try typing differently or pick from suggestions."))
                return

        # Fetch all diseases for the matched symptom text (usually one)
        diseases = sorted(set(df.loc[df["Symptoms"] == canon, "Disease"].astype(str)))
        if not diseases:
            display(Markdown(f"> ❌ Couldn’t find a disease for symptoms: `{canon}`"))
            return

        # Render result
        disease_list_md = "\n".join([f"- `{d}`" for d in diseases])
        display(Markdown(
f"""### 🧾 Predicted Disease
**Entered symptoms:** `{user_text}`{matched_note}

**Result(s):**
{disease_list_md}
"""
        ))

# Wire up
inp.observe(on_text_change, names="value")
sugg_list.observe(on_select_change, names="value")
pred_btn.on_click(on_predict)

# Initial render
display(inp, sugg_list, pred_btn, out)


Text(value='', description='Symptoms:', layout=Layout(width='80%'), placeholder="Type symptoms (e.g., 'burning…

Select(layout=Layout(width='80%'), options=(), rows=6, value=None)

Button(button_style='success', description='Predict Disease', layout=Layout(height='50px', width='200px'), sty…

Output()

In [25]:
# Predict Common disease group from entered symptoms (with suggestions + fuzzy fallback)
import pandas as pd, difflib
import ipywidgets as widgets
from IPython.display import display, Markdown

# Load data
df = pd.read_csv("data/symptoms.csv")
df["Symptoms"] = df["Symptoms"].astype(str)
df["Common disease group"] = df["Common disease group"].astype(str)

# Build unique symptom list and lookup
SYMPTOMS = sorted(set(df["Symptoms"].dropna().tolist()))
SYMPTOM_LOOKUP = {s.strip().lower(): s for s in SYMPTOMS}

# --- Suggestion helper (prefix + substring + fuzzy) ---
def get_suggestions(query, k=12):
    q = (query or "").strip().lower()
    if not q:
        return []
    prefix = [s for s in SYMPTOMS if s.lower().startswith(q)]
    substr = [s for s in SYMPTOMS if q in s.lower() and s not in prefix]
    fuzzy  = difflib.get_close_matches(q, SYMPTOMS, n=k*2, cutoff=0.6)
    fuzzy  = [s for s in fuzzy if s not in prefix and s not in substr]
    out, seen = [], set()
    for s in prefix + substr + fuzzy:
        if s not in seen:
            out.append(s); seen.add(s)
        if len(out) >= k:
            break
    return out

def resolve_exact(text):
    return SYMPTOM_LOOKUP.get((text or "").strip().lower())

# --- Widgets ---
inp = widgets.Text(
    placeholder="Type symptoms (e.g., 'burning urination; frequency; urgency')",
    description="Symptoms:",
    layout=widgets.Layout(width="80%")
)

sugg_list = widgets.Select(
    options=[],
    rows=6,
    layout=widgets.Layout(width="80%")
)

pred_btn = widgets.Button(
    description="Predict Common Group",
    button_style="info",
    layout=widgets.Layout(width="220px", height="50px")
)

out = widgets.Output()

# --- Events ---
def on_text_change(change):
    suggs = get_suggestions(change["new"], k=12)
    sugg_list.options = suggs

def on_select_change(change):
    if change["new"]:
        inp.value = change["new"]

def on_predict(_):
    out.clear_output()
    with out:
        user_text = (inp.value or "").strip()
        if not user_text:
            display(Markdown("> ⚠️ Please enter symptoms (or select from the suggestions)."))
            return

        # 1) Try exact match
        canon = resolve_exact(user_text)

        # 2) Fallback: fuzzy match
        matched_note = ""
        if not canon:
            best = difflib.get_close_matches(user_text, SYMPTOMS, n=1, cutoff=0.55)
            if best:
                canon = best[0]
                matched_note = f"\n> *No exact match found. Using closest known symptoms:* `{canon}`"
            else:
                display(Markdown("> ❌ No similar symptoms found in the CSV. Try typing differently or pick from suggestions."))
                return

        # Fetch all common disease groups for the matched symptom text
        groups = sorted(set(df.loc[df["Symptoms"] == canon, "Common disease group"].astype(str)))
        if not groups:
            display(Markdown(f"> ❌ Couldn’t find a common disease group for symptoms: `{canon}`"))
            return

        groups_md = "\n".join([f"- `{g}`" for g in groups])
        display(Markdown(
f"""### 🧾 Predicted Common Disease Group
**Entered symptoms:** `{user_text}`{matched_note}

**Result(s):**
{groups_md}
"""
        ))

# Wire up
inp.observe(on_text_change, names="value")
sugg_list.observe(on_select_change, names="value")
pred_btn.on_click(on_predict)

# Initial render
display(inp, sugg_list, pred_btn, out)


Text(value='', description='Symptoms:', layout=Layout(width='80%'), placeholder="Type symptoms (e.g., 'burning…

Select(layout=Layout(width='80%'), options=(), rows=6, value=None)

Button(button_style='info', description='Predict Common Group', layout=Layout(height='50px', width='220px'), s…

Output()

In [26]:
# Predict Disease Group from entered symptoms (with suggestions + fuzzy fallback)
import pandas as pd, difflib
import ipywidgets as widgets
from IPython.display import display, Markdown

# Load data
df = pd.read_csv("data/symptoms.csv")
df["Symptoms"] = df["Symptoms"].astype(str)
df["Disease Group"] = df["Disease Group"].astype(str)

# Build unique symptom list and lookup
SYMPTOMS = sorted(set(df["Symptoms"].dropna().tolist()))
SYMPTOM_LOOKUP = {s.strip().lower(): s for s in SYMPTOMS}

# --- Suggestion helper (prefix + substring + fuzzy) ---
def get_suggestions(query, k=12):
    q = (query or "").strip().lower()
    if not q:
        return []
    prefix = [s for s in SYMPTOMS if s.lower().startswith(q)]
    substr = [s for s in SYMPTOMS if q in s.lower() and s not in prefix]
    fuzzy  = difflib.get_close_matches(q, SYMPTOMS, n=k*2, cutoff=0.6)
    fuzzy  = [s for s in fuzzy if s not in prefix and s not in substr]
    out, seen = [], set()
    for s in prefix + substr + fuzzy:
        if s not in seen:
            out.append(s); seen.add(s)
        if len(out) >= k:
            break
    return out

def resolve_exact(text):
    return SYMPTOM_LOOKUP.get((text or "").strip().lower())

# --- Widgets ---
inp = widgets.Text(
    placeholder="Type symptoms (e.g., 'burning urination; frequency; urgency')",
    description="Symptoms:",
    layout=widgets.Layout(width="80%")
)

sugg_list = widgets.Select(
    options=[],
    rows=6,
    layout=widgets.Layout(width="80%")
)

pred_btn = widgets.Button(
    description="Predict Disease Group",
    button_style="",  # neutral style to differentiate from others
    layout=widgets.Layout(width="220px", height="50px")
)

out = widgets.Output()

# --- Events ---
def on_text_change(change):
    suggs = get_suggestions(change["new"], k=12)
    sugg_list.options = suggs

def on_select_change(change):
    if change["new"]:
        inp.value = change["new"]

def on_predict(_):
    out.clear_output()
    with out:
        user_text = (inp.value or "").strip()
        if not user_text:
            display(Markdown("> ⚠️ Please enter symptoms (or select from the suggestions)."))
            return

        # 1) Try exact match
        canon = resolve_exact(user_text)

        # 2) Fallback: fuzzy match
        matched_note = ""
        if not canon:
            best = difflib.get_close_matches(user_text, SYMPTOMS, n=1, cutoff=0.55)
            if best:
                canon = best[0]
                matched_note = f"\n> *No exact match found. Using closest known symptoms:* `{canon}`"
            else:
                display(Markdown("> ❌ No similar symptoms found in the CSV. Try typing differently or pick from suggestions."))
                return

        # Fetch all disease groups for the matched symptom text
        groups = sorted(set(df.loc[df["Symptoms"] == canon, "Disease Group"].astype(str)))
        if not groups:
            display(Markdown(f"> ❌ Couldn’t find a disease group for symptoms: `{canon}`"))
            return

        groups_md = "\n".join([f"- `{g}`" for g in groups])
        display(Markdown(
f"""### 🧾 Predicted Disease Group
**Entered symptoms:** `{user_text}`{matched_note}

**Result(s):**
{groups_md}
"""
        ))

# Wire up
inp.observe(on_text_change, names="value")
sugg_list.observe(on_select_change, names="value")
pred_btn.on_click(on_predict)

# Initial render
display(inp, sugg_list, pred_btn, out)


Text(value='', description='Symptoms:', layout=Layout(width='80%'), placeholder="Type symptoms (e.g., 'burning…

Select(layout=Layout(width='80%'), options=(), rows=6, value=None)

Button(description='Predict Disease Group', layout=Layout(height='50px', width='220px'), style=ButtonStyle())

Output()

In [27]:
# Predict Dosha Type from entered symptoms (with suggestions + fuzzy fallback)
import re, difflib
import pandas as pd
import ipywidgets as widgets
from IPython.display import display, Markdown

# ------------- Load data -------------
df = pd.read_csv("data/symptoms.csv")
df["Symptoms"] = df["Symptoms"].astype(str)

# ------------- Ensure Dosha_Clean exists -------------
ORDER = ["vata", "pitta", "kapha"]
ORDER_SET = set(ORDER)

def normalize_dosha(x):
    if pd.isna(x):
        return pd.NA
    s = str(x).strip().lower()

    # Normalize "tridosha" variants
    if "tridosha" in s or "trisosha" in s or re.search(r"\btri\s*dosha\b", s):
        return "tridosha"

    # Unify separators -> '|' and strip junk
    s = re.sub(r"[;,+/]+", "|", s)
    s = s.replace(" ", "")
    s = re.sub(r"[^a-z|]", "", s)

    parts = [p for p in s.split("|") if p]
    parts = [p for p in parts if p in ORDER_SET]
    if not parts:
        return pd.NA

    present = [d for d in ORDER if d in parts]
    if len(present) >= 3:
        return "tridosha"
    return "|".join(present)

if "Dosha_Clean" not in df.columns:
    if "Dosha Types" not in df.columns:
        raise KeyError("CSV must include 'Dosha Types' or precomputed 'Dosha_Clean'.")
    df["Dosha_Clean"] = df["Dosha Types"].apply(normalize_dosha)

# ------------- Suggestions setup -------------
SYMPTOMS = sorted(set(df["Symptoms"].dropna().tolist()))
SYMPTOM_LOOKUP = {s.strip().lower(): s for s in SYMPTOMS}

def get_suggestions(query, k=12):
    q = (query or "").strip().lower()
    if not q:
        return []
    prefix = [s for s in SYMPTOMS if s.lower().startswith(q)]
    substr = [s for s in SYMPTOMS if q in s.lower() and s not in prefix]
    fuzzy  = difflib.get_close_matches(q, SYMPTOMS, n=k*2, cutoff=0.6)
    fuzzy  = [s for s in fuzzy if s not in prefix and s not in substr]
    out, seen = [], set()
    for s in prefix + substr + fuzzy:
        if s not in seen:
            out.append(s); seen.add(s)
        if len(out) >= k:
            break
    return out

def resolve_exact(text):
    return SYMPTOM_LOOKUP.get((text or "").strip().lower())

# ------------- Widgets -------------
inp = widgets.Text(
    placeholder="Type symptoms (e.g., 'burning urination; frequency; urgency')",
    description="Symptoms:",
    layout=widgets.Layout(width="80%")
)

sugg_list = widgets.Select(
    options=[],
    rows=6,
    layout=widgets.Layout(width="80%")
)

pred_btn = widgets.Button(
    description="Predict Dosha Type",
    button_style="warning",
    layout=widgets.Layout(width="220px", height="50px")
)

out = widgets.Output()

# ------------- Events -------------
def on_text_change(change):
    suggs = get_suggestions(change["new"], k=12)
    sugg_list.options = suggs

def on_select_change(change):
    if change["new"]:
        inp.value = change["new"]

def on_predict(_):
    out.clear_output()
    with out:
        user_text = (inp.value or "").strip()
        if not user_text:
            display(Markdown("> ⚠️ Please enter symptoms (or select from the suggestions)."))
            return

        # Exact first, then fuzzy
        canon = resolve_exact(user_text)
        matched_note = ""
        if not canon:
            best = difflib.get_close_matches(user_text, SYMPTOMS, n=1, cutoff=0.55)
            if best:
                canon = best[0]
                matched_note = f"\n> *No exact match found. Using closest known symptoms:* `{canon}`"
            else:
                display(Markdown("> ❌ No similar symptoms found in the CSV. Try typing differently or pick from suggestions."))
                return

        # Collect dosha types (show cleaned + optionally original if present)
        sub = df.loc[df["Symptoms"] == canon, :]
        dosha_clean = sorted(set(sub["Dosha_Clean"].dropna().astype(str)))
        dosha_orig  = []
        if "Dosha Types" in sub.columns:
            dosha_orig = sorted(set(sub["Dosha Types"].dropna().astype(str)))

        if not dosha_clean and not dosha_orig:
            display(Markdown(f"> ❌ Couldn’t find a dosha type for symptoms: `{canon}`"))
            return

        # Render
        lines = []
        if dosha_clean:
            lines.append("**Dosha (cleaned):** " + ", ".join(f"`{d}`" for d in dosha_clean))
        if dosha_orig:
            lines.append("**Dosha (original):** " + ", ".join(f"`{d}`" for d in dosha_orig))

        display(Markdown(
f"""### 🧾 Predicted Dosha Type
**Entered symptoms:** `{user_text}`{matched_note}

{'<br>'.join(lines)}
"""
        ))

# Wire up
inp.observe(on_text_change, names="value")
sugg_list.observe(on_select_change, names="value")
pred_btn.on_click(on_predict)

# Initial render
display(inp, sugg_list, pred_btn, out)


Text(value='', description='Symptoms:', layout=Layout(width='80%'), placeholder="Type symptoms (e.g., 'burning…

Select(layout=Layout(width='80%'), options=(), rows=6, value=None)



Output()

In [28]:
# Suggestions (clickable list) + Predict full risk (Common disease group + Disease Group + cleaned dosha)
import re, difflib
import pandas as pd
import ipywidgets as widgets
from IPython.display import display, Markdown

# ---------------------------
# Load data
# ---------------------------
df = pd.read_csv("data/symptoms.csv")
df.rename(columns=lambda c: str(c).strip(), inplace=True)

SYMPTOM_COL = "Symptom" if "Symptom" in df.columns else "Symptoms"
if SYMPTOM_COL not in df.columns:
    raise KeyError("Expected 'Symptom' or 'Symptoms' column in CSV.")

# ---------------------------
# Dosha normalization
# ---------------------------
ORDER = ["vata", "pitta", "kapha"]
ORDER_SET = set(ORDER)

def normalize_dosha(x):
    if pd.isna(x):
        return pd.NA
    s = str(x).strip().lower()

    if "tridosha" in s or "trisosha" in s or re.search(r"\btri\s*dosha\b", s):
        return "tridosha"

    s = re.sub(r"[;,+/]+", "|", s)
    s = s.replace(" ", "")
    s = re.sub(r"[^a-z|]", "", s)

    parts = [p for p in s.split("|") if p]
    parts = [p for p in parts if p in ORDER_SET]
    if not parts:
        return pd.NA

    present = [d for d in ORDER if d in parts]
    if len(present) >= 3:
        return "tridosha"
    return "|".join(present)

if "Dosha_Clean" not in df.columns:
    if "Dosha Types" not in df.columns and "Dosha types" not in df.columns:
        raise KeyError("Expected 'Dosha_Clean' or 'Dosha Types' column in CSV.")
    dosha_col = "Dosha Types" if "Dosha Types" in df.columns else "Dosha types"
    df["Dosha_Clean"] = df[dosha_col].apply(normalize_dosha)

# ---------------------------
# Weights
# ---------------------------
disease_group_weight = {
    "Urinary tract infections":              4,
    "Muscular disorders":                    5,
    "Cardiomyopathies":                      7,
    "Cardiovascular diseases":               9,
    "Ear diseases":                          3,
    "Eye diseases":                          4,
    "Hematological diseases":                6,
    "Liver disease":                         7,
    "Mental health / Psychiatric disorders": 6,
    "Nutritional Deficiency Diseases":       4,
    "Reproductive system diseases":          5,
    "Tropical diseases":                     6,
    "Endocrine and Metabolic Diseases":      7,
    "Cancer and neoplasms":                  9,
    "Zoonotic diseases":                     6,
}

dosha_weight = {
    "vata":        7.5,
    "pitta":       8.0,
    "kapha":       6.5,
    "vata|pitta":  8.5,
    "vata|kapha":  7.0,
    "pitta|kapha": 8.0,
    "tridosha":    9.5,
}

W_GROUP = 0.6
W_DOSHA = 0.4

# ---------------------------
# Helpers
# ---------------------------
SYMPTOMS = sorted(set(df[SYMPTOM_COL].dropna().astype(str).tolist()))
SYMPTOM_LOOKUP = {s.strip().lower(): s for s in SYMPTOMS}

def resolve_exact(text: str):
    return SYMPTOM_LOOKUP.get((text or "").strip().lower())

def get_suggestions(query, k=12):
    q = (query or "").strip().lower()
    if not q:
        return []
    prefix = [s for s in SYMPTOMS if s.lower().startswith(q)]
    substr = [s for s in SYMPTOMS if q in s.lower() and s not in prefix]
    fuzzy  = difflib.get_close_matches(q, SYMPTOMS, n=k*2, cutoff=0.6)
    fuzzy  = [s for s in fuzzy if s not in prefix and s not in substr]
    out, seen = [], set()
    for s in prefix + substr + fuzzy:
        if s not in seen:
            out.append(s); seen.add(s)
        if len(out) >= k:
            break
    return out

def risk_level_from_score(score: float):
    if score < 4:
        return "🟢 Low Risk"
    elif score < 7:
        return "🟠 Medium Risk"
    else:
        return "🔴 High Risk"

def compute_risk_for_symptom(symptom_text: str, w_group=W_GROUP, w_dosha=W_DOSHA):
    canon = resolve_exact(symptom_text)
    if not canon:
        return {"found": False, "message": "Symptom not found."}

    row = df.loc[df[SYMPTOM_COL] == canon].iloc[0]
    common_group = str(row["Common disease group"])
    disease_group = str(row["Disease Group"])
    dosha = str(row["Dosha_Clean"])

    g_w = float(disease_group_weight.get(common_group, 0.0))
    d_w = float(dosha_weight.get(dosha, 0.0))

    score = round(w_group * g_w + w_dosha * d_w, 2)
    level = risk_level_from_score(score)

    return {
        "found": True,
        "symptom": canon,
        "common_group": common_group,
        "disease_group": disease_group,
        "dosha": dosha,
        "group_weight": g_w,
        "dosha_weight": d_w,
        "formula": f"Risk = {w_group}×{g_w} + {w_dosha}×{d_w}",
        "risk_score_0_10": score,
        "risk_level": level
    }

# ---------------------------
# Widgets
# ---------------------------
inp = widgets.Text(
    placeholder="Type or pick a symptom…",
    description="Symptom:",
    layout=widgets.Layout(width="80%")
)
sugg_list = widgets.Select(options=[], rows=6, layout=widgets.Layout(width="80%"))

pred_btn = widgets.Button(
    description="Predict Risk",
    button_style="success",
    layout=widgets.Layout(width="220px", height="55px")
)
pred_btn.style.button_color = "#4CAF50"

out = widgets.Output()

# ---------------------------
# Wire up
# ---------------------------
def on_text_change(change):
    sugg_list.options = get_suggestions(change["new"], k=12)

def on_select_change(change):
    if change["new"]:
        inp.value = change["new"]

def on_predict(_):
    out.clear_output()
    with out:
        res = compute_risk_for_symptom(inp.value)
        if not res.get("found"):
            display(Markdown("> ⚠️ Please select a valid symptom."))
            return

        display(Markdown(
f"""### 🧾 Risk Assessment
**Symptom:** `{res['symptom']}`  
**Common Disease Group:** `{res['common_group']}`  
**Disease Group:** `{res['disease_group']}`  
**Dosha (cleaned):** `{res['dosha']}`  

**Group Weight:** `{res['group_weight']}`  
**Dosha Weight:** `{res['dosha_weight']}`  

**Formula:** `{res['formula']}`  
**Risk Score (0–10):** `{res['risk_score_0_10']}`  
**Risk Level:** **{res['risk_level']}**"""
        ))

inp.observe(on_text_change, names="value")
sugg_list.observe(on_select_change, names="value")
pred_btn.on_click(on_predict)

display(inp, sugg_list, pred_btn, out)


Text(value='', description='Symptom:', layout=Layout(width='80%'), placeholder='Type or pick a symptom…')

Select(layout=Layout(width='80%'), options=(), rows=6, value=None)

Button(button_style='success', description='Predict Risk', layout=Layout(height='55px', width='220px'), style=…

Output()