In [1]:
# ==============================================
# Disease Predictor - Colab Ready (Final Version)
# ==============================================

import pandas as pd
import re, itertools, random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import joblib
import gradio as gr

# -------------------------
# 1) Upload CSV into Colab
# -------------------------
from google.colab import files  # Colab-only helper, imported next to its single use

# Open the browser file picker. The result is used below as a mapping whose
# keys are the names of the uploaded files.
uploaded = files.upload()   # choose your data.csv file

# If nothing was uploaded (e.g. the dialog was dismissed) there is no first key;
# stop here with a readable message instead of a bare IndexError.
if not uploaded:
    raise ValueError("No file was uploaded - re-run this cell and choose your CSV file.")

# Only the first uploaded file is used as the dataset.
csv_path = next(iter(uploaded))
print("CSV uploaded:", csv_path)

# -------------------------
# 2) Load and inspect dataset
# -------------------------
df = pd.read_csv(csv_path)
# Clean the header names. The byte-order mark must be removed *before* trimming:
# str.strip() does not treat '\ufeff' as whitespace, so stripping first would
# leave padding behind in a header such as '\ufeff Symptoms'.
df.columns = df.columns.str.replace('\ufeff', '', regex=False).str.strip()

# Quick look at what was loaded: dimensions, header names and the first rows.
print("Shape:", df.shape)
print("Columns:", df.columns.tolist())
print(df.head(10))

# -------------------------
# 3) Parse symptoms into lists
# -------------------------
def parse_symptoms(s):
    """Turn one raw symptoms cell into a list of normalised symptom names.

    The value is converted to a string and split on any run of the
    separators ';', ',', '/' or '|'. Each piece is trimmed and lower-cased;
    pieces that are empty after trimming are dropped.

    Returns an empty list when ``s`` is missing according to ``pd.isna``.
    """
    if pd.isna(s):
        return []
    symptoms = []
    for piece in re.split(r'[;,/|]+', str(s)):
        piece = piece.strip()
        if piece:
            symptoms.append(piece.lower())
    return symptoms

# Fail fast on a CSV that lacks either column the rest of the cell relies on:
# 'Symptoms' is parsed right here, 'Possible Disease' is the label used for
# grouping, training and the remedy lookup further down.
if 'Symptoms' not in df.columns:
    raise KeyError("CSV must contain a 'Symptoms' column!")
if 'Possible Disease' not in df.columns:
    raise KeyError("CSV must contain a 'Possible Disease' column!")

# symptom_list: the parsed list of symptoms for each row.
# symptom_text: the same symptoms joined by single spaces (the text the model sees).
df['symptom_list'] = df['Symptoms'].apply(parse_symptoms)
df['symptom_text'] = df['symptom_list'].apply(' '.join)

# -------------------------
# 4) Augment dataset (permute symptoms to help learning)
# -------------------------
def _sample_symptom_combos(pool, limit, rng, max_size=3):
    """Pick up to ``limit`` distinct combinations of 1..max_size items from ``pool``.

    Each possible combination is equally likely to be picked, and none is
    returned twice. ``rng`` must provide ``sample`` and ``choices`` (the
    ``random`` module or a ``random.Random`` instance).
    """
    n = len(pool)
    sizes = list(range(1, min(max_size, n) + 1))
    # Number of combinations of each size, C(n, r), built up incrementally.
    counts = []
    count = 1
    for r in sizes:
        count = count * (n - r + 1) // r
        counts.append(count)
    total = sum(counts)
    wanted = int(max(0, min(limit, total)))
    if wanted == 0:
        return []
    if total <= 2 * wanted:
        # Small space: listing every combination is cheap, so sample from the list.
        every = [combo for r in sizes for combo in itertools.combinations(pool, r)]
        return rng.sample(every, wanted)
    # Large space: never build the full list. Choose a size with probability
    # proportional to how many combinations have that size, draw a random subset
    # of that size and skip repeats. More than half of the space is always
    # unused here, so the loop finishes quickly.
    seen = set()
    picks = []
    while len(picks) < wanted:
        r = rng.choices(sizes, weights=counts)[0]
        positions = tuple(sorted(rng.sample(range(n), r)))
        if positions not in seen:
            seen.add(positions)
            picks.append(tuple(pool[i] for i in positions))
    return picks


def augment_combinations(df, max_per_disease=50, seed=None):
    """Build a training frame of original rows plus synthetic symptom combinations.

    For every disease (grouped on 'Possible Disease') the result contains:
      * one row per original record, with its symptoms joined by spaces;
      * up to ``max_per_disease`` synthetic rows, each a distinct combination
        of 1 to 3 symptoms drawn from all symptoms seen for that disease.

    Parameters
    ----------
    df : DataFrame with a 'Possible Disease' column and a 'symptom_list'
        column holding lists of symptom strings.
    max_per_disease : maximum number of synthetic rows per disease.
    seed : optional seed for a private random generator. When None, the
        global ``random`` module state is used, so ``random.seed(...)`` called
        beforehand still controls the draw.

    Returns
    -------
    DataFrame with columns 'symptom_text' and 'Possible Disease'. The columns
    are present even when the input has no rows.
    """
    rng = random if seed is None else random.Random(seed)
    rows = []
    for disease, group in df.groupby('Possible Disease'):
        symptom_lists = list(group['symptom_list'])
        # include original
        for symptoms in symptom_lists:
            rows.append({'symptom_text': ' '.join(symptoms), 'Possible Disease': disease})
        # Sorted so the output does not depend on set iteration order.
        pool = sorted({symptom for symptoms in symptom_lists for symptom in symptoms})
        # make synthetic combos
        for combo in _sample_symptom_combos(pool, max_per_disease, rng):
            rows.append({'symptom_text': ' '.join(combo), 'Possible Disease': disease})
    return pd.DataFrame(rows, columns=['symptom_text', 'Possible Disease'])

# Seed the global RNG that augment_combinations draws from, so the random
# selection of synthetic rows is repeatable - in line with the fixed
# random_state=42 used for the shuffle and the train/test split.
random.seed(42)
augmented = augment_combinations(df, max_per_disease=50)
print("Original rows:", len(df), "Augmented rows:", len(augmented))

# -------------------------
# 5) Train/test split + train model
# -------------------------
# Shuffle once with a fixed seed so row order is reproducible.
train_df = augmented.sample(frac=1, random_state=42).reset_index(drop=True)
X_text = train_df['symptom_text'].fillna('')
y = train_df['Possible Disease']

# Split the raw text BEFORE vectorizing. Fitting TF-IDF on all rows would let
# the held-out rows shape the vocabulary and IDF weights, leaking test
# information into training and inflating the reported accuracy.
text_train, text_test, y_train, y_test = train_test_split(
    X_text, y, test_size=0.2, stratify=y, random_state=42
)

# Learn the vocabulary/IDF from training text only, then apply it to the test text.
vectorizer = TfidfVectorizer(ngram_range=(1,2))
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)
# Full feature matrix, kept under its original name for any later code that uses it.
X = vectorizer.transform(X_text)

model = LogisticRegression(max_iter=2000)
model.fit(X_train, y_train)

# Evaluate on the held-out 20%.
preds = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, preds))
print(classification_report(y_test, preds))

# -------------------------
# 6) Save model + vectorizer
# -------------------------
# Persist both fitted objects: prediction needs the vectorizer as well as the model.
for artifact, filename in ((model, "disease_model.pkl"), (vectorizer, "vectorizer.pkl")):
    joblib.dump(artifact, filename)
print("Saved -> disease_model.pkl, vectorizer.pkl")

# -------------------------
# 7) Prediction function
# -------------------------
def predict_from_symptoms(symptom_text):
    """Predict a disease from free-text symptoms and describe it.

    Parameters
    ----------
    symptom_text : symptoms separated by ',', ';', '/' or '|'. None is
        treated as empty input.

    Returns
    -------
    str
        "Disease: ...\\nRemedy: ...\\nHarm Scale: ..." for usable input, or a
        short prompt when no symptom was entered or none of the entered words
        is known to the vectorizer.

    Relies on the module-level ``vectorizer``, ``model`` and ``df``.
    """
    # Same normalisation as parse_symptoms: split on separators, trim, lower-case.
    # None is mapped to '' first; str(None) would otherwise become the token "none".
    raw = '' if symptom_text is None else str(symptom_text)
    tokens = [t.strip().lower() for t in re.split(r'[;,/|]+', raw) if t.strip()]
    if not tokens:
        return "Please enter at least one symptom."

    vec = vectorizer.transform([' '.join(tokens)])
    # An all-zero vector means no entered word is in the learned vocabulary;
    # a prediction from it would not reflect the input at all.
    if vec.nnz == 0:
        return "None of the entered symptoms were recognised. Please try different wording."
    disease = model.predict(vec)[0]

    # remedy/harm lookup: use the first dataset row recorded for this disease
    row = df[df['Possible Disease'] == disease].iloc[0]

    def first_value(columns, default):
        """Return the first present, non-missing value among ``columns`` in ``row``."""
        for col in columns:
            value = row.get(col)
            if value is not None and not pd.isna(value):
                return value
        return default

    # 'Verified Remedies' is the remedy column in the dataset loaded above;
    # the older header name is still tried first for CSVs that use it.
    remedy = first_value(('Remedies (first-aid style)', 'Verified Remedies'), 'No remedy available')
    harm = first_value(('Harm Scale (0=safe,3=serious)',), 'Unknown')
    return f"Disease: {disease}\nRemedy: {remedy}\nHarm Scale: {harm}"

# -------------------------
# 8) Gradio Web App
# -------------------------
# Two-line free-text input for the comma-separated symptoms.
symptom_box = gr.Textbox(
    lines=2,
    placeholder="Enter symptoms separated by commas, e.g. cough, fever",
)

# Wire the prediction function to a plain-text output.
iface = gr.Interface(
    fn=predict_from_symptoms,
    inputs=symptom_box,
    outputs="text",
    title="Disease Predictor",
    description="Enter symptoms (comma-separated) to predict likely disease and remedy/harm scale.",
)

# NOTE(review): share=True presumably requests a public share link - confirm against the Gradio docs.
iface.launch(share=True)


Saving data.csv to data (3).csv
CSV uploaded: data (3).csv
Shape: (20000, 7)
Columns: ['Symptoms', 'Possible Disease', 'Kid/Grandpa Explanation', 'Verified Remedies', 'Doctor Check', 'Key Note', 'Category']
                                   Symptoms      Possible Disease  \
0  Synthetic symptom cluster batch90k_87338  Rheumatoid Arthritis   
1  Synthetic symptom cluster batch90k_81315         Liver Disease   
2  Synthetic symptom cluster batch8900_8819       Gastroenteritis   
3  Synthetic symptom cluster batch90k_11069          Osteoporosis   
4  Synthetic symptom cluster batch90k_16899           Sarcoidosis   
5  Synthetic symptom cluster batch90k_16353                Plague   
6  Synthetic symptom cluster batch8900_4578              Diabetes   
7  Synthetic symptom cluster batch90k_69925             Arthritis   
8  Synthetic symptom cluster batch90k_32790              Dementia   
9  Synthetic symptom cluster batch90k_11835            Heatstroke   

                             Kid/

