In [70]:
# Mount Google Drive at /content/drive so files stored there can be opened
# by ordinary file paths in the cells below.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [71]:
# Load the symptom/prognosis training table from the mounted Drive folder.
# pandas is imported in this cell because it runs before the notebook's main
# import cell; without it a fresh "Restart & Run All" stops here with a
# NameError on `pd`.
import pandas as pd

df = pd.read_csv('/content/drive/MyDrive/Colabfiles/Training.csv')


In [72]:
# importing the required libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import joblib
import ipywidgets as widgets
from IPython.display import display

In [73]:
# Load the training table with pandas and preview the first rows.
# The file is read from the mounted Drive path rather than from `uploaded`:
# no visible cell defines `uploaded`, so the old lookup only worked with state
# left over in a live kernel and failed on a fresh run.
df = pd.read_csv('/content/drive/MyDrive/Colabfiles/Training.csv')
df.head()

Unnamed: 0,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,ulcers_on_tongue,...,blackheads,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze,prognosis
0,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
1,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
2,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
3,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
4,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection


In [74]:
# Preprocessing: separate the label from the inputs.
# `prognosis` holds the disease name (target); every other column of `df`
# is kept as a model feature.
y = df['prognosis']
X = df.drop('prognosis', axis='columns')

In [75]:
# Turn disease names into integer class ids. The fitted encoder `le` is kept
# so that class ids can be mapped back to names later on.
le = LabelEncoder().fit(y)
y_encoded = le.transform(y)

In [76]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
)

In [77]:
# Training model using XGboost algorithm and SMOTE
!pip install imbalanced-learn xgboost --quiet
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score
# Applying SMOTE to balance classes
smote = SMOTE(random_state=42)
X_train_bal, y_train_bal = smote.fit_resample(X_train, y_train)

# Train XGBoost model
model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
model.fit(X_train_bal, y_train_bal)
print("Train Accuracy:", model.score(X_train_bal, y_train_bal))
print("Test Accuracy:", model.score(X_test, y_test))
# Evaluate
y_pred = model.predict(X_test)
print("✅ Accuracy:", accuracy_score(y_test, y_pred))
print("🔍 Classification Report:")
print(classification_report(y_test, y_pred, target_names=le.classes_))


Train Accuracy: 1.0
Test Accuracy: 1.0
✅ Accuracy: 1.0
🔍 Classification Report:
                                         precision    recall  f1-score   support

(vertigo) Paroymsal  Positional Vertigo       1.00      1.00      1.00        18
                                   AIDS       1.00      1.00      1.00        30
                                   Acne       1.00      1.00      1.00        24
                    Alcoholic hepatitis       1.00      1.00      1.00        25
                                Allergy       1.00      1.00      1.00        24
                              Arthritis       1.00      1.00      1.00        23
                       Bronchial Asthma       1.00      1.00      1.00        33
                   Cervical spondylosis       1.00      1.00      1.00        23
                            Chicken pox       1.00      1.00      1.00        21
                    Chronic cholestasis       1.00      1.00      1.00        15
                            

In [78]:
# Persist the trained classifier and the fitted label encoder with joblib,
# as two .pkl files in the current working directory.
joblib.dump(model, "disease_predictor_model.pkl")
joblib.dump(le, "disease_label_encoder.pkl")

['disease_label_encoder.pkl']

In [79]:
#Defining the disease prediction function
def predict_top_diseases(symptoms, model, label_encoder, top_n=3, feature_names=None):
    """Rank the most probable diseases for a set of reported symptoms.

    A 0/1 vector is built in feature-column order (1 for every reported
    symptom that is a known feature), the model is asked for class
    probabilities exactly once, and the ``top_n`` highest-probability classes
    are returned.

    Args:
        symptoms: Iterable of symptom names. Names that are not feature
            columns are ignored.
        model: Fitted classifier exposing ``predict_proba``.
        label_encoder: Fitted encoder exposing ``inverse_transform``, used to
            map class indices back to disease names.
        top_n: Maximum number of diseases to return.
        feature_names: Optional ordered sequence of feature column names.
            Defaults to the columns of the notebook-level ``X``.

    Returns:
        A list of ``(disease, probability)`` tuples, most probable first.
        The list is empty when none of the symptoms is a known feature or
        when ``top_n`` is not positive.
    """
    columns = list(X.columns if feature_names is None else feature_names)
    position_of = {name: idx for idx, name in enumerate(columns)}

    input_vector = [0] * len(columns)
    recognised_any = False
    for symptom in symptoms:
        idx = position_of.get(symptom)
        if idx is not None:
            input_vector[idx] = 1
            recognised_any = True

    # Without any recognised symptom there is nothing meaningful to score.
    # Previously this path crashed because `probas` was never assigned.
    if not recognised_any or top_n <= 0:
        return []

    # Score the completed vector once, after all symptoms have been applied.
    probas = model.predict_proba([input_vector])[0]
    # Descending order first, then truncate: same ranking as taking the last
    # `top_n` of the ascending sort and reversing it.
    top_indices = probas.argsort()[::-1][:top_n]
    top_diseases = label_encoder.inverse_transform(top_indices)
    return list(zip(top_diseases, probas[top_indices]))


In [80]:
# User interface: a multi-select list offering every feature column of X
# as a selectable symptom.
multi_select = widgets.SelectMultiple(
    description="Symptoms",
    options=list(X.columns),
    layout=widgets.Layout(height='200px', width='60%'),
)

In [81]:
# Button that triggers a prediction, and an Output widget that the click
# handler clears and prints into.
button = widgets.Button(description="Predict Disease")
output = widgets.Output()

In [82]:
# Button callback: reads the selected symptoms and requires at least 3 of them before predicting
def on_button_clicked(b):
    """Click handler for the predict button.

    Clears the output area, then either asks for more symptoms (when fewer
    than 3 are selected) or prints the three top-ranked diseases with their
    probabilities shown as percentages.

    Args:
        b: Argument passed by the click callback; not used here.
    """
    output.clear_output()
    with output:
        chosen_symptoms = [*multi_select.value]
        if len(chosen_symptoms) >= 3:
            ranking = predict_top_diseases(chosen_symptoms, model, le, top_n=3)
            print("Top likely diseases based on your symptoms:\n")
            for rank, (name, probability) in enumerate(ranking, start=1):
                print(f"{rank}. {name} — {probability*100:.1f}% chance")
        else:
            print("Please select at least 3 symptoms for meaningful prediction.")

In [83]:
# Register on_button_clicked as the button's click callback.
button.on_click(on_button_clicked)

In [84]:
# Render the symptom picker, the predict button and the output area, in that order.
display(multi_select, button, output)

SelectMultiple(description='Symptoms', layout=Layout(height='200px', width='60%'), options=('itching', 'skin_r…

Button(description='Predict Disease', style=ButtonStyle())

Output()