In [25]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns


In [26]:
# Load
train_df = pd.read_csv("Training.csv")
test_df  = pd.read_csv("Testing.csv")

# Remove unwanted unnamed columns
train_df = train_df.loc[:, ~train_df.columns.str.contains('^Unnamed')]
test_df  = test_df.loc[:, ~test_df.columns.str.contains('^Unnamed')]

# Fill missing values with 0 (no symptom)
train_df = train_df.fillna(0)
test_df  = test_df.fillna(0)

# Drop duplicates
train_df = train_df.drop_duplicates().reset_index(drop=True)
test_df  = test_df.drop_duplicates().reset_index(drop=True)

# Verify
print("Training shape:", train_df.shape)
print("Testing shape :", test_df.shape)
print("Unique diseases in training:", train_df['prognosis'].nunique())


Training shape: (304, 133)
Testing shape : (42, 133)
Unique diseases in training: 41


In [None]:
# Separate Features & Labels
X = train_df.drop("prognosis", axis=1)
y = train_df["prognosis"]

# Encode the disease names
le = LabelEncoder()
y_encoded = le.fit_transform(y)

print("Number of symptoms:", X.shape[1])
print("Number of diseases:", len(le.classes_))


Number of symptoms: 132
Number of diseases: 41


In [None]:
# Split Data into Validation & Training set
X_train, X_val, y_train, y_val = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

print("Training set:", X_train.shape)
print("Validation set:", X_val.shape)


Training set: (243, 132)
Validation set: (61, 132)


In [None]:
# Build and Train the Random Forest Model
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X_train, y_train)

# Evaluate
y_pred = rf.predict(X_val)
print("Validation Accuracy:", accuracy_score(y_val, y_pred))
print("\nClassification Report:\n", classification_report(y_val, y_pred))


Validation Accuracy: 1.0

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         1
           2       1.00      1.00      1.00         1
           3       1.00      1.00      1.00         2
           4       1.00      1.00      1.00         1
           5       1.00      1.00      1.00         1
           6       1.00      1.00      1.00         1
           7       1.00      1.00      1.00         1
           8       1.00      1.00      1.00         2
           9       1.00      1.00      1.00         2
          10       1.00      1.00      1.00         2
          11       1.00      1.00      1.00         2
          12       1.00      1.00      1.00         2
          13       1.00      1.00      1.00         1
          14       1.00      1.00      1.00         1
          15       1.00      1.00      1.00         1
          16       1.00      1.

In [None]:
# Hyperparameter Tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

grid = GridSearchCV(RandomForestClassifier(random_state=42),
                    param_grid, cv=3, n_jobs=-1, verbose=1)
grid.fit(X_train, y_train)

print("Best parameters:", grid.best_params_)

best_rf = grid.best_estimator_
val_preds = best_rf.predict(X_val)
print("Tuned Validation Accuracy:", accuracy_score(y_val, val_preds))


Fitting 3 folds for each of 36 candidates, totalling 108 fits
Best parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Tuned Validation Accuracy: 1.0


In [None]:
# Final Test
X_test = test_df.drop("prognosis", axis=1)
y_test = le.transform(test_df["prognosis"])

test_preds = best_rf.predict(X_test)
print("Final Test Accuracy:", accuracy_score(y_test, test_preds))


Final Test Accuracy: 0.9761904761904762


In [20]:
import pickle

# Save trained model
pickle.dump(best_rf, open("disease_model.pkl", "wb"))

# Save label encoder
pickle.dump(le, open("label_encoder.pkl", "wb"))

print("‚úÖ Model and label encoder saved successfully!")


‚úÖ Model and label encoder saved successfully!


In [32]:
symptom_list = list(X.columns)

def predict_disease(user_symptoms):
    input_data = [0] * len(symptom_list)
    for s in user_symptoms:
        s = s.strip().lower()
        if s in symptom_list:
            input_data[symptom_list.index(s)] = 1
    pred = best_rf.predict([input_data])[0]
    return le.inverse_transform([pred])[0]

# Example prediction
print(predict_disease(['itching', 'skin_rash', 'nodal_skin_eruptions']))


Fungal infection




In [34]:
print("Enter your symptoms one by one (type 'done' when finished):")

user_symptoms = []
while True:
    s = input("Symptom: ").strip().lower()
    if s == 'done':
        break
    elif s not in symptom_list:
        print("‚ö†Ô∏è Not recognized. Try again.")
    else:
        user_symptoms.append(s)

if len(user_symptoms) == 0:
    print("No symptoms entered.")
else:
    disease = predict_disease(user_symptoms)
    print("\nü©∫ Predicted Disease:", disease)


Enter your symptoms one by one (type 'done' when finished):
Symptom: itching
Symptom: chills
Symptom: shivering
Symptom: done

ü©∫ Predicted Disease: Allergy


