In [49]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib

In [55]:
# Load the dataset from new_dataset.csv, using the first CSV column as the
# DataFrame index instead of as a regular column.
data = pd.read_csv("new_dataset.csv", index_col=0)

In [56]:
# Preview the first ten rows as the cell output.
data.head(10)

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6
0,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,
1,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,,,
2,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,,,
3,Fungal infection,itching,skin_rash,dischromic _patches,,,
4,Fungal infection,itching,skin_rash,nodal_skin_eruptions,,,
5,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,,,
6,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,,,
7,Fungal infection,itching,skin_rash,dischromic _patches,,,
8,Fungal infection,itching,skin_rash,nodal_skin_eruptions,,,
9,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,


In [57]:
# Label-encode every text column so the scaler and the models receive integers.
# Each column gets its own LabelEncoder, stored in `label_encoders` under the
# column name so the integer codes can be mapped back to the original labels.
#
# NOTE: columns of `data` are overwritten in place, so this cell is meant to run
# exactly once after loading. A second run finds no text columns left and
# resets `label_encoders` to an empty dict.
label_encoders = {}
for col in data.columns:
    col_dtype = data[col].dtype
    # Accept both the legacy object dtype and pandas' dedicated string dtype.
    # Comparing against 'object' alone skips text columns whenever pandas
    # infers them as StringDtype, which would leave raw strings in the frame.
    is_text_column = (
        pd.api.types.is_object_dtype(col_dtype)
        or isinstance(col_dtype, pd.StringDtype)
    )
    if is_text_column:
        le = LabelEncoder()
        data[col] = le.fit_transform(data[col])
        label_encoders[col] = le

In [58]:
# Separate the prediction target ("Disease") from the feature columns.
y = data["Disease"]
X = data.drop(columns=["Disease"])

In [59]:
# Standardise the feature matrix: learn the scaling statistics from X, then
# apply them to X.
# NOTE(review): the scaler is fitted on all of X, i.e. before the train/test
# split that follows, so held-out rows contribute to the statistics — confirm
# this is acceptable for the reported test metrics.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

In [60]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the partition
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    random_state=42,
)

In [61]:
# Logistic-regression baseline, constructed with scikit-learn's default
# hyperparameters (no arguments are overridden here).
log_model = LogisticRegression()

In [62]:
# Random-forest classifier with default hyperparameters. The forest is built
# with randomness (bootstrap samples, feature subsets), so the seed is pinned
# to the same value used for the train/test split; otherwise every run yields
# a different model and different reported scores.
rf_model = RandomForestClassifier(random_state=42)

In [63]:
# Gradient-boosting classifier with default hyperparameters. The seed is pinned
# to the same value used for the train/test split so that repeated runs produce
# the same fitted model and the same reported scores.
gb_model = GradientBoostingClassifier(random_state=42)

In [64]:
# Train the three base classifiers on the same training partition, in the
# order: logistic regression, random forest, gradient boosting.
for estimator in (log_model, rf_model, gb_model):
    estimator.fit(X_train, y_train)

In [65]:
def evaluate(model, name):
    """Score a fitted model on the held-out split and print a summary.

    Predictions are made on the notebook-level ``X_test`` and compared with the
    notebook-level ``y_test``; both must exist before this is called.

    Args:
        model: Fitted estimator exposing ``predict``.
        name: Label used in the printed accuracy line.

    Returns:
        The accuracy of ``model`` on the test split.
    """
    predictions = model.predict(X_test)
    score = accuracy_score(y_test, predictions)
    headline = f"\n{name} Accuracy: {score:.4f}"
    print(headline)
    # Per-class precision / recall / F1 table, printed after the headline.
    report = classification_report(y_test, predictions)
    print(report)
    return score

In [66]:
# Evaluate each base model in turn and keep its test accuracy; the generator
# is consumed left to right, so the reports print in the listed order.
acc_log, acc_rf, acc_gb = (
    evaluate(fitted_model, display_name)
    for fitted_model, display_name in (
        (log_model, "Logistic Regression"),
        (rf_model, "Random Forest"),
        (gb_model, "Gradient Boosting"),
    )
)


Logistic Regression Accuracy: 0.8404
              precision    recall  f1-score   support

           0       0.75      0.83      0.79        18
           1       0.67      0.53      0.59        30
           2       0.68      0.79      0.73        24
           3       0.85      0.92      0.88        25
           4       0.60      1.00      0.75        24
           5       0.90      0.78      0.84        23
           6       0.80      0.97      0.88        33
           7       1.00      0.87      0.93        23
           8       1.00      0.90      0.95        21
           9       1.00      1.00      1.00        15
          10       0.84      0.70      0.76        23
          11       0.69      0.69      0.69        26
          12       0.70      0.90      0.79        21
          13       1.00      0.86      0.93        29
          14       1.00      0.67      0.80        24
          15       0.64      0.84      0.73        19
          16       0.85      1.00      0.92

In [67]:
# Build and persist the ensemble only when every base model clears 0.8 test
# accuracy; otherwise just print a hint and save nothing.
base_scores = (acc_log, acc_rf, acc_gb)
if not all(score > 0.8 for score in base_scores):
    print("Consider tuning the underperforming model(s).")
else:
    # Hard voting: the ensemble predicts the majority class label among the
    # three members.
    ensemble_members = [('lr', log_model), ('rf', rf_model), ('gb', gb_model)]
    voting_clf = VotingClassifier(estimators=ensemble_members, voting='hard')
    voting_clf.fit(X_train, y_train)
    evaluate(voting_clf, "Voting Classifier")

    # Persist the ensemble, then the scaler, then the label encoders.
    artifacts = {
        'final_model.pkl': voting_clf,
        'scaler.pkl': scaler,
        'encoders.pkl': label_encoders,
    }
    for target_path, artifact in artifacts.items():
        joblib.dump(artifact, target_path)
    print("Final model and encoders saved!")


Voting Classifier Accuracy: 0.9980
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        18
           1       1.00      1.00      1.00        30
           2       1.00      1.00      1.00        24
           3       1.00      1.00      1.00        25
           4       1.00      1.00      1.00        24
           5       1.00      1.00      1.00        23
           6       1.00      1.00      1.00        33
           7       1.00      1.00      1.00        23
           8       1.00      1.00      1.00        21
           9       1.00      1.00      1.00        15
          10       1.00      1.00      1.00        23
          11       1.00      1.00      1.00        26
          12       1.00      1.00      1.00        21
          13       1.00      1.00      1.00        29
          14       1.00      1.00      1.00        24
          15       1.00      1.00      1.00        19
          16       1.00      1.00      1.00  