In [None]:

import numpy as np
import pandas as pd
import pickle
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


# Load the raw table; it is expected to contain a "Disease" column plus one
# or more columns whose names start with "Symptom".
df = pd.read_csv("dataset.csv")

# Every column whose name begins with "Symptom" is treated as a symptom slot.
symptom_cols = [col for col in df.columns if col.startswith("Symptom")]

# Discard rows that have no symptom recorded in any slot. The explicit
# .copy() makes df_clean an independent frame, so adding columns to it later
# cannot raise SettingWithCopyWarning or write through to `df`.
df_clean = df.dropna(subset=symptom_cols, how='all').copy()


def combine_symptoms(row, columns=None):
    """Collect the non-empty symptom names of one row into a list.

    Args:
        row: A row of the dataset (a pandas Series indexed by column name).
        columns: Labels of the symptom columns to read. Defaults to the
            module-level ``symptom_cols`` when omitted.

    Returns:
        The stripped symptom strings in column order. Missing values and
        cells that are empty or whitespace-only are skipped.
    """
    if columns is None:
        columns = symptom_cols
    # Strip BEFORE testing for emptiness: a whitespace-only cell would
    # otherwise survive the filter and become an empty-string label.
    stripped = (str(value).strip() for value in row[columns] if pd.notna(value))
    return [symptom for symptom in stripped if symptom]

# Collapse the per-slot symptom columns into a single list of names per row.
df_clean["Symptoms_List"] = df_clean.apply(combine_symptoms, axis=1)
print("Dataset with Combined Symptoms:")
print(df_clean[["Disease", "Symptoms_List"]].head())


# One binary indicator column per distinct symptom name.
mlb = MultiLabelBinarizer()
symptom_encoded = mlb.fit_transform(df_clean["Symptoms_List"])
# Reuse df_clean's index. After dropna() its labels may have gaps, and a
# default RangeIndex here would make the axis=1 concat below align the wrong
# rows and introduce NaN-filled rows.
df_symptoms = pd.DataFrame(
    symptom_encoded, columns=mlb.classes_, index=df_clean.index
)
print("\nOne-Hot Encoded Symptom Columns:")
print(df_symptoms.head())

# Label column and indicator columns side by side, aligned on the shared index.
df_final = pd.concat([df_clean[["Disease"]], df_symptoms], axis=1)
print("\nFinal Preprocessed Dataset:")
print(df_final.tail())


# Target labels, and the feature matrix made of every remaining column.
y = df_final["Disease"]
X = df_final.drop("Disease", axis=1)


# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


# fit() returns the estimator itself, so construction and training chain.
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train, y_train
)


# Score the held-out rows.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)





Dataset with Combined Symptoms:
            Disease                                      Symptoms_List
0  Fungal infection  [itching, skin_rash, nodal_skin_eruptions, dis...
1  Fungal infection  [skin_rash, nodal_skin_eruptions, dischromic _...
2  Fungal infection  [itching, nodal_skin_eruptions, dischromic _pa...
3  Fungal infection          [itching, skin_rash, dischromic _patches]
4  Fungal infection         [itching, skin_rash, nodal_skin_eruptions]

One-Hot Encoded Symptom Columns:
   abdominal_pain  abnormal_menstruation  acidity  acute_liver_failure  \
0               0                      0        0                    0   
1               0                      0        0                    0   
2               0                      0        0                    0   
3               0                      0        0                    0   
4               0                      0        0                    0   

   altered_sensorium  anxiety  back_pain  belly_pain  blackhead