In [2]:
import pandas as pd  # For handling data
import numpy as np   # For numerical operations
from sklearn.model_selection import train_test_split  # Splitting dataset
from sklearn.preprocessing import LabelEncoder  # Encoding text labels
from sklearn.ensemble import RandomForestClassifier  # ML model
from sklearn.metrics import accuracy_score  # For evaluating the model
import joblib  # For saving the model

In [4]:
# Load the symptom/prognosis table from the CSV in the working directory.
df = pd.read_csv("symbipredict_2022.csv")

# Quick inspection: first rows, column index, and per-column null counts.
for summary in (df.head(), df.columns, df.isnull().sum()):
    print(summary)

   itching  skin_rash  nodal_skin_eruptions  continuous_sneezing  shivering  \
0        1          1                     1                    0          0   
1        0          1                     1                    0          0   
2        1          0                     1                    0          0   
3        1          1                     0                    0          0   
4        1          1                     1                    0          0   

   chills  joint_pain  stomach_pain  acidity  ulcers_on_tongue  ...  \
0       0           0             0        0                 0  ...   
1       0           0             0        0                 0  ...   
2       0           0             0        0                 0  ...   
3       0           0             0        0                 0  ...   
4       0           0             0        0                 0  ...   

   blackheads  scurring  skin_peeling  silver_like_dusting  \
0           0         0             

In [5]:
# Separate features (X) and target (y).
X = df.drop(columns=['prognosis'])  # every column except the label is an input
y = df['prognosis']                 # label column; encoded to integers below

# Encode the label values as integers. `le` is kept so integer predictions
# can be mapped back to the original labels with le.inverse_transform().
le = LabelEncoder()
y = le.fit_transform(y)

# Hold out 20% of the rows for testing. stratify=y keeps each class's share
# the same in the train and test splits, so no class ends up under- or
# over-represented in the evaluation set by chance. random_state fixes the
# shuffle so the split is reproducible.
# NOTE(review): if df contains duplicate rows, identical samples can land in
# both splits and inflate the test score -- TODO confirm and de-duplicate
# before splitting if so.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Training samples:", len(X_train))
print("Testing samples:", len(X_test))


Training samples: 3968
Testing samples: 993


In [6]:
# Build a 100-tree random forest (seeded for reproducibility) and fit it on
# the training split in one step; fit() returns the fitted estimator.
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the held-out split: predict, then compare against the true labels.
# NOTE(review): if this reports a perfect score, check whether the same rows
# appear in both the training and test splits -- TODO confirm.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Model Accuracy:", accuracy)


Model Accuracy: 1.0


In [7]:
# Persist the two objects needed to make predictions later: the fitted
# classifier and the label encoder used to decode its integer outputs.
for artifact, filename in (
    (model, "disease_prediction_model.pkl"),
    (le, "label_encoder.pkl"),
):
    joblib.dump(artifact, filename)

print("Model saved successfully!")


Model saved successfully!
