In [8]:
import pandas as pd

# Load the dataset
file_path = 'modified_file_with_symptoms.csv'
df = pd.read_csv(file_path)

# Display the column names to understand the structure of the dataset
print(df.columns)


Index(['Disease', 'Symptom_1', 'Symptom_2', 'Symptom_3', 'Symptom_4',
       'Symptom_5', 'Symptom_6', 'Symptom_7', 'Symptom_8', 'Symptom_9',
       'Symptom_10', 'Symptom_11', 'Symptom_12', 'Symptom_13', 'Symptom_14',
       'Symptom_15', 'Symptom_16', 'Symptom_17'],
      dtype='object')


In [12]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import joblib
import numpy as np

# Load the cleaned dataset
df = pd.read_csv('modified_file_with_symptoms.csv')

# Collecting all symptoms columns
symptom_columns = [col for col in df.columns if 'Symptom' in col]
disease_column = 'Disease'

# Preprocess the dataset
x_train = df[symptom_columns].values.tolist()
y_train = df[disease_column].tolist()

# Flatten the symptoms for LabelEncoder
all_symptoms = set(symptom for symptoms in x_train for symptom in symptoms if pd.notna(symptom))
all_symptoms = list(all_symptoms)  # Convert to list

# Initialize LabelEncoder and OneHotEncoder
le = LabelEncoder()
ohe = OneHotEncoder(sparse_output=False)

# Fit the LabelEncoder to the entire symptom dataset
le.fit(all_symptoms)

# Label encode the x_train data
x_train_labeled = [le.transform([symptom for symptom in symptoms if pd.notna(symptom)]) for symptoms in x_train]

# Fit the OneHotEncoder to the entire set of label-encoded symptoms
ohe.fit(le.transform(all_symptoms).reshape(-1, 1))

# One-hot encode the reshaped x_train data
x_train_ohe = [ohe.transform(np.asarray(labeled).reshape(-1, 1)).flatten() for labeled in x_train_labeled]

# Ensure all rows have the same length by padding with zeros if necessary
max_length = max(len(row) for row in x_train_ohe)
x_train_ohe_padded = np.array([np.pad(row, (0, max_length - len(row)), 'constant') for row in x_train_ohe])

# Prepare y_train
le_y = LabelEncoder()
y_train_encoded = le_y.fit_transform(y_train)

# Split the data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(x_train_ohe_padded, y_train_encoded, test_size=0.2, random_state=42)

# Initialize the model
model = RandomForestClassifier()

# Train the model
model.fit(X_train, Y_train)

# Save the model and encoders
joblib.dump(model, 'disease_prediction_model.joblib')
joblib.dump(le_y, 'label_encoder_y.joblib')
joblib.dump(ohe, 'one_hot_encoder_x.joblib')
joblib.dump(le, 'label_encoder_x.joblib')

print("Model training complete and saved.")


            Disease   Symptom_1              Symptom_2              Symptom_3  \
0  Fungal infection     itching              skin_rash   nodal_skin_eruptions   
1  Fungal infection   skin_rash   nodal_skin_eruptions    dischromic _patches   
2  Fungal infection     itching   nodal_skin_eruptions    dischromic _patches   
3  Fungal infection     itching              skin_rash    dischromic _patches   
4  Fungal infection     itching              skin_rash   nodal_skin_eruptions   

              Symptom_4 Symptom_5 Symptom_6 Symptom_7 Symptom_8 Symptom_9  \
0   dischromic _patches       NaN       NaN       NaN       NaN       NaN   
1                   NaN       NaN       NaN       NaN       NaN       NaN   
2                   NaN       NaN       NaN       NaN       NaN       NaN   
3                   NaN       NaN       NaN       NaN       NaN       NaN   
4                   NaN       NaN       NaN       NaN       NaN       NaN   

  Symptom_10 Symptom_11 Symptom_12 Symptom_13 Symp

In [17]:
import numpy as np
import joblib

# Load the model and encoders
model = joblib.load('disease_prediction_model.joblib')
le_y = joblib.load('label_encoder_y.joblib')
ohe = joblib.load('one_hot_encoder_x.joblib')
le = joblib.load('label_encoder_x.joblib')

# Example input symptoms for prediction
input_data = ( "vomiting", "headache"," altered_sensorium")

# Verify if all input data labels are present in the encoder classes
missing_labels = [label for label in input_data if label not in le.classes_]
if missing_labels:
    print(f"Missing labels in encoder: {missing_labels}")
    # Handle missing labels here, e.g., raise an error or add handling logic

# Label encode the input data
try:
    labeled = le.transform(input_data)
except ValueError as e:
    print(f"Error during label encoding: {e}")
    # Handle the error or add missing labels logic here

# Convert labeled data to a NumPy array and reshape it
input_data_as_array = np.asarray(labeled).reshape(-1, 1)

# One-hot encode the reshaped data
labeled_ohe = ohe.transform(input_data_as_array).flatten()

# Ensure the input data has the correct shape for prediction
max_length = x_train_ohe_padded.shape[1]  # Ensure this matches training data
labeled_ohe = np.pad(labeled_ohe, (0, max_length - len(labeled_ohe)), 'constant').reshape(1, -1)

# Predict the disease
predicted_disease_encoded = model.predict(labeled_ohe)

# Decode the predicted disease
predicted_disease = le_y.inverse_transform(predicted_disease_encoded)
print(f'Predicted Disease: {predicted_disease[0]}')


Missing labels in encoder: ['vomiting', 'headache']
Error during label encoding: y contains previously unseen labels: 'vomiting'
Predicted Disease: Cervical spondylosis
