In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical

# Step 1: Load dataset
df = pd.read_csv('/Training.csv')
df.columns = df.columns.str.strip()  # remove whitespace in columns
df['prognosis'] = df['prognosis'].str.strip()

# Step 2: Filter classes with at least 2 samples (to avoid stratify error)
class_counts = df['prognosis'].value_counts()
valid_classes = class_counts[class_counts >= 2].index
df_filtered = df[df['prognosis'].isin(valid_classes)].reset_index(drop=True)

print(f"Total samples after filtering: {len(df_filtered)}")
print(f"Unique classes: {df_filtered['prognosis'].nunique()}")

# Step 3: Prepare features and target
X = df_filtered.drop('prognosis', axis=1).values
y = df_filtered['prognosis']

# Step 4: Encode target labels
encoder = LabelEncoder()
y_encoded = encoder.fit_transform(y)
num_classes = len(np.unique(y_encoded))
y_categorical = to_categorical(y_encoded, num_classes=num_classes)

# Step 5: Train-test split with stratification
X_train, X_test, y_train, y_test = train_test_split(
    X, y_categorical, test_size=0.2, random_state=42, stratify=y_encoded
)

# Step 6: Build the neural network model
model = Sequential([
    Dense(128, input_shape=(X.shape[1],), activation='relu'),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(num_classes, activation='softmax')
])

# Step 7: Compile model
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Step 8: Train model
history = model.fit(
    X_train, y_train,
    epochs=50,
    batch_size=32,
    validation_split=0.2,
    verbose=1
)

# Step 9: Evaluate on test data
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy*100:.2f}%")

# Step 10: Predict on a new sample (example)
# Replace below with actual symptom input matching feature order (length == X.shape[1])
sample_input = np.array([[1, 1, 1] + [0]*(X.shape[1]-3)])  # Example: 1s for first 3 symptoms, rest 0



pred = model.predict(sample_input)
predicted_class = encoder.inverse_transform([np.argmax(pred)])
print("Predicted disease for sample input:", predicted_class[0])


Total samples after filtering: 4920
Unique classes: 41
Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.2086 - loss: 3.3176 - val_accuracy: 0.9429 - val_loss: 1.4111
Epoch 2/50
[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.8944 - loss: 1.1110 - val_accuracy: 0.9975 - val_loss: 0.1145
Epoch 3/50
[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.9745 - loss: 0.2429 - val_accuracy: 1.0000 - val_loss: 0.0243
Epoch 4/50
[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9897 - loss: 0.1182 - val_accuracy: 1.0000 - val_loss: 0.0091
Epoch 5/50
[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9913 - loss: 0.0690 - val_accuracy: 1.0000 - val_loss: 0.0047
Epoch 6/50
[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.9934 - loss: 0.0462 - val_accuracy: 1.0000 - val_loss: 0.0029
Epoch 7/50
[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━

In [4]:
import numpy as np

def predict_disease_from_text(input_text, model, encoder, df):
    """
    Predict disease from a text input of symptom names.

    Args:
        input_text (str): User input text containing symptom keywords.
        model (keras.Model): Trained neural network model.
        encoder (LabelEncoder): Fitted label encoder for 'prognosis'.
        df (pd.DataFrame): The original dataset DataFrame used for training.

    Returns:
        str: Predicted disease (prognosis).
    """
    # Get list of symptom columns (features) — assume all except 'prognosis'
    symptom_columns = [col for col in df.columns if col != 'prognosis']

    # Lowercase input for case-insensitive matching
    input_text_lower = input_text.lower()

    # Create feature vector filled with 0
    input_features = np.zeros(len(symptom_columns), dtype=int)

    # Map symptoms mentioned in input_text to feature vector
    for idx, symptom in enumerate(symptom_columns):
        # For matching, normalize symptom name by replacing underscores with spaces, lowercase
        symptom_name = symptom.replace('_', ' ').lower()

        # Check if symptom_name words are in the input text (simple substring match)
        # You can improve this with more advanced NLP if desired
        if symptom_name in input_text_lower:
            input_features[idx] = 1

    # Reshape and predict
    input_features_reshaped = input_features.reshape(1, -1)
    prediction_probs = model.predict(input_features_reshaped)
    predicted_index = np.argmax(prediction_probs, axis=1)[0]
    predicted_disease = encoder.inverse_transform([predicted_index])[0]

    return predicted_disease


In [11]:
# Example user input
user_input = "Itching, skin rash, and red sore around nose"
# Call the prediction function
disease_prediction = predict_disease_from_text(user_input, model, encoder, df)

print("Predicted Disease:", disease_prediction)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
Predicted Disease: Impetigo


In [12]:
import pickle

# Save the label encoder
with open('label_encoder.pkl', 'wb') as le_file:
    pickle.dump(encoder, le_file)

In [15]:
# Save the entire model to a HDF5 file
model.save('disease_prediction_model.h5')
print("Model saved to disease_prediction_model.h5")




Model saved to disease_prediction_model.h5
