# Disease Prediction from Symptoms using Machine Learning a Flask App

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout, Input, Layer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import re
import tensorflow as tf
from sklearn.metrics import precision_score, recall_score, f1_score

# Custom Attention Layer
class AttentionLayer(Layer):
    def __init__(self):
        super(AttentionLayer, self).__init__()

    def build(self, input_shape):
        self.W = self.add_weight(name='attention_weight', shape=(input_shape[-1], input_shape[-1]), initializer='glorot_uniform', trainable=True)
        self.b = self.add_weight(name='attention_bias', shape=(input_shape[-1],), initializer='zeros', trainable=True)
        self.u = self.add_weight(name='context_vector', shape=(input_shape[-1],), initializer='glorot_uniform', trainable=True)
        super(AttentionLayer, self).build(input_shape)

    def call(self, x):
        score = tf.nn.tanh(tf.tensordot(x, self.W, axes=[2, 0]) + self.b)
        attention_weights = tf.nn.softmax(tf.tensordot(score, self.u, axes=[2, 0]), axis=1)
        context_vector = tf.reduce_sum(attention_weights[..., tf.newaxis] * x, axis=1)
        return context_vector

# Load and preprocess the dataset
file_path = 'Symptom2Disease.csv'
data = pd.read_csv(file_path)
data = data[['label', 'text']]

# Text cleaning function
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

data['text'] = data['text'].apply(clean_text)

# Encode labels
label_encoder = LabelEncoder()
data['label_encoded'] = label_encoder.fit_transform(data['label'])

# Prepare data for feature extraction and modeling
X = data['text']
y = data['label_encoded']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Tokenize text for LSTM
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad sequences
max_sequence_length = 150
X_train_pad = pad_sequences(X_train_seq, maxlen=max_sequence_length)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_sequence_length)

# Build enhanced LSTM model with Attention mechanism
input_layer = Input(shape=(max_sequence_length,))
embedding_layer = Embedding(input_dim=10000, output_dim=128, input_length=max_sequence_length)(input_layer)
lstm_layer = Bidirectional(LSTM(128, return_sequences=True, dropout=0.3))(embedding_layer)
attention_layer = AttentionLayer()(lstm_layer)
dense_layer = Dense(128, activation='relu')(attention_layer)
dropout_layer = Dropout(0.4)(dense_layer)
output_layer = Dense(len(label_encoder.classes_), activation='softmax')(dropout_layer)

model = Model(inputs=input_layer, outputs=output_layer)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Custom Callback to Display Precision, Recall, and F1-Score After Every Epoch
class MetricsCallback(tf.keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs=None):
        y_pred_probs = model.predict(X_test_pad, verbose=0)
        y_pred = np.argmax(y_pred_probs, axis=1)

        precision = precision_score(y_test, y_pred, average='weighted')
        recall = recall_score(y_test, y_pred, average='weighted')
        f1 = f1_score(y_test, y_pred, average='weighted')

        print(f"\nEpoch {epoch+1}: Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}")

# Train the model with the custom callback
history = model.fit(X_train_pad, y_train, validation_data=(X_test_pad, y_test), 
                    epochs=25, batch_size=32, verbose=1, callbacks=[MetricsCallback()])

# Evaluate the model
accuracy = model.evaluate(X_test_pad, y_test, verbose=0)[1]
print(f"Enhanced Model with Attention Accuracy: {accuracy * 100:.2f}%")

# Function to predict disease from user input
def predict_disease(symptoms):
    symptoms_cleaned = clean_text(symptoms)
    seq = tokenizer.texts_to_sequences([symptoms_cleaned])
    pad = pad_sequences(seq, maxlen=max_sequence_length)
    pred = model.predict(pad)
    predicted_label = label_encoder.inverse_transform([np.argmax(pred)])[0]
    return predicted_label


Epoch 1/25
Epoch 1: Precision: 0.0016, Recall: 0.0395, F1-Score: 0.0030
Epoch 2/25


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 2: Precision: 0.0016, Recall: 0.0395, F1-Score: 0.0030
Epoch 3/25
 1/32 [..............................] - ETA: 5s - loss: 3.2654 - accuracy: 0.0312

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 3: Precision: 0.0196, Recall: 0.0751, F1-Score: 0.0272
Epoch 4/25
 1/32 [..............................] - ETA: 5s - loss: 3.2309 - accuracy: 0.0625

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 4: Precision: 0.1787, Recall: 0.2372, F1-Score: 0.1408
Epoch 5/25
 1/32 [..............................] - ETA: 5s - loss: 3.1029 - accuracy: 0.1562

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 5: Precision: 0.4038, Recall: 0.4348, F1-Score: 0.3605
Epoch 6/25
 1/32 [..............................] - ETA: 5s - loss: 2.4580 - accuracy: 0.3750

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 6: Precision: 0.5304, Recall: 0.5731, F1-Score: 0.5221
Epoch 7/25
 1/32 [..............................] - ETA: 5s - loss: 1.2650 - accuracy: 0.5938

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 7: Precision: 0.6617, Recall: 0.6680, F1-Score: 0.6431
Epoch 8/25
 1/32 [..............................] - ETA: 5s - loss: 0.7334 - accuracy: 0.7188

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 8: Precision: 0.7882, Recall: 0.7905, F1-Score: 0.7787
Epoch 9/25
 1/32 [..............................] - ETA: 5s - loss: 0.4489 - accuracy: 0.9062

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 9: Precision: 0.8297, Recall: 0.8142, F1-Score: 0.8109
Epoch 10/25
 1/32 [..............................] - ETA: 5s - loss: 0.5079 - accuracy: 0.7812

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 10: Precision: 0.7895, Recall: 0.7747, F1-Score: 0.7664
Epoch 11/25
 1/32 [..............................] - ETA: 5s - loss: 0.2589 - accuracy: 0.9062

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 11: Precision: 0.8539, Recall: 0.8379, F1-Score: 0.8358
Epoch 12/25
Epoch 12: Precision: 0.8511, Recall: 0.8300, F1-Score: 0.8259
Epoch 13/25
 1/32 [..............................] - ETA: 5s - loss: 0.1149 - accuracy: 1.0000

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 13: Precision: 0.8569, Recall: 0.8498, F1-Score: 0.8437
Epoch 14/25
Epoch 14: Precision: 0.8780, Recall: 0.8617, F1-Score: 0.8626
Epoch 15/25
Epoch 15: Precision: 0.8825, Recall: 0.8656, F1-Score: 0.8662
Epoch 16/25
 1/32 [..............................] - ETA: 5s - loss: 0.0690 - accuracy: 0.9688

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 16: Precision: 0.8804, Recall: 0.8775, F1-Score: 0.8724
Epoch 17/25
 1/32 [..............................] - ETA: 5s - loss: 0.0666 - accuracy: 0.9688

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 17: Precision: 0.8913, Recall: 0.8775, F1-Score: 0.8780
Epoch 18/25
 1/32 [..............................] - ETA: 5s - loss: 0.0193 - accuracy: 1.0000

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 18: Precision: 0.8902, Recall: 0.8854, F1-Score: 0.8820
Epoch 19/25
 1/32 [..............................] - ETA: 5s - loss: 0.0308 - accuracy: 1.0000

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 19: Precision: 0.9029, Recall: 0.8735, F1-Score: 0.8778
Epoch 20/25
Epoch 20: Precision: 0.8822, Recall: 0.8775, F1-Score: 0.8748
Epoch 21/25
 1/32 [..............................] - ETA: 6s - loss: 0.0243 - accuracy: 1.0000

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 21: Precision: 0.8901, Recall: 0.8854, F1-Score: 0.8827
Epoch 22/25
 1/32 [..............................] - ETA: 5s - loss: 0.0155 - accuracy: 1.0000

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 22: Precision: 0.9011, Recall: 0.8893, F1-Score: 0.8821
Epoch 23/25
Epoch 23: Precision: 0.8994, Recall: 0.8933, F1-Score: 0.8906
Epoch 24/25
 1/32 [..............................] - ETA: 5s - loss: 0.0252 - accuracy: 1.0000

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 24: Precision: 0.9114, Recall: 0.9012, F1-Score: 0.9012
Epoch 25/25
Epoch 25: Precision: 0.9093, Recall: 0.8933, F1-Score: 0.8901
Enhanced Model with Attention Accuracy: 89.33%


In [3]:
model.save("disease_prediction_model.h5")

In [4]:
import pickle

# Assuming tokenizer and label_encoder are already created in your training script
with open("preprocessing.pkl", "wb") as f:
    pickle.dump({"tokenizer": tokenizer, "label_encoder": label_encoder}, f)

print("✅ preprocessing.pkl saved successfully!")


✅ preprocessing.pkl saved successfully!


In [5]:
# Example user interaction
user_input = input("Enter symptoms: ")
prediction = predict_disease(user_input)
print(f"Predicted Disease: {prediction}")

Enter symptoms:  fever headache 


Predicted Disease: Dengue
