In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential,Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout, Lambda
import tensorflow as tf




In [3]:
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\heman\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\heman\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\heman\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\heman\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [65]:
file_path ="ano.xlsx"

# Read the Excel file into a DataFrame
df = pd.read_excel(file_path)

In [66]:
df.head()

Unnamed: 0,Disease,Age,Symptoms_Description
0,Influenza (Flu),30,"I woke up suddenly with a high fever, around ..."
1,Dengue,28,"The fever hit me like a wave, reaching 104°F...."
2,Malaria,32,I've been feeling unwell for the past few day...
3,Typhoid,27,"I've been having a prolonged fever, around 10..."
4,Common Cold,25,"I caught a cold, and it started with sneezing..."


In [67]:
#df['Disease'].unique()

In [68]:
df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)

In [69]:
df['Disease'].unique()

array(['Influenza (Flu)', 'Dengue', 'Malaria', 'Typhoid', 'Common Cold'],
      dtype=object)

In [70]:
lemmatizer = WordNetLemmatizer()

In [71]:
#nltk stopwords
stop_words = set(stopwords.words('english'))
print(stop_words)

{'by', 'what', 'the', "she's", 'won', 'my', 'no', 'his', 'off', 'wouldn', 'not', 'that', 'about', "didn't", 'in', 'him', 'd', 've', 'during', "you'd", 'am', 'until', 'she', 'yourself', 'further', 's', 'doing', 'its', 'themselves', 'out', 't', 'should', 'such', 'each', 'aren', 'into', 'ain', "won't", "weren't", 'can', 'was', 'most', 'here', 'very', 'himself', 'her', 'itself', 'couldn', 'haven', 'has', 'they', 'i', 'this', 'up', 'ours', "you'll", 're', "isn't", 'to', 'don', "mightn't", 'yourselves', 'any', 'own', 'he', 'shan', 'why', 'at', "couldn't", "needn't", 'do', 'for', 'are', 'both', 'whom', "hadn't", 'or', "hasn't", 'before', 'as', 'mightn', 'm', "that'll", 'just', 'same', 'which', 'our', 'be', 'does', 'mustn', 'few', 'isn', "aren't", 'their', 'o', 'more', 'after', 'so', 'other', 'did', "it's", 'theirs', 'y', 'you', 'them', 'above', 'too', "wouldn't", 'hasn', 'an', 'over', "shouldn't", "shan't", 'some', "you've", 'down', 'there', 'how', 'had', 'when', 'if', 'hers', 'where', 'from'

In [72]:
#spacy stopwords
#import spacy
#nlp = spacy.load("en_core_web_sm")
#stop_words = spacy.lang.en.stop_words.STOP_WORDS
#print(stop_words)

In [73]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = word_tokenize(text)
    filtered_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(filtered_tokens)

In [74]:
df.columns

Index(['Disease', 'Age', 'Symptoms_Description'], dtype='object')

In [75]:
df['Age'] = df['Age'].astype(int)

In [76]:
df.columns

Index(['Disease', 'Age', 'Symptoms_Description'], dtype='object')

In [77]:
df['Cleaned_Symptoms'] = df['Symptoms_Description'].apply(preprocess_text)

In [78]:
label_encoder = LabelEncoder()
df['Encoded_Disease'] = label_encoder.fit_transform(df['Disease'])

In [79]:
X_train, X_test, y_train, y_test = train_test_split(
    df['Cleaned_Symptoms'], df['Encoded_Disease'], test_size=0.2, random_state=42
)

In [80]:
df['Cleaned_Symptoms'][4]

'caught cold started sneezing runny nose throat feel scratchy sore cough thats bothering there mucus dripping throat also feel bit feverish day symptom improving much'

In [81]:
# Tokenization and Padding
max_words = 1000  # Adjust based on your data
max_length = 20  # Adjust based on your data

In [82]:
tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)

In [83]:
X_train_padded = pad_sequences(X_train_sequences, maxlen=max_length, truncating='post')
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_length, truncating='post')

In [84]:
# embedding_vector_features = 40
# temperature_parameter = 0.8  # Adjust based on your preference

# input_text = Input(shape=(max_length,))
# embedding_layer = Embedding(max_words, embedding_vector_features, input_length=max_length)(input_text)
# lstm_layer = LSTM(100)(embedding_layer)
# dropout_layer = Dropout(0.3)(lstm_layer)

# scaled_temperature = Lambda(lambda x: x / temperature_parameter)

# concatenated_layer = tf.keras.layers.concatenate([dropout_layer, scaled_temperature])

# dense_layer = Dense(df['Encoded_Disease'].nunique(), activation='softmax')(concatenated_layer)

# model = Model(inputs=[input_text], outputs=dense_layer)

# model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# print(model.summary())


In [85]:
embedding_vector_features = 40

model = Sequential()
model.add(Embedding(max_words, embedding_vector_features, input_length=max_length))
model.add(LSTM(100))
model.add(Dropout(0.3))
model.add(Dense(df['Encoded_Disease'].nunique(), activation='softmax'))

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 20, 40)            40000     
                                                                 
 lstm_1 (LSTM)               (None, 100)               56400     
                                                                 
 dropout_1 (Dropout)         (None, 100)               0         
                                                                 
 dense_1 (Dense)             (None, 5)                 505       
                                                                 
Total params: 96905 (378.54 KB)
Trainable params: 96905 (378.54 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [86]:
model.fit(X_train_padded, y_train, validation_data=(X_test_padded, y_test), epochs=10, batch_size=64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x1c0ce7ba190>

In [87]:
loss, accuracy = model.evaluate(X_test_padded, y_test)
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy * 100:.2f}%")

Test Loss: 0.6903509497642517
Test Accuracy: 73.17%


In [88]:
len(df['Disease'].unique())

5

In [90]:
mxl=20
new_symptom = input()
new_symptom = preprocess_text(new_symptom)
print(new_symptom)
new_symptom_sequence = tokenizer.texts_to_sequences([new_symptom])
print(new_symptom_sequence)
new_symptom_padded = pad_sequences(new_symptom_sequence, maxlen=mxl, truncating='post')
print(new_symptom_padded)

predictions = model.predict([new_symptom_padded])[0]

# Get the top N predictions along with their probabilities
top_n = len(df['Disease'].unique())  # You can adjust this based on how many top predictions you want
top_indices = tf.math.top_k(predictions, k=top_n).indices.numpy()
top_probabilities = tf.math.top_k(predictions, k=top_n).values.numpy()

# Decode the label indices to actual disease labels
top_diseases = label_encoder.inverse_transform(top_indices)

# Print the results
for i in range(top_n):
    print(f"Prediction {i + 1}: {top_diseases[i]}, Probability: {top_probabilities[i]}")



[[]]
[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]
Prediction 1: Typhoid, Probability: 0.4038892686367035
Prediction 2: Common Cold, Probability: 0.19351652264595032
Prediction 3: Dengue, Probability: 0.17699331045150757
Prediction 4: Malaria, Probability: 0.12420020997524261
Prediction 5: Influenza (Flu), Probability: 0.10140068084001541


In [67]:
df['Disease'].unique()

array(['Influenza (Flu)', 'Dengue', 'Malaria', 'Typhoid', 'Common Cold'],
      dtype=object)