In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split

import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping


import regex as re
from sklearn.preprocessing import MultiLabelBinarizer


In [2]:
main_data = pd.read_csv("DATA/Full_Dataset.csv")

In [3]:
main_data.head(10)

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Dengue,fever,nausea,vomiting,rash,headache,pain_behind_the_eyes,joint_pain,muscle_pain,,,,,,,,,
1,Typhoid Fever,fever,fatigue,headache,nausea,abdominal_pain,rash,constipation,,,,,,,,,,
2,Typhoid Fever,fever,fatigue,headache,nausea,abdominal_pain,rash,diarrhea,,,,,,,,,,
3,Allergy,continuous_sneezing,shivering,chills,watering_from_eyes,,,,,,,,,,,,,
4,Allergy,shivering,chills,watering_from_eyes,,,,,,,,,,,,,,
5,Allergy,continuous_sneezing,chills,watering_from_eyes,,,,,,,,,,,,,,
6,Allergy,continuous_sneezing,shivering,watering_from_eyes,,,,,,,,,,,,,,
7,Allergy,continuous_sneezing,shivering,chills,,,,,,,,,,,,,,
8,Common Cold,continuous_sneezing,chills,fatigue,cough,fever,headache,swelled_lymph_nodes,malaise,phlegm,throat_irritation,redness_of_eyes,sinus_pressure,runny_nose,congestion,chest_pain,loss_of_smell,muscle_pain
9,Common Cold,continuous_sneezing,chills,fatigue,cough,fever,headache,swelled_lymph_nodes,malaise,phlegm,throat_irritation,redness_of_eyes,sinus_pressure,runny_nose,congestion,chest_pain,loss_of_smell,muscle_pain


In [4]:
main_data.shape

(114, 18)

In [5]:
main_data.Disease.value_counts()

Disease
Common Cold             10
Fungal infection        10
Drug Reaction           10
Diabetes                10
Gastroenteritis         10
Migraine                10
Cervical spondylosis    10
Chronic cholestasis      9
Peptic ulcer diseae      9
Bronchial Asthma         9
Hypertension             9
Allergy                  5
Typhoid Fever            2
Dengue                   1
Name: count, dtype: int64

In [6]:
df = main_data.copy()

In [7]:
df.dropna(axis=1, how='all', inplace=True)

In [8]:
df.fillna(0, inplace=True)

----

In [9]:
class CustomLabelEncoder(LabelEncoder):
    def __init__(self, start=0):
        self.start = start
        super().__init__()
    
    def fit_transform(self, y):
        encoded = super().fit_transform(y)
        encoded += self.start
        return encoded

In [10]:
flattened_series = df['Disease'].astype(str)

In [11]:
encoder = CustomLabelEncoder(start=200) # Here we tell the label encoder to start encoding from 200

In [12]:
encoded_values = encoder.fit_transform(flattened_series)

In [13]:
df['Disease'] = encoded_values

In [14]:
mapping_data = {'label_encoder': encoder}

In [15]:
label_mapping = {k: v for k, v in zip(mapping_data['label_encoder'].classes_, range(200, 200+len(mapping_data['label_encoder'].classes_)))}

-----

In [16]:
df = main_data.copy()

In [17]:
df['All Symptoms'] = df.apply(lambda row: ','.join(row.dropna()), axis=1)
df['All Symptoms'] = df['All Symptoms'].apply(lambda x: ','.join(sorted(set(x.split(','))) if x else ''))

In [18]:
stay_cols= ['Disease', 'All Symptoms']
stay_cols

['Disease', 'All Symptoms']

In [19]:
df = df[stay_cols]

In [20]:
df.head()

Unnamed: 0,Disease,All Symptoms
0,Dengue,"Dengue,fever,headache,joint_pain,muscle_pain,n..."
1,Typhoid Fever,"Typhoid Fever,abdominal_pain,constipation,fati..."
2,Typhoid Fever,"Typhoid Fever,abdominal_pain,diarrhea,fatigue,..."
3,Allergy,"chills, continuous_sneezing, shivering, water..."
4,Allergy,"chills, shivering, watering_from_eyes,Allergy"


In [21]:
df['All Symptoms'][0]

'Dengue,fever,headache,joint_pain,muscle_pain,nausea,pain_behind_the_eyes,rash,vomiting'

In [22]:
def strip_to_basic_tokens(text):
    # Remove doble spaces and underscores
    text = re.sub(r'[_\s]+', ' ', text)
    # Split by commas and lowercase the tokens
    tokens = [token.strip().lower() for token in text.split(',')]
    return tokens

In [23]:
df['Basic Tokens'] = df['All Symptoms'].apply(strip_to_basic_tokens)
df['Basic Tokens'] = df['Basic Tokens'].apply(lambda x: ', '.join(x))

In [24]:
df = df.drop(['All Symptoms'], axis = 1)

In [25]:
df.head()

Unnamed: 0,Disease,Basic Tokens
0,Dengue,"dengue, fever, headache, joint pain, muscle pa..."
1,Typhoid Fever,"typhoid fever, abdominal pain, constipation, f..."
2,Typhoid Fever,"typhoid fever, abdominal pain, diarrhea, fatig..."
3,Allergy,"chills, continuous sneezing, shivering, wateri..."
4,Allergy,"chills, shivering, watering from eyes, allergy"


In [26]:
df['Basic Tokens'][0]

'dengue, fever, headache, joint pain, muscle pain, nausea, pain behind the eyes, rash, vomiting'

In [27]:
dfE = df.copy() # Taking a copy because we never know what might happen
dfE['Basic Tokens'] = dfE['Basic Tokens'].apply(lambda x: x.split(', '))

In [28]:
dfE['Basic Tokens'].head()

0    [dengue, fever, headache, joint pain, muscle p...
1    [typhoid fever, abdominal pain, constipation, ...
2    [typhoid fever, abdominal pain, diarrhea, fati...
3    [chills, continuous sneezing, shivering, water...
4     [chills, shivering, watering from eyes, allergy]
Name: Basic Tokens, dtype: object

In [29]:
mlb = MultiLabelBinarizer()
# Fit and transform the 'Basic Tokens' column
one_hot_encoded = pd.DataFrame(mlb.fit_transform(dfE['Basic Tokens']), columns=mlb.classes_, index=df.index)

In [30]:
# Concatenate the one-hot encoded DataFrame with the original DataFrame
df_encoded = pd.concat([dfE, one_hot_encoded], axis=1)

In [31]:
df_encoded

Unnamed: 0,Disease,Basic Tokens,abdominal pain,acidity,allergy,back pain,blurred and distorted vision,breathlessness,bronchial asthma,burning micturition,...,swelled lymph nodes,throat irritation,typhoid fever,visual disturbances,vomiting,watering from eyes,weakness in limbs,weight loss,yellowing of eyes,yellowish skin
0,Dengue,"[dengue, fever, headache, joint pain, muscle p...",0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,Typhoid Fever,"[typhoid fever, abdominal pain, constipation, ...",1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,Typhoid Fever,"[typhoid fever, abdominal pain, diarrhea, fati...",1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,Allergy,"[chills, continuous sneezing, shivering, water...",0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,Allergy,"[chills, shivering, watering from eyes, allergy]",0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109,Cervical spondylosis,"[back pain, loss of balance, neck pain, weakne...",0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
110,Cervical spondylosis,"[back pain, dizziness, neck pain, weakness in ...",0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
111,Cervical spondylosis,"[back pain, dizziness, loss of balance, neck p...",0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
112,Cervical spondylosis,"[dizziness, loss of balance, neck pain, weakne...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [32]:
# Drop the 'Basic Tokens' column
df_encoded = df_encoded.drop(columns=['Basic Tokens'])
df_encoded.head()

Unnamed: 0,Disease,abdominal pain,acidity,allergy,back pain,blurred and distorted vision,breathlessness,bronchial asthma,burning micturition,cervical spondylosis,...,swelled lymph nodes,throat irritation,typhoid fever,visual disturbances,vomiting,watering from eyes,weakness in limbs,weight loss,yellowing of eyes,yellowish skin
0,Dengue,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,Typhoid Fever,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,Typhoid Fever,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,Allergy,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,Allergy,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [33]:
df_encoded.shape

(114, 82)

In [34]:
disease_names = [key for key in label_mapping.keys()]
diseases = [strip_to_basic_tokens(disease) for disease in disease_names]

In [35]:
diseases

[['allergy'],
 ['bronchial asthma'],
 ['cervical spondylosis'],
 ['chronic cholestasis'],
 ['common cold'],
 ['dengue'],
 ['diabetes'],
 ['drug reaction'],
 ['fungal infection'],
 ['gastroenteritis'],
 ['hypertension'],
 ['migraine'],
 ['peptic ulcer diseae'],
 ['typhoid fever']]

In [36]:
diseases_cleaned = [item[0] if isinstance(item, list) else item for item in diseases]

In [37]:
diseases_cleaned

['allergy',
 'bronchial asthma',
 'cervical spondylosis',
 'chronic cholestasis',
 'common cold',
 'dengue',
 'diabetes',
 'drug reaction',
 'fungal infection',
 'gastroenteritis',
 'hypertension',
 'migraine',
 'peptic ulcer diseae',
 'typhoid fever']

In [38]:
df_encoded = df_encoded.drop(diseases_cleaned, axis = 1)

In [39]:
df_encoded.head()

Unnamed: 0,Disease,abdominal pain,acidity,back pain,blurred and distorted vision,breathlessness,burning micturition,chest pain,chills,congestion,...,sunken eyes,swelled lymph nodes,throat irritation,visual disturbances,vomiting,watering from eyes,weakness in limbs,weight loss,yellowing of eyes,yellowish skin
0,Dengue,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,Typhoid Fever,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Typhoid Fever,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Allergy,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
4,Allergy,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0


In [40]:
df_encoded.shape

(114, 68)

In [41]:
model_features = df_encoded.columns.tolist()
model_features.remove("Disease")
X = df_encoded[model_features]
y = df_encoded["Disease"]

In [42]:
X

Unnamed: 0,abdominal pain,acidity,back pain,blurred and distorted vision,breathlessness,burning micturition,chest pain,chills,congestion,constipation,...,sunken eyes,swelled lymph nodes,throat irritation,visual disturbances,vomiting,watering from eyes,weakness in limbs,weight loss,yellowing of eyes,yellowish skin
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
110,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
111,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
112,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [43]:
y

0                    Dengue
1             Typhoid Fever
2             Typhoid Fever
3                   Allergy
4                   Allergy
               ...         
109    Cervical spondylosis
110    Cervical spondylosis
111    Cervical spondylosis
112    Cervical spondylosis
113    Cervical spondylosis
Name: Disease, Length: 114, dtype: object

In [44]:
y_encoded = pd.get_dummies(y)
y_encoded.shape

(114, 14)

In [45]:
y_encoded.head()

Unnamed: 0,Allergy,Bronchial Asthma,Cervical spondylosis,Chronic cholestasis,Common Cold,Dengue,Diabetes,Drug Reaction,Fungal infection,Gastroenteritis,Hypertension,Migraine,Peptic ulcer diseae,Typhoid Fever
0,False,False,False,False,False,True,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False,True
2,False,False,False,False,False,False,False,False,False,False,False,False,False,True
3,True,False,False,False,False,False,False,False,False,False,False,False,False,False
4,True,False,False,False,False,False,False,False,False,False,False,False,False,False


In [46]:
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size = 0.25, random_state=42)
X_eval, X_test, y_eval, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

In [47]:
X_train_tensor = tf.convert_to_tensor(X_train.values, dtype=tf.float32)
X_test_tensor = tf.convert_to_tensor(X_test.values, dtype=tf.float32)
X_eval_tensor = tf.convert_to_tensor(X_eval.values, dtype=tf.float32)
y_train_tensor = tf.convert_to_tensor(y_train, dtype=tf.float64)
y_test_tensor = tf.convert_to_tensor(y_test, dtype=tf.float64)
y_eval_tensor = tf.convert_to_tensor(y_eval, dtype=tf.float64)

In [48]:
with tf.device('/GPU:0'):
    model_2 = keras.Sequential([
        layers.Input(shape=(X_train_tensor.shape[1],)),
        layers.Dense(160, activation='relu'),
        layers.Dropout(0.1),
        layers.Dense(200, activation='relu'),
        layers.Dropout(0.2),
        layers.Dense(240, activation='tanh'),
        layers.BatchNormalization(),
        layers.Dense(240, activation='tanh'),
        layers.Dropout(0.2),
        layers.Dense(200, activation='relu'),
        layers.Dropout(0.1),
        layers.Dense(160, activation='relu'),
        layers.Dense(y_train_tensor.shape[1], activation='softmax')])
    model_2.compile(optimizer= 'adam', loss='categorical_crossentropy', metrics=['accuracy'])
    early_stopping = EarlyStopping(monitor='val_accuracy', patience=4, mode='max')
    history = model_2.fit(X_train_tensor, y_train_tensor, epochs=500, callbacks=[early_stopping],
                batch_size=16, validation_data=(X_eval_tensor, y_eval_tensor))

Epoch 1/500
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 69ms/step - accuracy: 0.0978 - loss: 2.5911 - val_accuracy: 0.5000 - val_loss: 2.4829
Epoch 2/500
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.7991 - loss: 1.5112 - val_accuracy: 0.6429 - val_loss: 2.2955
Epoch 3/500
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.9216 - loss: 0.7331 - val_accuracy: 0.7143 - val_loss: 2.0113
Epoch 4/500
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.9767 - loss: 0.2862 - val_accuracy: 1.0000 - val_loss: 1.6989
Epoch 5/500
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 1.0000 - loss: 0.1398 - val_accuracy: 1.0000 - val_loss: 1.3592
Epoch 6/500
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 1.0000 - loss: 0.0433 - val_accuracy: 1.0000 - val_loss: 1.0842
Epoch 7/500
[1m6/6[0m [32m━━━━━━━━━━━━━

In [49]:
# If you remember in the first model, we took a row from the origial data to test the model
# We aren't going to do this here, let's REALLY test it
#user_input = ['stomach_pain','acidity','chest_pain'] # This should be GERD

user_input = ["continuous_sneezing", "chills", "fatigue", "cough", "fever", "headache"]
original_data = df_encoded.copy()

#  "skin_rash", "stomach_pain"	 "spotting_ urination"	

#  "acidity", "indigestion", "headache", "blurred_and_distorted_vision", "excessive_hunger", "stiff_neck", "irritability",  "visual_disturbances"


# We will change the strip_to_basic_tokens function just a little bit to be able to deal with the user input
def strip_to_basic_tokens(symptoms):
    symptoms = [symptom.strip().lower().replace(' ', '_').replace('_', ' ') for symptom in symptoms]
    return [re.sub(r'\s+', ' ', symptom) for symptom in symptoms]
# Apply strip_to_basic_tokens function to user input
user_input_stripped = strip_to_basic_tokens(user_input)

# Initialize MultiLabelBinarizer with all symptoms
mlb = MultiLabelBinarizer(classes=df_encoded.columns)

# Fit and transform user input
user_input_encoded = pd.DataFrame(mlb.fit_transform([user_input_stripped]), columns=mlb.classes_)

# Concatenate user input with original data
final_user_input = pd.concat([pd.DataFrame(columns=original_data.columns), user_input_encoded], axis=0)
final_user_input = final_user_input.drop(['Disease'],axis = 1)
# Print the final user input shape
final_user_input.head()

Unnamed: 0,abdominal pain,acidity,back pain,blurred and distorted vision,breathlessness,burning micturition,chest pain,chills,congestion,constipation,...,sunken eyes,swelled lymph nodes,throat irritation,visual disturbances,vomiting,watering from eyes,weakness in limbs,weight loss,yellowing of eyes,yellowish skin
0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [50]:
user_tensor = tf.convert_to_tensor(final_user_input.values, dtype=tf.float32)
user_tensor[0]

<tf.Tensor: shape=(67,), dtype=float32, numpy=
array([0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 1., 0., 0., 0., 0., 0.,
       0., 0., 0., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
      dtype=float32)>

---------

In [51]:
target_index = y_encoded.columns.tolist() 

In [52]:
predict_proba = model_2.predict(user_tensor)
predicted_class_index = np.argmax(predict_proba)
prediction_encode = target_index[predicted_class_index]
inverse_label_encoding = {v: k for k, v in label_mapping.items()}
prediction = inverse_label_encoding[prediction_encode]
prediction

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 212ms/step


KeyError: 'Allergy'