<a href="https://colab.research.google.com/github/Pankhuri-279/medical-assistant/blob/main/medical_assistant.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing libraries

In [47]:
import regex as re
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import MultiLabelBinarizer
import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Loading and Viewing the data

In [48]:
main_data = pd.read_csv("https://raw.githubusercontent.com/Pankhuri-279/medical-assistant/main/dataset.csv")

In [49]:
main_data.head(10)

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,
1,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
2,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
3,Fungal infection,itching,skin_rash,dischromic _patches,,,,,,,,,,,,,,
4,Fungal infection,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,
5,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
6,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
7,Fungal infection,itching,skin_rash,dischromic _patches,,,,,,,,,,,,,,
8,Fungal infection,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,
9,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,


In [50]:
main_data.sample(5)

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
2527,Common Cold,continuous_sneezing,chills,fatigue,cough,high_fever,headache,swelled_lymph_nodes,malaise,phlegm,throat_irritation,redness_of_eyes,sinus_pressure,runny_nose,congestion,chest_pain,loss_of_smell,muscle_pain
1005,Typhoid,chills,vomiting,fatigue,high_fever,headache,nausea,constipation,abdominal_pain,diarrhoea,belly_pain,,,,,,,
2246,hepatitis A,joint_pain,vomiting,yellowish_skin,dark_urine,loss_of_appetite,abdominal_pain,diarrhoea,mild_fever,yellowing_of_eyes,muscle_pain,,,,,,,
4839,Allergy,continuous_sneezing,shivering,chills,watering_from_eyes,,,,,,,,,,,,,
4026,Gastroenteritis,vomiting,sunken_eyes,dehydration,diarrhoea,,,,,,,,,,,,,


In [51]:
main_data.shape

(4920, 18)

# Label Encoding

#### We are going to label encode the Disease column first, then the rest

In [52]:
df = main_data.copy() # We take a copy of the original data incase we needed the original data later
df.dropna(axis=1, how='all', inplace=True) # Dropping rows which are all NaN
df.fillna(0, inplace=True)                 # Replacing the NaN with 0

# Creating a custom label encoder so we can specify which number the encoding starts from
class CustomLabelEncoder(LabelEncoder):
    def __init__(self, start=0):
        self.start = start
        super().__init__()

    def fit_transform(self, y):
        encoded = super().fit_transform(y)
        encoded += self.start
        return encoded

# Flatten the 'Disease' column into a single Series
flattened_series = df['Disease'].astype(str)

# Create and fit label encoder for the 'Disease' column
encoder = CustomLabelEncoder(start=200) # Here we tell the label encoder to start encoding from 200

*Why?* you might ask
Because if we just imported and fitted the usual label encoder, it will start indexing from 0.
*So?*
In the next step, we will label encoding the **rest** of the data, and that encoder will start from 0 to 131.
So we are trying to prevent different values from getting encoding the same way.

*BUT WHY ARE WE DOING THEM SEPARATLY?!* you might ask.
When I first wrote the code I thought this way would be easier than just encoding
the entire dataset, then separate the features from the targets in the label_mapping dictionary.

If you find this was complicated or impractical, that's okay, just label_encode the entire data then seperate the features from the labels. The end result will be the same: converting string into int

In [53]:
encoded_values = encoder.fit_transform(flattened_series)
df['Disease'] = encoded_values

mapping_data = {'label_encoder': encoder}

# Saving the mapping of the label column "Disease" to use later
label_mapping = {k: v for k, v in zip(mapping_data['label_encoder'].classes_, range(200, 200+len(mapping_data['label_encoder'].classes_)))}

df.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,215,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,0,0,0,0,0,0,0,0,0,0,0,0,0
1,215,skin_rash,nodal_skin_eruptions,dischromic _patches,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,215,itching,nodal_skin_eruptions,dischromic _patches,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,215,itching,skin_rash,dischromic _patches,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,215,itching,skin_rash,nodal_skin_eruptions,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [54]:
label_mapping

{'(vertigo) Paroymsal  Positional Vertigo': 200,
 'AIDS': 201,
 'Acne': 202,
 'Alcoholic hepatitis': 203,
 'Allergy': 204,
 'Arthritis': 205,
 'Bronchial Asthma': 206,
 'Cervical spondylosis': 207,
 'Chicken pox': 208,
 'Chronic cholestasis': 209,
 'Common Cold': 210,
 'Dengue': 211,
 'Diabetes ': 212,
 'Dimorphic hemmorhoids(piles)': 213,
 'Drug Reaction': 214,
 'Fungal infection': 215,
 'GERD': 216,
 'Gastroenteritis': 217,
 'Heart attack': 218,
 'Hepatitis B': 219,
 'Hepatitis C': 220,
 'Hepatitis D': 221,
 'Hepatitis E': 222,
 'Hypertension ': 223,
 'Hyperthyroidism': 224,
 'Hypoglycemia': 225,
 'Hypothyroidism': 226,
 'Impetigo': 227,
 'Jaundice': 228,
 'Malaria': 229,
 'Migraine': 230,
 'Osteoarthristis': 231,
 'Paralysis (brain hemorrhage)': 232,
 'Peptic ulcer diseae': 233,
 'Pneumonia': 234,
 'Psoriasis': 235,
 'Tuberculosis': 236,
 'Typhoid': 237,
 'Urinary tract infection': 238,
 'Varicose veins': 239,
 'hepatitis A': 240}

#### Now we are going to use the label encoder to encode the rest of the data

In [55]:
# Stack the entire data into a single Series.
# We are stacking the entire data because there're similar values in different columns. **REMEMBER THIS**
encode_df = df.copy() # Again, taking a copy because we might need the original later.
encode_df = encode_df.drop(["Disease"], axis = 1)
flattened_series = encode_df.stack().astype(str)

# Create and fit label encoder.
encoder = LabelEncoder()
encoded_values = encoder.fit_transform(flattened_series)

# Reshape the encoded values back to the original DataFrame shape.
F_encoded_df = pd.DataFrame(encoded_values.reshape(encode_df.shape), columns=encode_df.columns,
                            index=encode_df.index)

# Store the mapping data for future use
Fmapping_data = {'label_encoder': encoder}
feature_mapping = {k: v for k, v in zip(Fmapping_data['label_encoder'].classes_,
                                        Fmapping_data['label_encoder'].\
                                        transform(Fmapping_data['label_encoder'].classes_))}
F_encoded_df.head(3)

Unnamed: 0,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,131,99,72,32,130,130,130,130,130,130,130,130,130,130,130,130,130
1,99,72,32,130,130,130,130,130,130,130,130,130,130,130,130,130,130
2,131,72,32,130,130,130,130,130,130,130,130,130,130,130,130,130,130


In [56]:
feature_mapping

{' abdominal_pain': 0,
 ' abnormal_menstruation': 1,
 ' acidity': 2,
 ' acute_liver_failure': 3,
 ' altered_sensorium': 4,
 ' anxiety': 5,
 ' back_pain': 6,
 ' belly_pain': 7,
 ' blackheads': 8,
 ' bladder_discomfort': 9,
 ' blister': 10,
 ' blood_in_sputum': 11,
 ' bloody_stool': 12,
 ' blurred_and_distorted_vision': 13,
 ' breathlessness': 14,
 ' brittle_nails': 15,
 ' bruising': 16,
 ' burning_micturition': 17,
 ' chest_pain': 18,
 ' chills': 19,
 ' cold_hands_and_feets': 20,
 ' coma': 21,
 ' congestion': 22,
 ' constipation': 23,
 ' continuous_feel_of_urine': 24,
 ' continuous_sneezing': 25,
 ' cough': 26,
 ' cramps': 27,
 ' dark_urine': 28,
 ' dehydration': 29,
 ' depression': 30,
 ' diarrhoea': 31,
 ' dischromic _patches': 32,
 ' distention_of_abdomen': 33,
 ' dizziness': 34,
 ' drying_and_tingling_lips': 35,
 ' enlarged_thyroid': 36,
 ' excessive_hunger': 37,
 ' extra_marital_contacts': 38,
 ' family_history': 39,
 ' fast_heart_rate': 40,
 ' fatigue': 41,
 ' fluid_overload': 42,

In [57]:
label_encoded_df = pd.concat([df['Disease'], F_encoded_df], axis = 1)
label_encoded_df.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,215,131,99,72,32,130,130,130,130,130,130,130,130,130,130,130,130,130
1,215,99,72,32,130,130,130,130,130,130,130,130,130,130,130,130,130,130
2,215,131,72,32,130,130,130,130,130,130,130,130,130,130,130,130,130,130
3,215,131,99,32,130,130,130,130,130,130,130,130,130,130,130,130,130,130
4,215,131,99,72,130,130,130,130,130,130,130,130,130,130,130,130,130,130


#### So now we have a dataset called **label_encoded_df** that has the same data as **main_data** dataset but label-encoded.
#### And we saved the mapping of the target column in a dict called *label_mapping*, and the mapping of the features in a dict called *feature_mapping*.

In [58]:
# Creating X and y
model_features = label_encoded_df.columns.tolist()
model_features.remove("Disease")
X = label_encoded_df[model_features]
y = label_encoded_df["Disease"]

In [59]:
# One_hot_encoding the y column to use it as a multicalss in the model output layer
y_encoded = pd.get_dummies(y)
y_encoded.shape

(4920, 41)

In [60]:
# The column names are the mapping of the target column. **REMEMBER THIS**
y_encoded.head()

Unnamed: 0,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [61]:
target_index = y_encoded.columns.tolist() # If you remember, the column names after the one-hot-encoding ARE the mapping of the target values.

# One Hot Encoding

In [62]:
df = main_data.copy() # As usual, taking a copy from that data incase we needed the original later
# Combine all symptom columns into a single column
df['All Symptoms'] = df.apply(lambda row: ','.join(row.dropna()), axis=1)
# Drop duplicate symptoms within each cell
df['All Symptoms'] = df['All Symptoms'].apply(lambda x: ','.join(sorted(set(x.split(','))) if x else ''))
stay_cols= ['Disease', 'All Symptoms']
df = df[stay_cols]
df.head()

Unnamed: 0,Disease,All Symptoms
0,Fungal infection,"dischromic _patches, nodal_skin_eruptions, sk..."
1,Fungal infection,"dischromic _patches, nodal_skin_eruptions, sk..."
2,Fungal infection,"dischromic _patches, nodal_skin_eruptions,Fun..."
3,Fungal infection,"dischromic _patches, skin_rash,Fungal infecti..."
4,Fungal infection,"nodal_skin_eruptions, skin_rash,Fungal infect..."


In [63]:
df['All Symptoms'][0]

' dischromic _patches, nodal_skin_eruptions, skin_rash,Fungal infection,itching'

### Removing underscores and cleaning data

In [64]:
def strip_to_basic_tokens(text):
    # Remove doble spaces and underscores
    text = re.sub(r'[_\s]+', ' ', text)
    # Split by commas and lowercase the tokens
    tokens = [token.strip().lower() for token in text.split(',')]
    return tokens

# Apply the function to 'All Symptoms' column
df['Basic Tokens'] = df['All Symptoms'].apply(strip_to_basic_tokens)
df['Basic Tokens'] = df['Basic Tokens'].apply(lambda x: ', '.join(x))
df = df.drop(['All Symptoms'], axis = 1)
df.head()

Unnamed: 0,Disease,Basic Tokens
0,Fungal infection,"dischromic patches, nodal skin eruptions, skin..."
1,Fungal infection,"dischromic patches, nodal skin eruptions, skin..."
2,Fungal infection,"dischromic patches, nodal skin eruptions, fung..."
3,Fungal infection,"dischromic patches, skin rash, fungal infectio..."
4,Fungal infection,"nodal skin eruptions, skin rash, fungal infect..."


In [65]:
df['Basic Tokens'][0]

'dischromic patches, nodal skin eruptions, skin rash, fungal infection, itching'

### One-hot-encoding using Multi-Label Binarizer

In [66]:
dfE = df.copy() # Taking a copy because we never know what might happen
dfE['Basic Tokens'] = dfE['Basic Tokens'].apply(lambda x: x.split(', '))

mlb = MultiLabelBinarizer()
# Fit and transform the 'Basic Tokens' column
one_hot_encoded = pd.DataFrame(mlb.fit_transform(dfE['Basic Tokens']), columns=mlb.classes_, index=df.index)

# Concatenate the one-hot encoded DataFrame with the original DataFrame
df_encoded = pd.concat([dfE, one_hot_encoded], axis=1)

# Drop the 'Basic Tokens' column
df_encoded = df_encoded.drop(columns=['Basic Tokens'])
df_encoded.head()

Unnamed: 0,Disease,(vertigo) paroymsal positional vertigo,abdominal pain,abnormal menstruation,acidity,acne,acute liver failure,aids,alcoholic hepatitis,allergy,altered sensorium,anxiety,arthritis,back pain,belly pain,blackheads,bladder discomfort,blister,blood in sputum,bloody stool,blurred and distorted vision,breathlessness,brittle nails,bronchial asthma,bruising,burning micturition,cervical spondylosis,chest pain,chicken pox,chills,chronic cholestasis,cold hands and feets,coma,common cold,congestion,constipation,continuous feel of urine,continuous sneezing,cough,cramps,dark urine,dehydration,dengue,depression,diabetes,diarrhoea,dimorphic hemmorhoids(piles),dischromic patches,distention of abdomen,dizziness,drug reaction,drying and tingling lips,enlarged thyroid,excessive hunger,extra marital contacts,family history,fast heart rate,fatigue,fluid overload,foul smell of urine,fungal infection,gastroenteritis,gerd,headache,heart attack,hepatitis a,hepatitis b,hepatitis c,hepatitis d,hepatitis e,high fever,hip joint pain,history of alcohol consumption,hypertension,hyperthyroidism,hypoglycemia,hypothyroidism,impetigo,increased appetite,indigestion,inflammatory nails,internal itching,irregular sugar level,irritability,irritation in anus,itching,jaundice,joint pain,knee pain,lack of concentration,lethargy,loss of appetite,loss of balance,loss of smell,malaise,malaria,migraine,mild fever,mood swings,movement stiffness,mucoid sputum,muscle pain,muscle wasting,muscle weakness,nausea,neck pain,nodal skin eruptions,obesity,osteoarthristis,pain behind the eyes,pain during bowel movements,pain in anal region,painful walking,palpitations,paralysis (brain hemorrhage),passage of gases,patches in throat,peptic ulcer diseae,phlegm,pneumonia,polyuria,prominent veins on calf,psoriasis,puffy face and eyes,pus filled pimples,receiving blood transfusion,receiving unsterile injections,red sore around nose,red spots over body,redness of eyes,restlessness,runny nose,rusty sputum,scurring,shivering,silver like dusting,sinus pressure,skin peeling,skin rash,slurred speech,small dents in nails,spinning movements,spotting urination,stiff neck,stomach bleeding,stomach pain,sunken eyes,sweating,swelled lymph nodes,swelling joints,swelling of stomach,swollen blood vessels,swollen extremeties,swollen legs,throat irritation,toxic look (typhos),tuberculosis,typhoid,ulcers on tongue,unsteadiness,urinary tract infection,varicose veins,visual disturbances,vomiting,watering from eyes,weakness in limbs,weakness of one body side,weight gain,weight loss,yellow crust ooze,yellow urine,yellowing of eyes,yellowish skin
0,Fungal infection,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Fungal infection,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Fungal infection,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Fungal infection,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Fungal infection,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [67]:
df_encoded.shape

(4920, 173)

### Now let's drop the diseases column values that got encoded in the column names:

In [68]:
disease_names = [key for key in label_mapping.keys()]
diseases = [strip_to_basic_tokens(disease) for disease in disease_names]
diseases_cleaned = [item[0] if isinstance(item, list) else item for item in diseases]
df_encoded = df_encoded.drop(diseases_cleaned, axis = 1)
df_encoded.shape

(4920, 132)

### Creating and compiling the model

In [69]:
model_features = df_encoded.columns.tolist()
model_features.remove("Disease")
X = df_encoded[model_features]
y = df_encoded["Disease"]

In [70]:
y_encoded = pd.get_dummies(y)
y_encoded.shape

(4920, 41)

In [71]:
y_encoded.head()

Unnamed: 0,(vertigo) Paroymsal Positional Vertigo,AIDS,Acne,Alcoholic hepatitis,Allergy,Arthritis,Bronchial Asthma,Cervical spondylosis,Chicken pox,Chronic cholestasis,Common Cold,Dengue,Diabetes,Dimorphic hemmorhoids(piles),Drug Reaction,Fungal infection,GERD,Gastroenteritis,Heart attack,Hepatitis B,Hepatitis C,Hepatitis D,Hepatitis E,Hypertension,Hyperthyroidism,Hypoglycemia,Hypothyroidism,Impetigo,Jaundice,Malaria,Migraine,Osteoarthristis,Paralysis (brain hemorrhage),Peptic ulcer diseae,Pneumonia,Psoriasis,Tuberculosis,Typhoid,Urinary tract infection,Varicose veins,hepatitis A
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [72]:
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size = 0.25, random_state=42)
X_eval, X_test, y_eval, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

In [73]:
X_train_tensor = tf.convert_to_tensor(X_train.values, dtype=tf.float32)
X_test_tensor = tf.convert_to_tensor(X_test.values, dtype=tf.float32)
X_eval_tensor = tf.convert_to_tensor(X_eval.values, dtype=tf.float32)
y_train_tensor = tf.convert_to_tensor(y_train, dtype=tf.float64)
y_test_tensor = tf.convert_to_tensor(y_test, dtype=tf.float64)
y_eval_tensor = tf.convert_to_tensor(y_eval, dtype=tf.float64)

In [74]:
X_train_tensor

<tf.Tensor: shape=(3690, 131), dtype=float32, numpy=
array([[0., 0., 0., ..., 0., 1., 1.],
       [1., 0., 0., ..., 0., 1., 1.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)>

In [75]:
with tf.device('/GPU:0'):
    model_2 = keras.Sequential([
        layers.Input(shape=(X_train_tensor.shape[1],)),
        layers.Dense(160, activation='relu'),
        layers.Dropout(0.1),
        layers.Dense(200, activation='relu'),
        layers.Dropout(0.2),
        layers.Dense(240, activation='tanh'),
        layers.BatchNormalization(),
        layers.Dense(240, activation='tanh'),
        layers.Dropout(0.2),
        layers.Dense(200, activation='relu'),
        layers.Dropout(0.1),
        layers.Dense(160, activation='relu'),
        layers.Dense(y_train_tensor.shape[1], activation='softmax')])

    model_2.compile(optimizer= 'adam', loss='categorical_crossentropy', metrics=['accuracy'])
    early_stopping = EarlyStopping(monitor='val_accuracy', patience=4, mode='max')
    history = model_2.fit(X_train_tensor, y_train_tensor, epochs=500, callbacks=[early_stopping],
                batch_size=16, validation_data=(X_eval_tensor, y_eval_tensor))

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500


In [76]:
model_2.evaluate(X_test_tensor, y_test_tensor)



[1.0505184036446735e-05, 1.0]

### Manual Testing

In [77]:
# If you remember in the first model, we took a row from the origial data to test the model
# We aren't going to do this here, let's REALLY test it
user_input = ['stomach_pain','acidity','chest_pain'] # This should be GERD

original_data = df_encoded.copy()

# We will change the strip_to_basic_tokens function just a little bit to be able to deal with the user input
def strip_to_basic_tokens(symptoms):
    symptoms = [symptom.strip().lower().replace(' ', '_').replace('_', ' ') for symptom in symptoms]
    return [re.sub(r'\s+', ' ', symptom) for symptom in symptoms]
# Apply strip_to_basic_tokens function to user input
user_input_stripped = strip_to_basic_tokens(user_input)

# Initialize MultiLabelBinarizer with all symptoms
mlb = MultiLabelBinarizer(classes=df_encoded.columns)

# Fit and transform user input
user_input_encoded = pd.DataFrame(mlb.fit_transform([user_input_stripped]), columns=mlb.classes_)

# Concatenate user input with original data
final_user_input = pd.concat([pd.DataFrame(columns=original_data.columns), user_input_encoded], axis=0)
final_user_input = final_user_input.drop(['Disease'],axis = 1)
# Print the final user input shape
final_user_input.head()

Unnamed: 0,abdominal pain,abnormal menstruation,acidity,acute liver failure,altered sensorium,anxiety,back pain,belly pain,blackheads,bladder discomfort,blister,blood in sputum,bloody stool,blurred and distorted vision,breathlessness,brittle nails,bruising,burning micturition,chest pain,chills,cold hands and feets,coma,congestion,constipation,continuous feel of urine,continuous sneezing,cough,cramps,dark urine,dehydration,depression,diarrhoea,dischromic patches,distention of abdomen,dizziness,drying and tingling lips,enlarged thyroid,excessive hunger,extra marital contacts,family history,fast heart rate,fatigue,fluid overload,foul smell of urine,headache,high fever,hip joint pain,history of alcohol consumption,increased appetite,indigestion,inflammatory nails,internal itching,irregular sugar level,irritability,irritation in anus,itching,joint pain,knee pain,lack of concentration,lethargy,loss of appetite,loss of balance,loss of smell,malaise,mild fever,mood swings,movement stiffness,mucoid sputum,muscle pain,muscle wasting,muscle weakness,nausea,neck pain,nodal skin eruptions,obesity,pain behind the eyes,pain during bowel movements,pain in anal region,painful walking,palpitations,passage of gases,patches in throat,phlegm,polyuria,prominent veins on calf,puffy face and eyes,pus filled pimples,receiving blood transfusion,receiving unsterile injections,red sore around nose,red spots over body,redness of eyes,restlessness,runny nose,rusty sputum,scurring,shivering,silver like dusting,sinus pressure,skin peeling,skin rash,slurred speech,small dents in nails,spinning movements,spotting urination,stiff neck,stomach bleeding,stomach pain,sunken eyes,sweating,swelled lymph nodes,swelling joints,swelling of stomach,swollen blood vessels,swollen extremeties,swollen legs,throat irritation,toxic look (typhos),ulcers on tongue,unsteadiness,visual disturbances,vomiting,watering from eyes,weakness in limbs,weakness of one body side,weight gain,weight loss,yellow crust ooze,yellow urine,yellowing of eyes,yellowish skin
0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


### Converting it to tensor

In [78]:
user_tensor = tf.convert_to_tensor(final_user_input.values, dtype=tf.float32)
user_tensor[0]

<tf.Tensor: shape=(131,), dtype=float32, numpy=
array([0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32)>

### After converting the user input to a tensor, we'll utilize the model to predict the disease the user may have:

In [79]:
predict_proba = model_2.predict(user_tensor)
predicted_class_index = np.argmax(predict_proba)
prediction_encode = target_index[predicted_class_index]
inverse_label_encoding = {v: k for k, v in label_mapping.items()}
prediction = inverse_label_encoding[prediction_encode]
prediction



'GERD'

### Let's test it again

In [80]:
user_input = ['continuous_sneezing','watering_from_eyes'] # This should be Allergy

original_data = df_encoded.copy()

# Apply strip_to_basic_tokens function to user input
user_input_stripped = strip_to_basic_tokens(user_input)

# Fit and transform user input
user_input_encoded = pd.DataFrame(mlb.fit_transform([user_input_stripped]), columns=mlb.classes_)

# Concatenate user input with original data
final_user_input = pd.concat([pd.DataFrame(columns=original_data.columns), user_input_encoded], axis=0)
final_user_input = final_user_input.drop(['Disease'],axis = 1)
# Print the final user input shape
final_user_input.head()

Unnamed: 0,abdominal pain,abnormal menstruation,acidity,acute liver failure,altered sensorium,anxiety,back pain,belly pain,blackheads,bladder discomfort,blister,blood in sputum,bloody stool,blurred and distorted vision,breathlessness,brittle nails,bruising,burning micturition,chest pain,chills,cold hands and feets,coma,congestion,constipation,continuous feel of urine,continuous sneezing,cough,cramps,dark urine,dehydration,depression,diarrhoea,dischromic patches,distention of abdomen,dizziness,drying and tingling lips,enlarged thyroid,excessive hunger,extra marital contacts,family history,fast heart rate,fatigue,fluid overload,foul smell of urine,headache,high fever,hip joint pain,history of alcohol consumption,increased appetite,indigestion,inflammatory nails,internal itching,irregular sugar level,irritability,irritation in anus,itching,joint pain,knee pain,lack of concentration,lethargy,loss of appetite,loss of balance,loss of smell,malaise,mild fever,mood swings,movement stiffness,mucoid sputum,muscle pain,muscle wasting,muscle weakness,nausea,neck pain,nodal skin eruptions,obesity,pain behind the eyes,pain during bowel movements,pain in anal region,painful walking,palpitations,passage of gases,patches in throat,phlegm,polyuria,prominent veins on calf,puffy face and eyes,pus filled pimples,receiving blood transfusion,receiving unsterile injections,red sore around nose,red spots over body,redness of eyes,restlessness,runny nose,rusty sputum,scurring,shivering,silver like dusting,sinus pressure,skin peeling,skin rash,slurred speech,small dents in nails,spinning movements,spotting urination,stiff neck,stomach bleeding,stomach pain,sunken eyes,sweating,swelled lymph nodes,swelling joints,swelling of stomach,swollen blood vessels,swollen extremeties,swollen legs,throat irritation,toxic look (typhos),ulcers on tongue,unsteadiness,visual disturbances,vomiting,watering from eyes,weakness in limbs,weakness of one body side,weight gain,weight loss,yellow crust ooze,yellow urine,yellowing of eyes,yellowish skin
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0


In [81]:
user_tensor = tf.convert_to_tensor(final_user_input.values, dtype=tf.float32)
user_tensor[0]

<tf.Tensor: shape=(131,), dtype=float32, numpy=
array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32)>

In [82]:
predict_proba = model_2.predict(user_tensor)
predicted_class_index = np.argmax(predict_proba)
prediction_encode = target_index[predicted_class_index]
inverse_label_encoding = {v: k for k, v in label_mapping.items()}
prediction = inverse_label_encoding[prediction_encode]
prediction



'Allergy'

# Doctor Recommendation

## Creating and Training the model

In [83]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Mock data setup
data = {
    "Doctor": [
        "Dr. Smith", "Dr. Johnson", "Dr. Williams", "Dr. Jones", "Dr. Brown",
        "Dr. Davis", "Dr. Miller", "Dr. Wilson", "Dr. Moore", "Dr. Taylor",
        "Dr. Anderson", "Dr. Thomas", "Dr. Jackson", "Dr. White", "Dr. Harris",
        "Dr. Martin", "Dr. Thompson", "Dr. Garcia", "Dr. Martinez", "Dr. Robinson"
    ],
    "Specialty": [
        "Cardiology", "Endocrinology", "Dermatology", "General Medicine", "Orthopedics",
        "Infectious Disease", "Gastroenterology", "Neurology", "Pulmonology", "Immunology",
        "Urology", "General Surgery", "Vascular Surgery", "Otolaryngology", "Psychiatry",
        "Pediatrics", "Oncology", "Nephrology", "Rheumatology", "Allergy and Immunology"
    ],
    "Schedule": [
        "Monday, Wednesday, Friday", "Tuesday, Thursday", "Monday, Thursday", "Wednesday, Friday", "Tuesday, Thursday, Saturday",
        "Monday, Tuesday, Wednesday", "Thursday, Friday, Saturday", "Tuesday, Wednesday", "Monday, Tuesday", "Friday, Saturday",
        "Wednesday, Thursday", "Monday, Wednesday, Saturday", "Tuesday, Thursday, Friday", "Monday, Friday", "Tuesday, Wednesday, Thursday",
        "Monday, Thursday, Saturday", "Wednesday, Friday, Saturday", "Tuesday, Thursday", "Monday, Wednesday", "Thursday, Friday"
    ],
    "Average_Rating": [4.5, 4.0, 3.5, 4.8, 4.2, 3.8, 4.6, 4.1, 3.9, 4.3,
                       4.7, 3.6, 4.4, 3.7, 5.0, 4.9, 3.4, 4.2, 3.8, 4.1]  # Hypothetical average patient satisfaction ratings
}


df = pd.DataFrame(data)

# Feature Engineering: Convert Schedule into binary availability per day
days_of_week = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday","Sunday"]
for day in days_of_week:
    df[day] = df["Schedule"].apply(lambda x: 1 if day in x else 0)

# Preparing the dataset for machine learning
X = df[days_of_week]  # Use days of the week as features
y = df["Average_Rating"]  # Target variable

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a RandomForestRegressor
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate the model
predictions = model.predict(X_test)
mse = mean_squared_error(y_test, predictions)
print("Mean Squared Error:", mse)

# Mock mapping of diseases to specialties
disease_to_specialty = {
    "Drug Reaction": "General Medicine",
    "Malaria": "Infectious Disease",
    "Allergy": "Immunology",
    "Hypothyroidism": "Endocrinology",
    "Psoriasis": "Dermatology",
    "GERD": "Gastroenterology",
    "Chronic cholestasis": "Gastroenterology",
    "Hepatitis A": "Gastroenterology",
    "Osteoarthristis": "Orthopedics",
    "(vertigo) Paroymsal Positional Vertigo": "Otolaryngology",
    "Hypoglycemia": "Endocrinology",
    "Acne": "Dermatology",
    "Diabetes": "Endocrinology",
    "Impetigo": "Dermatology",
    "Hypertension": "Cardiology",
    "Peptic ulcer diseae": "Gastroenterology",
    "Dimorphic hemorrhoids(piles)": "General Surgery",
    "Common Cold": "General Medicine",
    "Chicken pox": "Infectious Disease",
    "Cervical spondylosis": "Orthopedics",
    "Hyperthyroidism": "Endocrinology",
    "Urinary tract infection": "Urology",
    "Varicose veins": "Vascular Surgery",
    "AIDS": "Infectious Disease",
    "Paralysis (brain hemorrhage)": "Neurology",
    "Typhoid": "Infectious Disease",
    "Hepatitis B": "Gastroenterology",
    "Fungal infection": "Dermatology",
    "Hepatitis C": "Gastroenterology",
    "Migraine": "Neurology",
    "Bronchial Asthma": "Pulmonology",
    "Alcoholic hepatitis": "Gastroenterology",
    "Jaundice": "Gastroenterology",
    "Hepatitis E": "Gastroenterology",
    "Dengue": "Infectious Disease",
    "Hepatitis D": "Gastroenterology",
    "Heart attack": "Cardiology",
    "Pneumonia": "Pulmonology",
    "Arthritis": "Orthopedics",
    "Gastroenteritis": "Gastroenterology",
    "Tuberculosis": "Pulmonology"
    # Add more mappings as necessary
}

def recommend_doctors(disease, day=None):
    specialty_needed = disease_to_specialty.get(disease, "General Medicine")
    # Filter by specialty first
    available_doctors = df[df['Specialty'] == specialty_needed]

    # If a day is specified, further filter by the day
    if day and day in days_of_week:
        available_doctors = available_doctors[available_doctors[day] == 1]

    # If no doctors are found after filtering by day or specialty, return a message
    if available_doctors.empty:
        return f"No doctors specializing in {specialty_needed} are available{f' on {day}' if day else ''}."

    # If doctors are available, proceed with predicting ratings
    available_doctors = available_doctors.copy()  # To avoid SettingWithCopyWarning
    available_doctors['Predicted_Rating'] = model.predict(available_doctors[days_of_week])

    # Now that we have predicted ratings, we can sort by them and include the specialty in the output
    available_doctors = available_doctors.sort_values(by='Predicted_Rating', ascending=False)
    return available_doctors[['Doctor', 'Specialty', 'Predicted_Rating']]


Mean Squared Error: 0.5194272499999991


## Testing with User Input

In [84]:
#TESTING provide symptons based on the data

user_input = ['stomach_pain','acidity','chest_pain'] # This should be GERD disease
# Can be None or any day of the week
# day_input = "Monday"
day_input = None


original_data = df_encoded.copy()

# We will change the strip_to_basic_tokens function just a little bit to be able to deal with the user input
def strip_to_basic_tokens(symptoms):
    symptoms = [symptom.strip().lower().replace(' ', '_').replace('_', ' ') for symptom in symptoms]
    return [re.sub(r'\s+', ' ', symptom) for symptom in symptoms]
# Apply strip_to_basic_tokens function to user input
user_input_stripped = strip_to_basic_tokens(user_input)

# Initialize MultiLabelBinarizer with all symptoms
mlb = MultiLabelBinarizer(classes=df_encoded.columns)

# Fit and transform user input
user_input_encoded = pd.DataFrame(mlb.fit_transform([user_input_stripped]), columns=mlb.classes_)

# Concatenate user input with original data
final_user_input = pd.concat([pd.DataFrame(columns=original_data.columns), user_input_encoded], axis=0)
final_user_input = final_user_input.drop(['Disease'],axis = 1)

predict_proba = model_2.predict(user_tensor)
predicted_class_index = np.argmax(predict_proba)
prediction_encode = target_index[predicted_class_index]
inverse_label_encoding = {v: k for k, v in label_mapping.items()}
prediction = inverse_label_encoding[prediction_encode]


disease_input = prediction

# Disease description
symptom_description_df = pd.read_csv('https://raw.githubusercontent.com/Pankhuri-279/medical-assistant/main/symptom_Description.csv')
disease_description = symptom_description_df.loc[symptom_description_df['Disease'] == disease_input, 'Description'].values[0]
print(f"Diagnosed disease is {disease_input}: {disease_description}")

# Call the function and print the recommended doctors
recommended_doctors = recommend_doctors(disease_input, day_input)
print(f"Doctors recommended for {disease_input}{f' on {day_input}' if day_input else ''}:\n{recommended_doctors}")

Diagnosed disease is Allergy: An allergy is an immune system response to a foreign substance that's not typically harmful to your body.They can include certain foods, pollen, or pet dander. Your immune system's job is to keep you healthy by fighting harmful pathogens.
Doctors recommended for Allergy:
       Doctor   Specialty  Predicted_Rating
9  Dr. Taylor  Immunology             4.184
