### Importing Required Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
pd.set_option('future.no_silent_downcasting', True)
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, ConfusionMatrixDisplay ,classification_report,RocCurveDisplay,precision_score,roc_curve
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn import tree
from sklearn.model_selection import cross_val_score, StratifiedKFold

### Importing Dataset

In [2]:
disease_dataframe = pd.read_csv('Datasets/Disease_Dataset.csv')
description_dataframe = pd.read_csv('Datasets/symptom_Description.csv')
Precaution_dataframe = pd.read_csv('Datasets/symptom_precaution.csv')
Severity_dataframe = pd.read_csv('Datasets/Symptom-severity.csv')
Severity_dataframe.head()

Unnamed: 0,Symptom,weight
0,itching,1
1,skin_rash,3
2,nodal_skin_eruptions,4
3,continuous_sneezing,4
4,shivering,5


In [3]:
disease_dataframe.describe()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
count,4920,4920,4920,4920,4572,3714,2934,2268,1944,1692,1512,1194,744,504,306,240,192,72
unique,41,34,48,54,50,38,32,26,21,22,21,18,11,8,4,3,3,1
top,Fungal infection,vomiting,vomiting,fatigue,high_fever,headache,nausea,abdominal_pain,abdominal_pain,yellowing_of_eyes,yellowing_of_eyes,irritability,malaise,muscle_pain,chest_pain,chest_pain,blood_in_sputum,muscle_pain
freq,120,822,870,726,378,348,390,264,276,228,198,120,126,72,96,144,72,72


### Data Cleaning

In [4]:
for columns in disease_dataframe:
    disease_dataframe[columns] = disease_dataframe[columns].str.replace('_',' ')

print("Cleaned Data is ")
disease_dataframe

Cleaned Data is 


Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,itching,skin rash,nodal skin eruptions,dischromic patches,,,,,,,,,,,,,
1,Fungal infection,skin rash,nodal skin eruptions,dischromic patches,,,,,,,,,,,,,,
2,Fungal infection,itching,nodal skin eruptions,dischromic patches,,,,,,,,,,,,,,
3,Fungal infection,itching,skin rash,dischromic patches,,,,,,,,,,,,,,
4,Fungal infection,itching,skin rash,nodal skin eruptions,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4915,(vertigo) Paroymsal Positional Vertigo,vomiting,headache,nausea,spinning movements,loss of balance,unsteadiness,,,,,,,,,,,
4916,Acne,skin rash,pus filled pimples,blackheads,scurring,,,,,,,,,,,,,
4917,Urinary tract infection,burning micturition,bladder discomfort,foul smell of urine,continuous feel of urine,,,,,,,,,,,,,
4918,Psoriasis,skin rash,joint pain,skin peeling,silver like dusting,small dents in nails,inflammatory nails,,,,,,,,,,,


In [5]:
cols = disease_dataframe.columns
data = disease_dataframe[cols].values.flatten()

s = pd.Series(data)
s = s.str.strip()
s = s.values.reshape(disease_dataframe.shape)

disease_dataframe = pd.DataFrame(s, columns=disease_dataframe.columns)
disease_dataframe.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,itching,skin rash,nodal skin eruptions,dischromic patches,,,,,,,,,,,,,
1,Fungal infection,skin rash,nodal skin eruptions,dischromic patches,,,,,,,,,,,,,,
2,Fungal infection,itching,nodal skin eruptions,dischromic patches,,,,,,,,,,,,,,
3,Fungal infection,itching,skin rash,dischromic patches,,,,,,,,,,,,,,
4,Fungal infection,itching,skin rash,nodal skin eruptions,,,,,,,,,,,,,,


In [6]:
Severity_dataframe['Symptom'] = Severity_dataframe['Symptom'].str.replace('_', ' ')

print("Cleaned Data is ")
Severity_dataframe


Cleaned Data is 


Unnamed: 0,Symptom,weight
0,itching,1
1,skin rash,3
2,nodal skin eruptions,4
3,continuous sneezing,4
4,shivering,5
...,...,...
128,inflammatory nails,2
129,blister,4
130,red sore around nose,2
131,yellow crust ooze,3


#### Handling Missing Data

In [7]:
print("Before Handling Missing Data:")
print("disease_dataframe:")
print(disease_dataframe.isnull().sum())
print("description_dataframe:")
print(description_dataframe.isnull().sum())
print("Precaution_dataframe:")
print(Precaution_dataframe.isnull().sum())
print("Severity_dataframe:")
print(Severity_dataframe.isnull().sum())

Before Handling Missing Data:
disease_dataframe:
Disease          0
Symptom_1        0
Symptom_2        0
Symptom_3        0
Symptom_4      348
Symptom_5     1206
Symptom_6     1986
Symptom_7     2652
Symptom_8     2976
Symptom_9     3228
Symptom_10    3408
Symptom_11    3726
Symptom_12    4176
Symptom_13    4416
Symptom_14    4614
Symptom_15    4680
Symptom_16    4728
Symptom_17    4848
dtype: int64
description_dataframe:
Disease        0
Description    0
dtype: int64
Precaution_dataframe:
Disease         0
Precaution_1    0
Precaution_2    0
Precaution_3    1
Precaution_4    1
dtype: int64
Severity_dataframe:
Symptom    0
weight     0
dtype: int64


In [8]:
disease_dataframe = disease_dataframe.fillna(0)
disease_dataframe

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,itching,skin rash,nodal skin eruptions,dischromic patches,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Fungal infection,skin rash,nodal skin eruptions,dischromic patches,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Fungal infection,itching,nodal skin eruptions,dischromic patches,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Fungal infection,itching,skin rash,dischromic patches,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Fungal infection,itching,skin rash,nodal skin eruptions,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4915,(vertigo) Paroymsal Positional Vertigo,vomiting,headache,nausea,spinning movements,loss of balance,unsteadiness,0,0,0,0,0,0,0,0,0,0,0
4916,Acne,skin rash,pus filled pimples,blackheads,scurring,0,0,0,0,0,0,0,0,0,0,0,0,0
4917,Urinary tract infection,burning micturition,bladder discomfort,foul smell of urine,continuous feel of urine,0,0,0,0,0,0,0,0,0,0,0,0,0
4918,Psoriasis,skin rash,joint pain,skin peeling,silver like dusting,small dents in nails,inflammatory nails,0,0,0,0,0,0,0,0,0,0,0


In [9]:
Precaution_dataframe = Precaution_dataframe.fillna("rest")
Precaution_dataframe

Unnamed: 0,Disease,Precaution_1,Precaution_2,Precaution_3,Precaution_4
0,Drug Reaction,stop irritation,consult nearest hospital,stop taking drug,follow up
1,Malaria,Consult nearest hospital,avoid oily food,avoid non veg food,keep mosquitos out
2,Allergy,apply calamine,cover area with bandage,rest,use ice to compress itching
3,Hypothyroidism,reduce stress,exercise,eat healthy,get proper sleep
4,Psoriasis,wash hands with warm soapy water,stop bleeding using pressure,consult doctor,salt baths
5,GERD,avoid fatty spicy food,avoid lying down after eating,maintain healthy weight,exercise
6,Chronic cholestasis,cold baths,anti itch medicine,consult doctor,eat healthy
7,hepatitis A,Consult nearest hospital,wash hands through,avoid fatty spicy food,medication
8,Osteoarthristis,acetaminophen,consult nearest hospital,follow up,salt baths
9,(vertigo) Paroymsal Positional Vertigo,lie down,avoid sudden change in body,avoid abrupt head movment,relax


In [10]:
print("After Handling Missing Data: ")
print("disease_dataframe:")
print(disease_dataframe.isnull().sum())
print("description_dataframe:")
print(description_dataframe.isnull().sum())
print("Precaution_dataframe:")
print(Precaution_dataframe.isnull().sum())
print("Severity_dataframe:")
print(Severity_dataframe.isnull().sum())

After Handling Missing Data: 
disease_dataframe:
Disease       0
Symptom_1     0
Symptom_2     0
Symptom_3     0
Symptom_4     0
Symptom_5     0
Symptom_6     0
Symptom_7     0
Symptom_8     0
Symptom_9     0
Symptom_10    0
Symptom_11    0
Symptom_12    0
Symptom_13    0
Symptom_14    0
Symptom_15    0
Symptom_16    0
Symptom_17    0
dtype: int64
description_dataframe:
Disease        0
Description    0
dtype: int64
Precaution_dataframe:
Disease         0
Precaution_1    0
Precaution_2    0
Precaution_3    0
Precaution_4    0
dtype: int64
Severity_dataframe:
Symptom    0
weight     0
dtype: int64


### Insights

In [11]:
Total_Symptoms = Severity_dataframe['Symptom'].nunique()
print("Number of Total symptoms are ", Total_Symptoms)

Number of Total symptoms are  132


In [12]:
symptoms = Severity_dataframe['Symptom'].unique()
print("All the Symptoms are ",symptoms)

All the Symptoms are  ['itching' 'skin rash' 'nodal skin eruptions' 'continuous sneezing'
 'shivering' 'chills' 'joint pain' 'stomach pain' 'acidity'
 'ulcers on tongue' 'muscle wasting' 'vomiting' 'burning micturition'
 'spotting urination' 'fatigue' 'weight gain' 'anxiety'
 'cold hands and feets' 'mood swings' 'weight loss' 'restlessness'
 'lethargy' 'patches in throat' 'irregular sugar level' 'cough'
 'high fever' 'sunken eyes' 'breathlessness' 'sweating' 'dehydration'
 'indigestion' 'headache' 'yellowish skin' 'dark urine' 'nausea'
 'loss of appetite' 'pain behind the eyes' 'back pain' 'constipation'
 'abdominal pain' 'diarrhoea' 'mild fever' 'yellow urine'
 'yellowing of eyes' 'acute liver failure' 'fluid overload'
 'swelling of stomach' 'swelled lymph nodes' 'malaise'
 'blurred and distorted vision' 'phlegm' 'throat irritation'
 'redness of eyes' 'sinus pressure' 'runny nose' 'congestion' 'chest pain'
 'weakness in limbs' 'fast heart rate' 'pain during bowel movements'
 'pain in 

In [13]:
Total_disease = disease_dataframe['Disease'].nunique()
print("Number of Total Disease are ", Total_disease)

Number of Total Disease are  41


In [14]:
disease = disease_dataframe['Disease'].unique()
print("All the disease are ",disease)

All the disease are  ['Fungal infection' 'Allergy' 'GERD' 'Chronic cholestasis' 'Drug Reaction'
 'Peptic ulcer diseae' 'AIDS' 'Diabetes' 'Gastroenteritis'
 'Bronchial Asthma' 'Hypertension' 'Migraine' 'Cervical spondylosis'
 'Paralysis (brain hemorrhage)' 'Jaundice' 'Malaria' 'Chicken pox'
 'Dengue' 'Typhoid' 'hepatitis A' 'Hepatitis B' 'Hepatitis C'
 'Hepatitis D' 'Hepatitis E' 'Alcoholic hepatitis' 'Tuberculosis'
 'Common Cold' 'Pneumonia' 'Dimorphic hemmorhoids(piles)' 'Heart attack'
 'Varicose veins' 'Hypothyroidism' 'Hyperthyroidism' 'Hypoglycemia'
 'Osteoarthristis' 'Arthritis' '(vertigo) Paroymsal  Positional Vertigo'
 'Acne' 'Urinary tract infection' 'Psoriasis' 'Impetigo']


### Feature Engineering

In [15]:

data = disease_dataframe.values
symptoms = Severity_dataframe['Symptom'].unique()

for i in range(len(symptoms)):
    data[data == symptoms[i]] = Severity_dataframe[Severity_dataframe['Symptom'] == symptoms[i]]['weight'].values[0]
    
Data = pd.DataFrame(data, columns=disease_dataframe.columns)
Data = Data.replace({'dischromic  patches': 0, 'spotting  urination': 0, 'foul smell of urine': 0}).infer_objects(copy=False)
df=Data
Data

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,1,3,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Fungal infection,3,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Fungal infection,1,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Fungal infection,1,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Fungal infection,1,3,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4915,(vertigo) Paroymsal Positional Vertigo,5,3,5,6,4,4,0,0,0,0,0,0,0,0,0,0,0
4916,Acne,3,2,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0
4917,Urinary tract infection,6,4,0,6,0,0,0,0,0,0,0,0,0,0,0,0,0
4918,Psoriasis,3,3,3,2,2,2,0,0,0,0,0,0,0,0,0,0,0


### Data Spliting

In [16]:
Disease = Data['Disease'].values
Data = Data.drop('Disease',axis=1)

In [17]:
Disease, Data

(array(['Fungal infection', 'Fungal infection', 'Fungal infection', ...,
        'Urinary tract infection', 'Psoriasis', 'Impetigo'], dtype=object),
       Symptom_1  Symptom_2  Symptom_3  Symptom_4  Symptom_5  Symptom_6  \
 0             1          3          4          0          0          0   
 1             3          4          0          0          0          0   
 2             1          4          0          0          0          0   
 3             1          3          0          0          0          0   
 4             1          3          4          0          0          0   
 ...         ...        ...        ...        ...        ...        ...   
 4915          5          3          5          6          4          4   
 4916          3          2          2          2          0          0   
 4917          6          4          0          6          0          0   
 4918          3          3          3          2          2          2   
 4919          3          

In [18]:
# Spliting the Data in 80:20
Data_train, Data_test, Disease_train , Disease_test = train_test_split(Data, Disease, test_size = 0.20, stratify= Disease, random_state = 10)

### Model Training

In [19]:
DecisionTree =DecisionTreeClassifier(criterion='gini', splitter='random',random_state=20,max_depth=14)
DecisionTree.fit(Data_train.values,Disease_train)

### Model Evaluation

In [20]:
prediciton = DecisionTree.predict(Data_test)
test_data_accuracy = (accuracy_score(Disease_test, prediciton))*100
print("Accuracy of Model is ",test_data_accuracy)

Accuracy of Model is  98.47560975609755




In [21]:
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(DecisionTree, Data_train, Disease_train, cv=kfold)
print("Average Accuracy using Kfold is ", (np.mean(scores))*100)

Average Accuracy using Kfold is  99.05994620708339


### Prediciton Using Model

In [22]:
testing_prediciton_data = np.array([0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).reshape(1, -1)
prediciton = DecisionTree.predict(testing_prediciton_data)
print(prediciton)

['Fungal infection']


In [27]:
def prediction(User_Symptoms : list):
    
    print(User_Symptoms)
    Symptoms = np.array(Severity_dataframe["Symptom"])
    weight = np.array(Severity_dataframe["weight"])

    for data in range(len(User_Symptoms)):
        for symp in range(len(Symptoms)):
            if User_Symptoms[data] == Symptoms[symp]:
                User_Symptoms[data] = weight[symp]

    
    filled_0 = 17 - len(User_Symptoms)
    for i in range(filled_0):
        User_Symptoms.append(0)

    User_Symptoms = np.array(User_Symptoms).reshape(1, -1)
    print(User_Symptoms)
    prediciton = DecisionTree.predict(User_Symptoms)
    prediciton = prediciton.tolist()
    description = description_dataframe[description_dataframe['Disease'] == prediciton[0]].values[0][1]
    Precautions = Precaution_dataframe[Precaution_dataframe['Disease'] == prediciton[0]].drop('Disease',axis=1).values.tolist()[0]

        
    print(type(prediciton))
    print(description)
    print(Precautions)
    
	
prediction(User_Symptoms = ["fatigue","mood swings","diarrhoea"])

['fatigue', 'mood swings', 'diarrhoea']
[[4 3 6 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]
<class 'list'>
The death of heart muscle due to the loss of blood supply. The loss of blood supply is usually caused by a complete blockage of a coronary artery, one of the arteries that supplies blood to the heart muscle.
['call ambulance', 'chew or swallow asprin', 'keep calm', 'rest']


### Extracting the Model

In [24]:
pickle.dump(DecisionTree, open('ML Models/Disease_DecisionTree_Model.sav','wb'))