# Importing some basic liabraries

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

### Let us start with reading the dataset and take an overview.

In [4]:
dataset = pd.read_csv('dataset.csv')
dataset.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,
1,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
2,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
3,Fungal infection,itching,skin_rash,dischromic _patches,,,,,,,,,,,,,,
4,Fungal infection,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,


#### We have 41 unique diseases and a dataset of 4920 entries in total 

In [6]:
dataset.describe()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
count,4920,4920,4920,4920,4572,3714,2934,2268,1944,1692,1512,1194,744,504,306,240,192,72
unique,41,34,48,54,50,38,32,26,21,22,21,18,11,8,4,3,3,1
top,Migraine,vomiting,vomiting,fatigue,high_fever,headache,nausea,abdominal_pain,abdominal_pain,yellowing_of_eyes,yellowing_of_eyes,irritability,malaise,runny_nose,chest_pain,chest_pain,blood_in_sputum,muscle_pain
freq,120,822,870,726,378,348,390,264,276,228,198,120,126,72,96,144,72,72


### It is possible that there are some white spaces present in the symptom names

In [14]:
cols = dataset.columns
data = dataset[cols].values.flatten()

symptoms = pd.Series(data)
symptoms = symptoms.str.strip()
symptoms = symptoms.values.reshape(dataset.shape)

dataset = pd.DataFrame(symptoms, columns = cols)
dataset.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,
1,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
2,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
3,Fungal infection,itching,skin_rash,dischromic _patches,,,,,,,,,,,,,,
4,Fungal infection,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,


### We can see from the previous cell that there are a lot of NaN values. These need to be handled.

In [15]:
dataset.isna().sum()

Disease          0
Symptom_1        0
Symptom_2        0
Symptom_3        0
Symptom_4      348
Symptom_5     1206
Symptom_6     1986
Symptom_7     2652
Symptom_8     2976
Symptom_9     3228
Symptom_10    3408
Symptom_11    3726
Symptom_12    4176
Symptom_13    4416
Symptom_14    4614
Symptom_15    4680
Symptom_16    4728
Symptom_17    4848
dtype: int64

In [16]:
dataset = dataset.fillna(0)
dataset.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Fungal infection,itching,skin_rash,dischromic _patches,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Fungal infection,itching,skin_rash,nodal_skin_eruptions,0,0,0,0,0,0,0,0,0,0,0,0,0,0


### The symptoms need to be represented by a numerical value. 
#### For this we will use the severity of the symptom. Each of the symptoms will be replaced by a numerical severity value given in the dataset.

In [17]:
severity = pd.read_csv('Symptom-severity.csv')
severity.head()

Unnamed: 0,Symptom,weight
0,itching,1
1,skin_rash,3
2,nodal_skin_eruptions,4
3,continuous_sneezing,4
4,shivering,5


In [18]:
values = dataset.values
symptoms = severity['Symptom'].unique()

for symptom in symptoms:
    values[values==symptom] = severity[severity['Symptom'] == symptom]['weight'].values[0]
    
dataset = pd.DataFrame(values, columns=cols)
dataset.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,1,3,4,dischromic _patches,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Fungal infection,3,4,dischromic _patches,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Fungal infection,1,4,dischromic _patches,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Fungal infection,1,3,dischromic _patches,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Fungal infection,1,3,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0


### We can see that some of the symptoms did not exist in the severity table. 
#### These symptom occurences need to be set to 0.

In [19]:
dataset = dataset.replace('dischromic _patches', 0)
dataset = dataset.replace('spotting_ urination',0)
dataset = dataset.replace('foul_smell_of urine',0)
dataset.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,1,3,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Fungal infection,3,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Fungal infection,1,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Fungal infection,1,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Fungal infection,1,3,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0


### Splitting the data and lables from the dataset

In [20]:
data = dataset.iloc[:,1:].values
labels = dataset['Disease'].values

### Splitting the data into train and test using train_test_split

In [22]:
x_train, x_test, y_train, y_test = train_test_split(data, 
                                                    labels, 
                                                    shuffle=True, 
                                                    train_size = 0.85)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(4182, 17) (738, 17) (4182,) (738,)


# Using Random Forest

In [24]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [25]:
rf_clf = RandomForestClassifier(criterion='entropy')
rf_clf.fit(x_train, y_train)

RandomForestClassifier(criterion='entropy')

In [26]:
y_predict = rf_clf.predict(x_test)
accuracy_score(y_test, y_predict)

0.9959349593495935

# Using KNN

In [29]:
from sklearn.neighbors import KNeighborsClassifier

In [30]:
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(x_train, y_train)

KNeighborsClassifier()

In [31]:
y_predict = knn.predict(x_test)
accuracy_score(y_test, y_predict)

0.9959349593495935

## Both the approaches produce the same accuracy. 
### Let us produce some more comparison statistics, for eg.  precision and recall

In [33]:
from sklearn.metrics import classification_report, confusion_matrix


predictions = {'RFC':rf_clf.predict(x_test), "KNN": knn.predict(x_test)}

for classifier, pred in predictions.items():
    cnf_matrix = confusion_matrix(y_test, pred)
    print(classifier,'Confusion matrix: \n',cnf_matrix)
    print(classifier, 'Classification report: \n',classification_report(y_test,pred))

RFC Confusion matrix: 
 [[12  0  0 ...  0  0  0]
 [ 0 24  0 ...  0  0  0]
 [ 0  0 13 ...  0  0  0]
 ...
 [ 0  0  0 ... 17  0  0]
 [ 0  0  0 ...  0 17  0]
 [ 0  0  0 ...  0  0 22]]
------------------
RFC Classification report: 
                                          precision    recall  f1-score   support

(vertigo) Paroymsal  Positional Vertigo       1.00      1.00      1.00        12
                                   AIDS       1.00      1.00      1.00        24
                                   Acne       1.00      1.00      1.00        13
                    Alcoholic hepatitis       1.00      1.00      1.00        22
                                Allergy       1.00      1.00      1.00        17
                              Arthritis       1.00      1.00      1.00        15
                       Bronchial Asthma       1.00      1.00      1.00        19
                   Cervical spondylosis       1.00      0.79      0.88        14
                            Chicken pox   

## Both the approaches yeild the same results. Following could be the reasons for this performance :
### 1. There is no noise in the data as the dataset has been hand-picked by the creator.
### 2. The distribution of the symptoms is normal. There is no skewness in the data.
### 3. The disease classes are balanced, making it easy to fit the training data.

# Let us actually see the results now.
## Enter the symptoms, given in the dropdown, in the list 'current_symptoms'. 
## Running the following cells will then predict the disease using both, Random Forest and KNN.

In [37]:
def display_symptom(sympt):
    print(sympt)

In [39]:
import ipywidgets as widgets

unique_symptoms = severity['Symptom'].unique()
widgets.interact(display_symptom, sympt=unique_symptoms)

interactive(children=(Dropdown(description='sympt', options=('itching', 'skin_rash', 'nodal_skin_eruptions', '…

<function __main__.display_symptom>

In the below list, 'current_symptoms', add the symptoms that Sara is experiencing. The symptoms cam be anyone from the displayed list above.

In [69]:
current_symptoms = ['headache', 'high_fever']

In [70]:
model_input = np.zeros((1, 17), dtype=np.int64)

for i, c_symptom in enumerate(current_symptoms):
    try:
        model_input[0][i] = severity[severity['Symptom'] == c_symptom]['weight'].values[0]
    except:
        print('Please enter the symtoms from the above dropdown only.')

In [71]:
RF_pred = rf_clf.predict(model_input)
KNN_pred = knn.predict(model_input)
print('Prediction using Random Forest : ', RF_pred)
print('Prediction using KNN : ', KNN_pred)

['Fungal infection']
['Fungal infection']


### Change the symptoms and re-run the above cells.