#Disease Prediction from Symptoms

The dataset source: http://people.dbmi.columbia.edu/~friedma/Projects/DiseaseSymptomKB/index.html

##Importing all needed libraries

In [8]:
# Import Dependencies
import csv
import pandas as pd
import numpy as np
import collections
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

## Transforming & Loading the Data

### Loading the Dataset File generated after preprocessing in excel

In [9]:
disease_list = []

def return_list(disease):
    disease_list = []
    match = disease.replace('^','_').split('_')
    i = 0
    # print(match)
    for group in match:
        # print(group)
        if i%2!=0:
            disease_list.append(group)
        i = i + 1

    return disease_list

with open("Datasets/raw_data.csv") as csvfile:
    reader = csv.reader(csvfile)
    disease=""
    weight = 0
    disease_list = []
    dict_wt = {}
    dict_=collections.defaultdict(list)
    
    for row in reader:
        # print(row)
        if row[0]!="\xc2\0xa0" and row[0]!="": # for handling file encoding errors
          # saving disease and frequency
            disease = row[0]
            disease_list = return_list(disease)
            weight = row[1]

        if row[2]!="\xc2\0xa0" and row[2]!="":
            symptom_list = return_list(row[2])
            # print(row[2])
            for d in disease_list:
                for s in symptom_list:
                    # print(s)
                    dict_[d].append(s) # adding all symptoms
                dict_wt[d] = weight
                # print(dict_wt[d])


### Reformatting the data

In [10]:
# saving cleaned data
with open("dataset_clean.csv","w") as csvfile:
    writer = csv.writer(csvfile)
    for key,values in dict_.items():
        # print(key, " ", values)
        # print().
        #v denotes symptoms
        for v in values:
            # print(key)
            # key = str.encode(key).decode('utf-8')
            writer.writerow([key,v,dict_wt[key]])

In [11]:
columns = ['Source','Target','Weight'] # source: disease, target: symptom, weight: number of cases

In [12]:
data = pd.read_csv("dataset_clean.csv",names=columns, encoding ="ISO-8859-1")

In [13]:
data.head()

Unnamed: 0,Source,Target,Weight
0,hypertensive disease,pain chest,3363
1,hypertensive disease,shortness of breath,3363
2,hypertensive disease,dizziness,3363
3,hypertensive disease,asthenia,3363
4,hypertensive disease,fall,3363


In [14]:
data.to_csv("dataset_clean.csv",index=False)

In [15]:
data = pd.read_csv("dataset_clean.csv", encoding ="ISO-8859-1")

In [16]:
data.head()

Unnamed: 0,Source,Target,Weight
0,hypertensive disease,pain chest,3363
1,hypertensive disease,shortness of breath,3363
2,hypertensive disease,dizziness,3363
3,hypertensive disease,asthenia,3363
4,hypertensive disease,fall,3363


In [17]:
len(data['Source'].unique()) # unique diseases

149

In [18]:
len(data['Target'].unique()) # unique symptoms

405

In [19]:
df = pd.DataFrame(data)

In [20]:
df_1 = pd.get_dummies(df.Target) # 1 hot encoding of 
# symptoms

In [21]:
df_1.head()

Unnamed: 0,Heberden's node,Murphy's sign,Stahli's line,abdomen acute,abdominal bloating,abdominal tenderness,abnormal sensation,abnormally hard consistency,abortion,abscess bacterial,...,vision blurred,vomiting,weepiness,weight gain,welt,wheelchair bound,wheezing,withdraw,worry,yellow sputum
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
df.head()

Unnamed: 0,Source,Target,Weight
0,hypertensive disease,pain chest,3363
1,hypertensive disease,shortness of breath,3363
2,hypertensive disease,dizziness,3363
3,hypertensive disease,asthenia,3363
4,hypertensive disease,fall,3363


In [23]:
df_s = df['Source']
#target means SYMPTOMS and Source means Disease

In [24]:
df_pivoted = pd.concat([df_s,df_1], axis=1)

In [25]:
df_pivoted.drop_duplicates(keep='first',inplace=True)

In [26]:
df_pivoted.head()

Unnamed: 0,Source,Heberden's node,Murphy's sign,Stahli's line,abdomen acute,abdominal bloating,abdominal tenderness,abnormal sensation,abnormally hard consistency,abortion,...,vision blurred,vomiting,weepiness,weight gain,welt,wheelchair bound,wheezing,withdraw,worry,yellow sputum
0,hypertensive disease,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,hypertensive disease,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,hypertensive disease,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,hypertensive disease,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,hypertensive disease,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
len(df_pivoted)

2116

In [28]:
df_pivoted.to_csv("df_pivoted.csv")

In [29]:
data = pd.read_csv('Datasets/Training.csv')

In [30]:
data.isnull().sum()
data.dropna(inplace = True)

In [31]:
cols = data.columns
cols = cols[:-1]
cols

Index(['itching', 'skin_rash', 'nodal_skin_eruptions', 'continuous_sneezing',
       'shivering', 'chills', 'joint_pain', 'stomach_pain', 'acidity',
       'ulcers_on_tongue',
       ...
       'pus_filled_pimples', 'blackheads', 'scurring', 'skin_peeling',
       'silver_like_dusting', 'small_dents_in_nails', 'inflammatory_nails',
       'blister', 'red_sore_around_nose', 'yellow_crust_ooze'],
      dtype='object', length=132)

In [32]:
x = data[cols]
y = data['prognosis']

In [33]:
ar = ['(vertigo) Paroymsal  Positional Vertigo', 'AIDS' ,'Acne',
 'Alcoholic hepatitis' ,'Allergy', 'Arthritis', 'Bronchial Asthma',
 'Cervical spondylosis', 'Chicken pox', 'Chronic cholestasis', 'Common Cold',
 'Dengue', 'Diabetes ' ,'Dimorphic hemmorhoids(piles)' ,'Drug Reaction',
 'Fungal infection' ,'GERD', 'Gastroenteritis' ,'Heart attack', 'Hepatitis B',
 'Hepatitis C', 'Hepatitis D' ,'Hepatitis E' ,'Hypertension ',
 'Hyperthyroidism', 'Hypoglycemia', 'Hypothyroidism', 'Impetigo', 'Jaundice',
 'Malaria', 'Migraine' ,'Osteoarthristis' ,'Paralysis (brain hemorrhage)',
 'Peptic ulcer diseae' ,'Pneumonia' ,'Psoriasis' ,'Tuberculosis', 'Typhoid',
 'Urinary tract infection', 'Varicose veins' ,'hepatitis A']

for i in range(0,41):
    for col in y:
        if col == ar[i]:
            data.loc[data['prognosis'] == col,'prognosis'] = i
            data.to_csv('Datasets/Training.csv',index=False)

# print(x)

## Building Classifier: Using Multinomial Naive Bayes

In [34]:
# importing libraries
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

In [35]:
# performing train test split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

In [36]:
# Training multinomial naive bayes
mnb = MultinomialNB()
mnb = mnb.fit(x_train, y_train)

In [37]:
mnb.score(x_test, y_test)

0.9981527093596059

In [38]:
mnb_tot = MultinomialNB()
mnb_tot = mnb_tot.fit(x, y)

In [39]:
mnb_tot.score(x, y)

0.999390243902439

In [40]:
disease_pred = mnb_tot.predict(x)
print(len(disease_pred))

4920


In [41]:
disease_real = y.values
print(len(disease_real))

4920


In [42]:
# printing model error
print(len(y))
for i in range(0, len(disease_real)):
    if disease_pred[i]!=disease_real[i]:
        print ('Pred: {0} Actual:{1}'.format(disease_pred[i], disease_real[i]))

4920
Pred: 21.0 Actual:9.0
Pred: 21.0 Actual:9.0
Pred: 21.0 Actual:9.0


In [44]:
from sklearn.metrics import confusion_matrix  
cm= confusion_matrix(disease_real, disease_pred)  
cm

array([[120,   0,   0, ...,   0,   0,   0],
       [  0, 120,   0, ...,   0,   0,   0],
       [  0,   0, 120, ...,   0,   0,   0],
       ...,
       [  0,   0,   0, ..., 120,   0,   0],
       [  0,   0,   0, ...,   0, 120,   0],
       [  0,   0,   0, ...,   0,   0, 120]], dtype=int64)

## Building Classifier: Using Decision  Tree

In [74]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz

In [75]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

In [76]:
print ("DecisionTree")
dt = DecisionTreeClassifier(min_samples_split=20)
clf_dt=dt.fit(x_train,y_train)
print ("Acurracy: ", clf_dt.score(x_train,y_train))

DecisionTree
Acurracy:  0.9899878640776699


In [77]:
print ("Acurracy on the actual test data: ", clf_dt.score(x_test,y_test))

Acurracy on the actual test data:  0.9759852216748769


In [78]:
y_pred= dt.predict(x_test)  


In [79]:
from sklearn.metrics import confusion_matrix  
cm= confusion_matrix(y_test, y_pred)  
cm

array([[37,  0,  0, ...,  0,  0,  0],
       [ 0, 42,  0, ...,  0,  0,  0],
       [ 0,  0, 42, ...,  0,  0,  0],
       ...,
       [ 0,  0,  0, ..., 41,  0,  0],
       [ 0,  0,  0, ...,  0, 36,  0],
       [ 0,  0,  0, ...,  0,  0, 44]], dtype=int64)

In [80]:
export_graphviz(dt, 
                out_file='tree-top5.dot', 
                feature_names=cols,
                max_depth = 5
               )

In [83]:
!dot -Tpng tree-top5.dot -o tree-top5.png

'dot' is not recognized as an internal or external command,
operable program or batch file.


In [73]:
from IPython.display import Image
Image(filename='tree-top5.dot')

ValueError: Cannot embed the 'dot' image format

## Building Classifier: Using Random Forest

In [51]:
from sklearn.ensemble import RandomForestClassifier  
classifier= RandomForestClassifier(n_estimators= 10, criterion="entropy")  
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)
classifier.fit(x_train, y_train)  

In [52]:
y_pred= classifier.predict(x_test)  

In [53]:
from sklearn.metrics import confusion_matrix  
cm= confusion_matrix(y_test, y_pred)  
cm

array([[37,  0,  0, ...,  0,  0,  0],
       [ 0, 42,  0, ...,  0,  0,  0],
       [ 0,  0, 42, ...,  0,  0,  0],
       ...,
       [ 0,  0,  0, ..., 41,  0,  0],
       [ 0,  0,  0, ...,  0, 40,  0],
       [ 0,  0,  0, ...,  0,  0, 44]], dtype=int64)

In [54]:
print("Accuracy is", classifier.score(x_test,y_test))

Accuracy is 1.0


## Building Classifier: Using K Nearest Neighbors

In [55]:
from sklearn.neighbors import KNeighborsClassifier

In [56]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)
knn = KNeighborsClassifier(n_neighbors=300)
knn.fit(x_train, y_train)

In [57]:
print(knn.score(x_test, y_test))

0.8479064039408867


In [58]:
from sklearn.metrics import confusion_matrix  
cm= confusion_matrix(y_test, y_pred)  
cm

array([[37,  0,  0, ...,  0,  0,  0],
       [ 0, 42,  0, ...,  0,  0,  0],
       [ 0,  0, 42, ...,  0,  0,  0],
       ...,
       [ 0,  0,  0, ..., 41,  0,  0],
       [ 0,  0,  0, ...,  0, 40,  0],
       [ 0,  0,  0, ...,  0,  0, 44]], dtype=int64)

## Building Classifier: Using XGBoost

In [59]:
import xgboost as xgb
from xgboost import XGBClassifier
model = XGBClassifier(learning_rate=0.1, max_depth=20,verbosity=2,random_state=42,
                    scale_pos_weight=1.5,use_label_encoder=False, eval_metric='error')



In [60]:
model.fit(x, y)

Parameters: { "scale_pos_weight" } are not used.

[22:30:39] INFO: C:/buildkite-agent/builds/buildkite-windows-cpu-autoscaling-group-i-03de431ba26204c4d-1/xgboost/xgboost-ci-windows/src/tree/updater_prune.cc:98: tree pruning end, 6 extra nodes, 0 pruned nodes, max_depth=3
[22:30:39] INFO: C:/buildkite-agent/builds/buildkite-windows-cpu-autoscaling-group-i-03de431ba26204c4d-1/xgboost/xgboost-ci-windows/src/tree/updater_prune.cc:98: tree pruning end, 8 extra nodes, 0 pruned nodes, max_depth=4
[22:30:39] INFO: C:/buildkite-agent/builds/buildkite-windows-cpu-autoscaling-group-i-03de431ba26204c4d-1/xgboost/xgboost-ci-windows/src/tree/updater_prune.cc:98: tree pruning end, 10 extra nodes, 0 pruned nodes, max_depth=5
[22:30:39] INFO: C:/buildkite-agent/builds/buildkite-windows-cpu-autoscaling-group-i-03de431ba26204c4d-1/xgboost/xgboost-ci-windows/src/tree/updater_prune.cc:98: tree pruning end, 8 extra nodes, 0 pruned nodes, max_depth=4
[22:30:39] INFO: C:/buildkite-agent/builds/buildkite-wind

In [242]:
y_pred=model.predict(x_test)

In [243]:
print(accuracy_score(y_test, y_pred))

1.0


In [61]:
from sklearn.metrics import confusion_matrix  
cm= confusion_matrix(y_test, y_pred)  
cm

array([[37,  0,  0, ...,  0,  0,  0],
       [ 0, 42,  0, ...,  0,  0,  0],
       [ 0,  0, 42, ...,  0,  0,  0],
       ...,
       [ 0,  0,  0, ..., 41,  0,  0],
       [ 0,  0,  0, ...,  0, 40,  0],
       [ 0,  0,  0, ...,  0,  0, 44]], dtype=int64)