In [1]:
# Import Dependencies
import csv
import pandas as pd
import numpy as np
from collections import  
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import pickle

In [2]:
# Read Raw Dataset
df = pd.read_excel('./dataset/raw_data.xlsx')

In [3]:
df.head()

Unnamed: 0,Disease,Count of Disease Occurrence,Symptom
0,UMLS:C0011847_diabetes,1421.0,UMLS:C0032617_polyuria
1,,,UMLS:C0085602_polydypsia
2,,,UMLS:C0008031_pain chest
3,,,UMLS:C0027497_nausea
4,,,UMLS:C0241526_unresponsiveness


In [4]:
# Fill all NaN with the values above
data = df.fillna(method='ffill')

In [5]:
data.head()

Unnamed: 0,Disease,Count of Disease Occurrence,Symptom
0,UMLS:C0011847_diabetes,1421.0,UMLS:C0032617_polyuria
1,UMLS:C0011847_diabetes,1421.0,UMLS:C0085602_polydypsia
2,UMLS:C0011847_diabetes,1421.0,UMLS:C0008031_pain chest
3,UMLS:C0011847_diabetes,1421.0,UMLS:C0027497_nausea
4,UMLS:C0011847_diabetes,1421.0,UMLS:C0241526_unresponsiveness


In [6]:
# Process Disease and Symptom Names
def process_data(data):
    data_list = []
    data_name = data.replace('^','_').split('_')
    n = 1
    for names in data_name:
        if (n % 2 == 0):
            data_list.append(names)
        n += 1
    return data_list

In [7]:
# Data Cleanup
disease_list = []
disease_symptom_dict = defaultdict(list)
disease_symptom_count = {}
count = 0

for idx, row in data.iterrows():
    
    # Get the Disease Names
    if (row['Disease'] !="\xc2\xa0") and (row['Disease'] != ""):
        disease = row['Disease']
        disease_list = process_data(data=disease)
        count = row['Count of Disease Occurrence']

    # Get the Symptoms Corresponding to Diseases
    if (row['Symptom'] !="\xc2\xa0") and (row['Symptom'] != ""):
        symptom = row['Symptom']
        symptom_list = process_data(data=symptom)
        for d in disease_list:
            for s in symptom_list:
                disease_symptom_dict[d].append(s)
            disease_symptom_count[d] = count

In [8]:
# See that the data is Processed Correctly
disease_symptom_dict

defaultdict(list,
            {'diabetes': ['polyuria',
              'polydypsia',
              'pain chest',
              'nausea',
              'unresponsiveness',
              'vertigo',
              'vomiting',
              'labored breathing'],
             'accident\xa0cerebrovascular': ['dysarthria',
              'asthenia',
              'speech slurred',
              'facial paresis',
              'hemiplegia',
              'unresponsiveness',
              'seizure',
              'numbness'],
             'chronic obstructive airway disease': ['shortness of breath',
              'wheezing',
              'cough',
              'dyspnea',
              'hypercapnia',
              'chest tightness'],
             'carcinoma': ['mass of body structure',
              'pain',
              'lesion',
              'thicken',
              'decreased body weight',
              'hoarseness',
              'general discomfort',
              'metastatic lesion',
      

In [9]:
# Count of Disease Occurence w.r.t each Disease
disease_symptom_count

{'diabetes': 1421.0,
 'accident\xa0cerebrovascular': 885.0,
 'chronic obstructive airway disease': 524.0,
 'carcinoma': 269.0,
 'failure heart': 138.0,
 "Alzheimer's disease": 101.0,
 'obesity morbid': 76.0,
 'chronic alcoholic intoxication': 70.0}

In [10]:
# Save cleaned data as CSV
f = open('./dataset/cleaned_data.csv', 'w')

with f:
    writer = csv.writer(f)
    for key, val in disease_symptom_dict.items():
        for i in range(len(val)):
            writer.writerow([key, val[i], disease_symptom_count[key]])

In [11]:
# Read Cleaned Data as DF
df = pd.read_csv('dataset/cleaned_data.csv',encoding='latin1')
df.columns = ['disease', 'symptom', 'occurence_count']
df.head()

Unnamed: 0,disease,symptom,occurence_count
0,diabetes,polydypsia,1421.0
1,diabetes,pain chest,1421.0
2,diabetes,nausea,1421.0
3,diabetes,unresponsiveness,1421.0
4,diabetes,vertigo,1421.0


In [12]:
# Remove any rows with empty values
df.replace(float('nan'), np.nan, inplace=True)
df.dropna(inplace=True)

In [13]:
from sklearn import preprocessing

In [14]:
n_unique = len(df['symptom'].unique())
n_unique

51

In [15]:
df.dtypes

disease             object
symptom             object
occurence_count    float64
dtype: object

In [16]:
# Encode the Labels
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(df['symptom'])
print(integer_encoded)

[37 34 29 47 48 49 24  8  2 41 10 18 47 38 30 39 50  5  9 20  4 26 33 25
 42  6 19 15 27 36 31 11  9 39 23 21  1  7  0 14 44 10 16 28 43 32  5 46
 50 13 43 17 22 12 40 45 35  3]


In [17]:
# One Hot Encode the Labels
onehot_encoder = OneHotEncoder(sparse=False)
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
print(onehot_encoded)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [18]:
onehot_encoded[0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [19]:
len(onehot_encoded[0])

51

In [20]:
cols = np.asarray(df['symptom'].unique())
cols

array(['polydypsia', 'pain chest', 'nausea', 'unresponsiveness',
       'vertigo', 'vomiting', 'labored breathing', 'dysarthria',
       'asthenia', 'speech slurred', 'facial paresis', 'hemiplegia',
       'seizure', 'numbness', 'shortness of breath', 'wheezing', 'cough',
       'dyspnea', 'hypercapnia', 'chest tightness',
       'mass of body structure', 'pain', 'lesion', 'thicken',
       'decreased body weight', 'hoarseness', 'general discomfort',
       'metastatic lesion', 'paresthesia', 'orthopnea', 'fatigue',
       'jugular venous distention', 'hypotension', 'angina pectoris',
       'drool', 'agitation', 'frail', 'tremor resting', 'groggy',
       'muscle twitch', 'tremor', 'out of breath', 'unhappy',
       'feels hot/feverish', 'hallucinations auditory', 'irritable mood',
       'feeling suicidal', 'sleeplessness', 'unconscious state', 'panic',
       'breath sounds decreased'], dtype=object)

In [21]:
# Create a new dataframe to save OHE labels
df_ohe = pd.DataFrame(columns = cols)
df_ohe.head()

Unnamed: 0,polydypsia,pain chest,nausea,unresponsiveness,vertigo,vomiting,labored breathing,dysarthria,asthenia,speech slurred,...,out of breath,unhappy,feels hot/feverish,hallucinations auditory,irritable mood,feeling suicidal,sleeplessness,unconscious state,panic,breath sounds decreased


In [22]:
for i in range(len(onehot_encoded)):
    df_ohe.loc[i] = onehot_encoded[i]

In [23]:
df_ohe.head()

Unnamed: 0,polydypsia,pain chest,nausea,unresponsiveness,vertigo,vomiting,labored breathing,dysarthria,asthenia,speech slurred,...,out of breath,unhappy,feels hot/feverish,hallucinations auditory,irritable mood,feeling suicidal,sleeplessness,unconscious state,panic,breath sounds decreased
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [24]:
len(df_ohe)

58

In [25]:
# Disease Dataframe
df_disease = df['disease']
df_disease.head()

0    diabetes
1    diabetes
2    diabetes
3    diabetes
4    diabetes
Name: disease, dtype: object

In [26]:
# Concatenate OHE Labels with the Disease Column
df_concat = pd.concat([df_disease,df_ohe], axis=1)
df_concat.head()

Unnamed: 0,disease,polydypsia,pain chest,nausea,unresponsiveness,vertigo,vomiting,labored breathing,dysarthria,asthenia,...,out of breath,unhappy,feels hot/feverish,hallucinations auditory,irritable mood,feeling suicidal,sleeplessness,unconscious state,panic,breath sounds decreased
0,diabetes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,diabetes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,diabetes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,diabetes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,diabetes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [27]:
df_concat.drop_duplicates(keep='first',inplace=True)

In [28]:
df_concat.head()

Unnamed: 0,disease,polydypsia,pain chest,nausea,unresponsiveness,vertigo,vomiting,labored breathing,dysarthria,asthenia,...,out of breath,unhappy,feels hot/feverish,hallucinations auditory,irritable mood,feeling suicidal,sleeplessness,unconscious state,panic,breath sounds decreased
0,diabetes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,diabetes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,diabetes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,diabetes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,diabetes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [29]:
len(df_concat)

58

In [30]:
cols = df_concat.columns
cols

Index(['disease', 'polydypsia', 'pain chest', 'nausea', 'unresponsiveness',
       'vertigo', 'vomiting', 'labored breathing', 'dysarthria', 'asthenia',
       'speech slurred', 'facial paresis', 'hemiplegia', 'seizure', 'numbness',
       'shortness of breath', 'wheezing', 'cough', 'dyspnea', 'hypercapnia',
       'chest tightness', 'mass of body structure', 'pain', 'lesion',
       'thicken', 'decreased body weight', 'hoarseness', 'general discomfort',
       'metastatic lesion', 'paresthesia', 'orthopnea', 'fatigue',
       'jugular venous distention', 'hypotension', 'angina pectoris', 'drool',
       'agitation', 'frail', 'tremor resting', 'groggy', 'muscle twitch',
       'tremor', 'out of breath', 'unhappy', 'feels hot/feverish',
       'hallucinations auditory', 'irritable mood', 'feeling suicidal',
       'sleeplessness', 'unconscious state', 'panic',
       'breath sounds decreased'],
      dtype='object')

In [31]:
cols = cols[1:]

In [32]:
# Since, every disease has multiple symptoms, combine all symptoms per disease per row
df_concat = df_concat.groupby('disease').sum()
df_concat = df_concat.reset_index()
df_concat[:5]

Unnamed: 0,disease,polydypsia,pain chest,nausea,unresponsiveness,vertigo,vomiting,labored breathing,dysarthria,asthenia,...,out of breath,unhappy,feels hot/feverish,hallucinations auditory,irritable mood,feeling suicidal,sleeplessness,unconscious state,panic,breath sounds decreased
0,Alzheimer's disease,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,accident cerebrovascular,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,carcinoma,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,chronic alcoholic intoxication,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,chronic obstructive airway disease,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [33]:
len(df_concat)

8

In [34]:
df_concat.to_csv("./dataset/training_dataset.csv", index=False)

In [35]:
# One Hot Encoded Features
X = df_concat[cols]

# Labels
y = df_concat['disease']

## Model Training

In [36]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

In [37]:
# Train Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

In [38]:
len(X_train), len(y_train)

(6, 6)

In [39]:
len(X_test), len(y_test)

(2, 2)

In [40]:
# Initialize the models
clf_dt = RandomForestClassifier(random_state = 1)
clf_dt.fit(X,y)

RandomForestClassifier(random_state=1)

In [41]:
#the accuracy
clf_dt.score(X, y)

1.0

In [42]:
disease_pred = clf_dt.predict(X)
print(disease_pred[3])

chronic alcoholic intoxication


In [43]:
disease_real = y.values
print(disease_real[3])

chronic alcoholic intoxication


In [44]:
for i in range(0, len(disease_real)):
    if disease_pred[i]!=disease_real[i]:
        print ('Pred: {0}\nActual: {1}\n'.format(disease_pred[i], disease_real[i]))

pickling objects

In [45]:
pickle.dump(clf_dt, open('model.pkl','wb'))

Initialize flask

In [46]:
#import libraries
import numpy as np
from flask import Flask, render_template,request
import pickle#Initialize the flask App
app = Flask(__name__)
models = pickle.load(open('model.pkl', 'rb'))