## Disease Prediction from Symptoms

Dataset Source: Raw data from [here](http://people.dbmi.columbia.edu/~friedma/Projects/DiseaseSymptomKB/index.html) 

In [1]:
# Import Dependencies
import csv
import pandas as pd
import numpy as np
from collections import defaultdict
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
df=pd.read_csv("C:/Users/Scarlett RED/Desktop/project.csv", encoding='cp1252')
data=df.iloc[7:,]
data.head()
new_header = data.iloc[0] #grab the first row for the header
data = data[1:] #take the data less the header row
data.columns = new_header #set the header row as the data header
new_header

Unnamed: 0                        Disease
Unnamed: 1    Count of Disease Occurrence
Unnamed: 2                        Symptom
Name: 7, dtype: object

In [3]:
fill=data.fillna(method='ffill')

In [4]:
fill.head()

7,Disease,Count of Disease Occurrence,Symptom
8,UMLS:C0020538_hypertensive disease,3363,UMLS:C0008031_pain chest
9,UMLS:C0020538_hypertensive disease,3363,UMLS:C0392680_shortness of breath
10,UMLS:C0020538_hypertensive disease,3363,UMLS:C0012833_dizziness
11,UMLS:C0020538_hypertensive disease,3363,UMLS:C0004093_asthenia
12,UMLS:C0020538_hypertensive disease,3363,UMLS:C0085639_fall


In [5]:
#fill['Symptom']

In [6]:
data.head()

7,Disease,Count of Disease Occurrence,Symptom
8,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0008031_pain chest
9,,,UMLS:C0392680_shortness of breath
10,,,UMLS:C0012833_dizziness
11,,,UMLS:C0004093_asthenia
12,,,UMLS:C0085639_fall


In [7]:
# Process Disease and Symptom Names
def process_data(fill):
    data_list = []
    data_name = fill.replace('^','_').split('_')
    n = 1
    for names in data_name:
        if (n % 2 == 0):
            data_list.append(names)
        n += 1
    return data_list

In [8]:
disease_list = []
disease_symptom_dict = defaultdict(list)
disease_symptom_count = {}
count = 0

for idx, row in fill.iterrows():
    
    # Get the Disease Names
    if (row['Disease'] !="\xc2\xa0") and (row['Disease'] != ""):
        disease = row['Disease']
        disease_list = process_data(fill=disease)
        count = row['Count of Disease Occurrence']

    # Get the Symptoms Corresponding to Diseases
    if (row['Symptom'] !="\xc2\xa0") and (row['Symptom'] != ""):
        symptom = row['Symptom']
        symptom_list = process_data(fill=symptom)
        for d in disease_list:
            for s in symptom_list:
                disease_symptom_dict[d].append(s)
            disease_symptom_count[d] = count

In [9]:
# See that the data is Processed Correctly
disease_symptom_dict

defaultdict(list,
            {'hypertensive disease': ['pain chest',
              'shortness of breath',
              'dizziness',
              'asthenia',
              'fall',
              'syncope',
              'vertigo',
              'sweat',
              'sweating increased',
              'palpitation',
              'nausea',
              'angina pectoris',
              'pressure chest'],
             'diabetes': ['polyuria',
              'polydypsia',
              'shortness of breath',
              'pain chest',
              'asthenia',
              'nausea',
              'orthopnea',
              'rale',
              'sweat',
              'sweating increased',
              'unresponsiveness',
              'mental status changes',
              'vertigo',
              'vomiting',
              'labored breathing'],
             'depression mental': ['feeling suicidal',
              'suicidal',
              'hallucinations auditory',
              'feel

In [10]:
# Count of Disease Occurence w.r.t each Disease
disease_symptom_count

{'hypertensive disease': '3363',
 'diabetes': '1421',
 'depression mental': '1337',
 'depressive disorder': '1337',
 'coronary arteriosclerosis': '1284',
 'coronary heart disease': '1284',
 'pneumonia': '1029',
 'failure heart congestive': '963',
 'accident\xa0cerebrovascular': '885',
 'asthma': '835',
 'myocardial infarction': '759',
 'hypercholesterolemia': '685',
 'infection': '630',
 'infection urinary tract': '597',
 'anemia': '544',
 'chronic obstructive airway disease': '524',
 'dementia': '504',
 'insufficiency renal': '445',
 'confusion': '408',
 'degenerative\xa0polyarthritis': '405',
 'hypothyroidism': '398',
 'anxiety state': '390',
 'malignant neoplasms': '354',
 'primary malignant neoplasm': '354',
 'acquired\xa0immuno-deficiency syndrome': '350',
 'HIV': '350',
 'hiv infections': '350',
 'cellulitis': '341',
 'gastroesophageal reflux disease': '325',
 'septicemia': '311',
 'systemic infection': '311',
 'sepsis (invertebrate)': '311',
 'deep vein thrombosis': '310',
 'deh

In [11]:
df1 = pd.DataFrame(list(disease_symptom_dict.items()), columns=['Disease','Symptom'])

In [38]:
df1

Unnamed: 0,Disease,Symptom
0,hypertensive disease,"[pain chest, shortness of breath, dizziness, a..."
1,diabetes,"[polyuria, polydypsia, shortness of breath, pa..."
2,depression mental,"[feeling suicidal, suicidal, hallucinations au..."
3,depressive disorder,"[feeling suicidal, suicidal, hallucinations au..."
4,coronary arteriosclerosis,"[pain chest, angina pectoris, shortness of bre..."
5,coronary heart disease,"[pain chest, angina pectoris, shortness of bre..."
6,pneumonia,"[cough, fever, decreased translucency, shortne..."
7,failure heart congestive,"[shortness of breath, orthopnea, jugular venou..."
8,accident cerebrovascular,"[dysarthria, asthenia, speech slurred, facial ..."
9,asthma,"[wheezing, cough, shortness of breath, chest t..."


In [17]:
for x in range(0,len(df1['Symptom'])):
    dm=pd.get_dummies(df1['Symptom'][x])
    dm.insert(0,"Disease",df1['Disease'][x], True)
    new=dm.groupby(['Disease']).sum()
    
    table=pd.concat([new,table])
table


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


Unnamed: 0_level_0,Unnamed: 1_level_0,Heberden's node,Murphy's sign,Stahli's line,abdomen acute,abdominal bloating,abdominal tenderness,abnormal sensation,abnormally hard consistency,abortion,...,vision blurred,vomiting,weepiness,weight gain,welt,wheelchair bound,wheezing,withdraw,worry,yellow sputum
Disease,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
decubitus ulcer,,,,,,,,,,,...,,,,,,,,,,
affect labile,,,,,,,,,,,...,,,,,,,,,,
delusion,,,,,,,,,,,...,,,1.0,,,,,,1.0,
adhesion,,,,,,,,,,,...,,1.0,,,,,,,,
ileus,,,,,1.0,,1.0,,,,...,,,,,,,,,,
tachycardia sinus,,,,,,,,,,,...,,1.0,,,,,,,,
biliary calculus,,,,,,,,,,,...,,1.0,,,,,,,,
cholelithiasis,,,,,,,,,,,...,,1.0,,,,,,,,
pancytopenia,,,,,,,,,,,...,,,,,,,,,,
migraine disorders,,,,,,,,,,,...,,1.0,,,,,,,,


In [23]:
fill=table.fillna(0)
fill

Unnamed: 0_level_0,Unnamed: 1_level_0,Heberden's node,Murphy's sign,Stahli's line,abdomen acute,abdominal bloating,abdominal tenderness,abnormal sensation,abnormally hard consistency,abortion,...,vision blurred,vomiting,weepiness,weight gain,welt,wheelchair bound,wheezing,withdraw,worry,yellow sputum
Disease,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
decubitus ulcer,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
affect labile,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
delusion,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
adhesion,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ileus,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
tachycardia sinus,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
biliary calculus,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
cholelithiasis,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
pancytopenia,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
migraine disorders,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [39]:
import pandas as pd
x=[]
for col in fill.columns:
    print(col)
    x=col
    


Heberden's node
Murphy's sign
Stahli's line
abdomen acute
abdominal bloating
abdominal tenderness
abnormal sensation
abnormally hard consistency
abortion
abscess bacterial
absences finding
achalasia
ache
adverse effect
adverse reaction
agitation
air fluid level
alcohol binge episode
alcoholic withdrawal symptoms
ambidexterity
angina pectoris
anorexia
anosmia
aphagia
apyrexial
arthralgia
ascites
asterixis
asthenia
asymptomatic
ataxia
atypia
aura
awakening early
barking cough
bedridden
behavior hyperactive
behavior showing increased motor activity
blackout
blanch
bleeding of vagina
bowel sounds decreased
bradycardia
bradykinesia
breakthrough pain
breath sounds decreased
breath-holding spell
breech presentation
bruit
burning sensation
cachexia
cardiomegaly
cardiovascular event
cardiovascular finding
catatonia
catching breath
charleyhorse
chest discomfort
chest tightness
chill
choke
cicatrisation
clammy skin
claudication
clonus
clumsiness
colic abdominal
consciousness clear
constipation
c

In [40]:
x

'yellow sputum'

In [27]:
def unique(col):
    new=[]
    unique_list = [] 
    for x in col:
        if x not in unique_list:
            unique_list.append(x)
            
    for x in unique_list:
        new.append(x)
print(new)

                 fever  frail  systolic murmur
Disease                                       
decubitus ulcer      2      1                1


In [22]:
unique_list

NameError: name 'unique_list' is not defined

In [18]:
table.info()

<class 'pandas.core.frame.DataFrame'>
Index: 151 entries, decubitus ulcer to decubitus ulcer
Columns: 405 entries,  to yellow sputum
dtypes: float64(405)
memory usage: 479.0+ KB


In [29]:

for vals in disease_symptom_count.items():
    print(vals[1])

3363
1421
1337
1337
1284
1284
1029
963
885
835
759
685
630
597
544
524
504
445
408
405
398
390
354
354
350
350
350
341
325
311
311
311
310
297
297
294
290
283
280
269
269
268
267
247
241
228
226
218
208
192
186
186
179
172
171
169
168
166
165
165
165
165
164
163
163
161
160
158
152
152
147
145
144
143
142
140
142
140
140
138
135
133
128
126
124
123
122
119
114
114
114
113
111
108
105
104
103
101
101
99
99
99
96
96
95
94
94
94
93
92
92
90
90
87
87
86
86
85
84
82
80
80
76
76
76
74
71
71
71
70
68
68
68
68
68
67
67
66
61
61
61
61
61
56
56
57
56
45
42


In [30]:
df1.head()

Unnamed: 0,Disease,Symptom
0,hypertensive disease,"[pain chest, shortness of breath, dizziness, a..."
1,diabetes,"[polyuria, polydypsia, shortness of breath, pa..."
2,depression mental,"[feeling suicidal, suicidal, hallucinations au..."
3,depressive disorder,"[feeling suicidal, suicidal, hallucinations au..."
4,coronary arteriosclerosis,"[pain chest, angina pectoris, shortness of bre..."


In [82]:
list2=df1['Disease']
list3=df1['Symptom']

0                      hypertensive disease
1                                  diabetes
2                         depression mental
3                       depressive disorder
4                 coronary arteriosclerosis
5                    coronary heart disease
6                                 pneumonia
7                  failure heart congestive
8                  accident cerebrovascular
9                                    asthma
10                    myocardial infarction
11                     hypercholesterolemia
12                                infection
13                  infection urinary tract
14                                   anemia
15       chronic obstructive airway disease
16                                 dementia
17                      insufficiency renal
18                                confusion
19               degenerative polyarthritis
20                           hypothyroidism
21                            anxiety state
22                      malignan

In [74]:
list=df1['Symptom'][0]
#list

In [75]:
x=list[0]

In [76]:
for a in range(0,len(list3)):
    list1=list3[a]
    for b in range(0,len(list3[a])):
        if(x==list1[b]):
            y=1
            symptom_1.insert(b,y)
            symptom_1
        else:
            y=0
            symptom_1.insert(b,y)
            symptom_1

In [79]:
len(symptom_1)

4395

In [60]:
symptom_1=[]
for a in range(0,len(list)):
    list1=list[a]
    for b in range(0,len(list1)):
        if(x==list1[b]):
            y=1
            symptom_1.insert(b,y)
            symptom_1
        else:
            y=0
            symptom_1.insert(b,y)
            symptom_1


In [95]:
symptom_1

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [96]:
len(symptom_1)

4395