## Disease Prediction from Symptoms

For this project, we'll be using the dataset from here: http://people.dbmi.columbia.edu/~friedma/Projects/DiseaseSymptomKB/index.html

Copy the data from all the columns and paste it into an excel sheet -> `raw_data.xlsx`

In [1]:
# Import Dependencies
import csv
import pandas as pd
import numpy as np
from collections import defaultdict
import seaborn as sns
import matplotlib.pyplot as plt
import xlrd
import pickle
%matplotlib inline

In [2]:
# Read Raw Dataset
df = pd.read_excel(r'./dataset/raw_data.xlsx')

In [3]:
df.head()

Unnamed: 0,Disease,Count of Disease Occurrence,Symptom
0,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0008031_pain chest
1,,,UMLS:C0392680_shortness of breath
2,,,UMLS:C0012833_dizziness
3,,,UMLS:C0004093_asthenia
4,,,UMLS:C0085639_fall


In [4]:
# Fill all NaN with the values above
data = df.fillna(method='ffill')

In [5]:
data.head()

Unnamed: 0,Disease,Count of Disease Occurrence,Symptom
0,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0008031_pain chest
1,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0392680_shortness of breath
2,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0012833_dizziness
3,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0004093_asthenia
4,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0085639_fall


In [6]:
# Process Disease and Symptom Names
def process_data(data):
    data_list = []
    data_name = data.replace('^','_').split('_')
    print("data",data_name)
    n = 1
    for names in data_name:
        if (n % 2 == 0):
            data_list.append(names)
        n += 1
    return data_list

In [7]:
# Data Cleanup
disease_list = []
disease_symptom_dict = defaultdict(list)
disease_symptom_count = {}
count = 0

for idx, row in data.iterrows():
    print("row:\n",row)
    # Get the Disease Names
    if (row['Disease'] !="\xc2\xa0") and (row['Disease'] != ""):
        disease = row['Disease']
        disease_list = process_data(data=disease)
        count = row['Count of Disease Occurrence']

    # Get the Symptoms Corresponding to Diseases
    if (row['Symptom'] !="\xc2\xa0") and (row['Symptom'] != ""):
        symptom = row['Symptom']
        symptom_list = process_data(data=symptom)
        for d in disease_list:
            for s in symptom_list:
                disease_symptom_dict[d].append(s)
            disease_symptom_count[d] = count

row:
 Disease                        UMLS:C0020538_hypertensive disease
Count of Disease Occurrence                                  3363
Symptom                                  UMLS:C0008031_pain chest
Name: 0, dtype: object
data ['UMLS:C0020538', 'hypertensive disease']
data ['UMLS:C0008031', 'pain chest']
row:
 Disease                        UMLS:C0020538_hypertensive disease
Count of Disease Occurrence                                  3363
Symptom                         UMLS:C0392680_shortness of breath
Name: 1, dtype: object
data ['UMLS:C0020538', 'hypertensive disease']
data ['UMLS:C0392680', 'shortness of breath']
row:
 Disease                        UMLS:C0020538_hypertensive disease
Count of Disease Occurrence                                  3363
Symptom                                   UMLS:C0012833_dizziness
Name: 2, dtype: object
data ['UMLS:C0020538', 'hypertensive disease']
data ['UMLS:C0012833', 'dizziness']
row:
 Disease                        UMLS:C0020538_hyperten

 Disease                              UMLS:C0002871_anemia
Count of Disease Occurrence                           544
Symptom                        UMLS:C0020625_hyponatremia
Name: 167, dtype: object
data ['UMLS:C0002871', 'anemia']
data ['UMLS:C0020625', 'hyponatremia']
row:
 Disease                           UMLS:C0002871_anemia
Count of Disease Occurrence                        544
Symptom                        UMLS:C0012833_dizziness
Name: 168, dtype: object
data ['UMLS:C0002871', 'anemia']
data ['UMLS:C0012833', 'dizziness']
row:
 Disease                                     UMLS:C0002871_anemia
Count of Disease Occurrence                                  544
Symptom                        UMLS:C0392680_shortness of breath
Name: 169, dtype: object
data ['UMLS:C0002871', 'anemia']
data ['UMLS:C0392680', 'shortness of breath']
row:
 Disease                        UMLS:C0002871_anemia
Count of Disease Occurrence                     544
Symptom                          UMLS:C0030193_p

row:
 Disease                                    UMLS:C0007642_cellulitis
Count of Disease Occurrence                                     341
Symptom                        UMLS:C0376405_patient non compliance
Name: 325, dtype: object
data ['UMLS:C0007642', 'cellulitis']
data ['UMLS:C0376405', 'patient non compliance']
row:
 Disease                         UMLS:C0007642_cellulitis
Count of Disease Occurrence                          341
Symptom                        UMLS:C0020580_hypesthesia
Name: 326, dtype: object
data ['UMLS:C0007642', 'cellulitis']
data ['UMLS:C0020580', 'hypesthesia']
row:
 Disease                         UMLS:C0007642_cellulitis
Count of Disease Occurrence                          341
Symptom                        UMLS:C0034880_hyperacusis
Name: 327, dtype: object
data ['UMLS:C0007642', 'cellulitis']
data ['UMLS:C0034880', 'hyperacusis']
row:
 Disease                        UMLS:C0007642_cellulitis
Count of Disease Occurrence                         341
Symptom

Name: 480, dtype: object
data ['UMLS:C0007097', 'carcinoma']
data ['UMLS:C0858924', 'general discomfort']
row:
 Disease                                UMLS:C0007097_carcinoma
Count of Disease Occurrence                                269
Symptom                        UMLS:C1513183_metastatic lesion
Name: 481, dtype: object
data ['UMLS:C0007097', 'carcinoma']
data ['UMLS:C1513183', 'metastatic lesion']
row:
 Disease                                   UMLS:C0007097_carcinoma
Count of Disease Occurrence                                   269
Symptom                        UMLS:C0850149_non-productive cough
Name: 482, dtype: object
data ['UMLS:C0007097', 'carcinoma']
data ['UMLS:C0850149', 'non-productive cough']
row:
 Disease                           UMLS:C0007097_carcinoma
Count of Disease Occurrence                           269
Symptom                        UMLS:C0009806_constipation
Name: 483, dtype: object
data ['UMLS:C0007097', 'carcinoma']
data ['UMLS:C0009806', 'constipation']
ro

Name: 605, dtype: object
data ['UMLS:C1623038', 'cirrhosis']
data ['UMLS:C0085639', 'fall']
row:
 Disease                           UMLS:C1623038_cirrhosis
Count of Disease Occurrence                           218
Symptom                        UMLS:C0038002_splenomegaly
Name: 606, dtype: object
data ['UMLS:C1623038', 'cirrhosis']
data ['UMLS:C0038002', 'splenomegaly']
row:
 Disease                        UMLS:C1623038_cirrhosis
Count of Disease Occurrence                        218
Symptom                         UMLS:C0033774_pruritus
Name: 607, dtype: object
data ['UMLS:C1623038', 'cirrhosis']
data ['UMLS:C0033774', 'pruritus']
row:
 Disease                             UMLS:C1623038_cirrhosis
Count of Disease Occurrence                             218
Symptom                        UMLS:C0000737_pain abdominal
Name: 608, dtype: object
data ['UMLS:C1623038', 'cirrhosis']
data ['UMLS:C0000737', 'pain abdominal']
row:
 Disease                                  UMLS:C1623038_cirrhosis
Co

data ['UMLS:C0018932', 'hematochezia']
row:
 Disease                        UMLS:C0001418_adenocarcinoma
Count of Disease Occurrence                             166
Symptom                              UMLS:C0231872_egophony
Name: 748, dtype: object
data ['UMLS:C0001418', 'adenocarcinoma']
data ['UMLS:C0231872', 'egophony']
row:
 Disease                        UMLS:C0001418_adenocarcinoma
Count of Disease Occurrence                             166
Symptom                                  UMLS:C0030193_pain
Name: 749, dtype: object
data ['UMLS:C0001418', 'adenocarcinoma']
data ['UMLS:C0030193', 'pain']
row:
 Disease                                             UMLS:C0001418_adenocarcinoma
Count of Disease Occurrence                                                  166
Symptom                        UMLS:C0008767_cicatrisation^UMLS:C0241158_scar...
Name: 750, dtype: object
data ['UMLS:C0001418', 'adenocarcinoma']
data ['UMLS:C0008767', 'cicatrisation', 'UMLS:C0241158', 'scar tissue']
row:

Name: 905, dtype: object
data ['UMLS:C0036341', 'schizophrenia']
data ['UMLS:C0233763', 'hallucinations visual']
row:
 Disease                                              UMLS:C0036341_schizophrenia
Count of Disease Occurrence                                                  147
Symptom                        UMLS:C0041667_underweight^UMLS:C1319518_underw...
Name: 906, dtype: object
data ['UMLS:C0036341', 'schizophrenia']
data ['UMLS:C0041667', 'underweight', 'UMLS:C1319518', 'underweight']
row:
 Disease                        UMLS:C0036341_schizophrenia
Count of Disease Occurrence                            147
Symptom                         UMLS:C0237154_homelessness
Name: 907, dtype: object
data ['UMLS:C0036341', 'schizophrenia']
data ['UMLS:C0237154', 'homelessness']
row:
 Disease                        UMLS:C0012813_diverticulitis
Count of Disease Occurrence                             145
Symptom                        UMLS:C0000737_pain abdominal
Name: 908, dtype: object
data 

Name: 1017, dtype: object
data ['UMLS:C0018801', 'failure heart']
data ['UMLS:C0231807', 'dyspnea on exertion']
row:
 Disease                        UMLS:C0018801_failure heart
Count of Disease Occurrence                            138
Symptom                              UMLS:C0013404_dyspnea
Name: 1018, dtype: object
data ['UMLS:C0018801', 'failure heart']
data ['UMLS:C0013404', 'dyspnea']
row:
 Disease                              UMLS:C0018801_failure heart
Count of Disease Occurrence                                  138
Symptom                        UMLS:C0392680_shortness of breath
Name: 1019, dtype: object
data ['UMLS:C0018801', 'failure heart']
data ['UMLS:C0392680', 'shortness of breath']
row:
 Disease                             UMLS:C0018801_failure heart
Count of Disease Occurrence                                 138
Symptom                        UMLS:C0232258_pansystolic murmur
Name: 1020, dtype: object
data ['UMLS:C0018801', 'failure heart']
data ['UMLS:C0232258', 'pans

data ['UMLS:C1510475', 'diverticulosis']
data ['UMLS:C0027497', 'nausea']
row:
 Disease                          UMLS:C1510475_diverticulosis
Count of Disease Occurrence                               114
Symptom                        UMLS:C0424000_feeling suicidal
Name: 1161, dtype: object
data ['UMLS:C1510475', 'diverticulosis']
data ['UMLS:C0424000', 'feeling suicidal']
row:
 Disease                          UMLS:C1510475_diverticulosis
Count of Disease Occurrence                               114
Symptom                        UMLS:C0150041_feeling hopeless
Name: 1162, dtype: object
data ['UMLS:C1510475', 'diverticulosis']
data ['UMLS:C0150041', 'feeling hopeless']
row:
 Disease                                UMLS:C0038663_suicide attempt
Count of Disease Occurrence                                      114
Symptom                        UMLS:C0233762_hallucinations auditory
Name: 1163, dtype: object
data ['UMLS:C0038663', 'suicide attempt']
data ['UMLS:C0233762', 'hallucinations au

data ['UMLS:C0022658', 'kidney disease']
data ['UMLS:C0151706', 'bleeding of vagina']
row:
 Disease                        UMLS:C0023267_fibroid tumor
Count of Disease Occurrence                             96
Symptom                               UMLS:C0233071_para 2
Name: 1325, dtype: object
data ['UMLS:C0023267', 'fibroid tumor']
data ['UMLS:C0233071', 'para 2']
row:
 Disease                        UMLS:C0023267_fibroid tumor
Count of Disease Occurrence                             96
Symptom                          UMLS:C0019080_haemorrhage
Name: 1326, dtype: object
data ['UMLS:C0023267', 'fibroid tumor']
data ['UMLS:C0019080', 'haemorrhage']
row:
 Disease                        UMLS:C0023267_fibroid tumor
Count of Disease Occurrence                             96
Symptom                             UMLS:C0156543_abortion
Name: 1327, dtype: object
data ['UMLS:C0023267', 'fibroid tumor']
data ['UMLS:C0156543', 'abortion']
row:
 Disease                                        UMLS:C00

row:
 Disease                        UMLS:C0031212_personality disorder
Count of Disease Occurrence                                    84
Symptom                                   UMLS:C0424109_weepiness
Name: 1491, dtype: object
data ['UMLS:C0031212', 'personality disorder']
data ['UMLS:C0424109', 'weepiness']
row:
 Disease                        UMLS:C0031212_personality disorder
Count of Disease Occurrence                                    84
Symptom                              UMLS:C0344315_mood depressed
Name: 1492, dtype: object
data ['UMLS:C0031212', 'personality disorder']
data ['UMLS:C0344315', 'mood depressed']
row:
 Disease                        UMLS:C0031212_personality disorder
Count of Disease Occurrence                                    84
Symptom                               UMLS:C1384489_scratch marks
Name: 1493, dtype: object
data ['UMLS:C0031212', 'personality disorder']
data ['UMLS:C1384489', 'scratch marks']
row:
 Disease                        UMLS:C0031212_p

data ['UMLS:C0032326', 'pneumothorax']
data ['UMLS:C0037580', 'soft tissue swelling']
row:
 Disease                        UMLS:C0032326_pneumothorax
Count of Disease Occurrence                            68
Symptom                          UMLS:C0242453_prostatism
Name: 1658, dtype: object
data ['UMLS:C0032326', 'pneumothorax']
data ['UMLS:C0242453', 'prostatism']
row:
 Disease                        UMLS:C0032326_pneumothorax
Count of Disease Occurrence                            68
Symptom                           UMLS:C0085631_agitation
Name: 1659, dtype: object
data ['UMLS:C0032326', 'pneumothorax']
data ['UMLS:C0085631', 'agitation']
row:
 Disease                             UMLS:C0011206_delirium
Count of Disease Occurrence                             68
Symptom                        UMLS:C1273573_unsteady gait
Name: 1660, dtype: object
data ['UMLS:C0011206', 'delirium']
data ['UMLS:C1273573', 'unsteady gait']
row:
 Disease                        UMLS:C0011206_delirium
Count o

 Disease                          UMLS:C1258215_ileus
Count of Disease Occurrence                       56
Symptom                        UMLS:C0205400_thicken
Name: 1818, dtype: object
data ['UMLS:C1258215', 'ileus']
data ['UMLS:C0205400', 'thicken']
row:
 Disease                            UMLS:C1258215_ileus
Count of Disease Occurrence                         56
Symptom                        UMLS:C0232995_gravida 0
Name: 1819, dtype: object
data ['UMLS:C1258215', 'ileus']
data ['UMLS:C0232995', 'gravida 0']
row:
 Disease                                 UMLS:C1258215_ileus
Count of Disease Occurrence                              56
Symptom                        UMLS:C0000737_pain abdominal
Name: 1820, dtype: object
data ['UMLS:C1258215', 'ileus']
data ['UMLS:C0000737', 'pain abdominal']
row:
 Disease                          UMLS:C0001511_adhesion
Count of Disease Occurrence                          57
Symptom                        UMLS:C0016204_flatulence
Name: 1821, dtype: objec

In [8]:
# See that the data is Processed Correctly
disease_symptom_dict

defaultdict(list,
            {'hypertensive disease': ['pain chest',
              'shortness of breath',
              'dizziness',
              'asthenia',
              'fall',
              'syncope',
              'vertigo',
              'sweat',
              'sweating increased',
              'palpitation',
              'nausea',
              'angina pectoris',
              'pressure chest'],
             'diabetes': ['polyuria',
              'polydypsia',
              'shortness of breath',
              'pain chest',
              'asthenia',
              'nausea',
              'orthopnea',
              'rale',
              'sweat',
              'sweating increased',
              'unresponsiveness',
              'mental status changes',
              'vertigo',
              'vomiting',
              'labored breathing'],
             'depression mental': ['feeling suicidal',
              'suicidal',
              'hallucinations auditory',
              'feel

In [9]:
# Count of Disease Occurence w.r.t each Disease
disease_symptom_count

{'hypertensive disease': 3363.0,
 'diabetes': 1421.0,
 'depression mental': 1337.0,
 'depressive disorder': 1337.0,
 'coronary arteriosclerosis': 1284.0,
 'coronary heart disease': 1284.0,
 'pneumonia': 1029.0,
 'failure heart congestive': 963.0,
 'accident\xa0cerebrovascular': 885.0,
 'asthma': 835.0,
 'myocardial infarction': 759.0,
 'hypercholesterolemia': 685.0,
 'infection': 630.0,
 'infection urinary tract': 597.0,
 'anemia': 544.0,
 'chronic obstructive airway disease': 524.0,
 'dementia': 504.0,
 'insufficiency renal': 445.0,
 'confusion': 408.0,
 'degenerative\xa0polyarthritis': 405.0,
 'hypothyroidism': 398.0,
 'anxiety state': 390.0,
 'malignant neoplasms': 354.0,
 'primary malignant neoplasm': 354.0,
 'acquired\xa0immuno-deficiency syndrome': 350.0,
 'HIV': 350.0,
 'hiv infections': 350.0,
 'cellulitis': 341.0,
 'gastroesophageal reflux disease': 325.0,
 'septicemia': 311.0,
 'systemic infection': 311.0,
 'sepsis (invertebrate)': 311.0,
 'deep vein thrombosis': 310.0,
 'deh

In [10]:
# Save cleaned data as CSV
f = open('./dataset/cleaned_data.csv', 'w')

with f:
    writer = csv.writer(f)
    for key, val in disease_symptom_dict.items():
        for i in range(len(val)):
            writer.writerow([key, val[i], disease_symptom_count[key]])

In [11]:
# Read Cleaned Data as DF
df1 = pd.read_csv('./dataset/cleaned_data.csv',encoding='latin1')

In [12]:
df1.columns = ['disease', 'symptom', 'occurence_count']
df1.head()

Unnamed: 0,disease,symptom,occurence_count
0,hypertensive disease,shortness of breath,3363.0
1,hypertensive disease,dizziness,3363.0
2,hypertensive disease,asthenia,3363.0
3,hypertensive disease,fall,3363.0
4,hypertensive disease,syncope,3363.0


In [13]:
# Remove any rows with empty values
df1.replace(float('nan'), np.nan, inplace=True)
df1.dropna(inplace=True)

In [14]:
from sklearn import preprocessing

In [15]:
n_unique = len(df1['symptom'].unique())
n_unique

404

In [16]:
df1.dtypes

disease             object
symptom             object
occurence_count    float64
dtype: object

In [17]:
# Encode the Labels
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(df1['symptom'])
print(integer_encoded)

[328  87  28 ... 361 130 122]


In [18]:
# One Hot Encode the Labels
onehot_encoder = OneHotEncoder(sparse=False)
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
print(onehot_encoded)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [19]:
onehot_encoded[0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

In [20]:
len(onehot_encoded[0])

404

In [21]:
cols = np.asarray(df1['symptom'].unique())
cols

array(['shortness of breath', 'dizziness', 'asthenia', 'fall', 'syncope',
       'vertigo', 'sweat', 'sweating increased', 'palpitation', 'nausea',
       'angina pectoris', 'pressure chest', 'polyuria', 'polydypsia',
       'pain chest', 'orthopnea', 'rale', 'unresponsiveness',
       'mental status changes', 'vomiting', 'labored breathing',
       'feeling suicidal', 'suicidal', 'hallucinations auditory',
       'feeling hopeless', 'weepiness', 'sleeplessness',
       'motor retardation', 'irritable mood', 'blackout',
       'mood depressed', 'hallucinations visual', 'worry', 'agitation',
       'tremor', 'intoxication', 'verbal auditory hallucinations',
       'energy increased', 'difficulty', 'nightmare',
       'unable to concentrate', 'homelessness', 'hypokinesia',
       'dyspnea on exertion', 'chest tightness', 'cough', 'fever',
       'decreased translucency', 'productive cough', 'pleuritic pain',
       'yellow sputum', 'breath sounds decreased', 'chill', 'rhonchus',
       '

In [22]:
# Create a new dataframe to save OHE labels
df_ohe = pd.DataFrame(columns = cols)
df_ohe.head()

Unnamed: 0,shortness of breath,dizziness,asthenia,fall,syncope,vertigo,sweat,sweating increased,palpitation,nausea,...,feces in rectum,prodrome,hypoproteinemia,alcohol binge episode,abdomen acute,air fluid level,catching breath,large-for-dates fetus,immobile,homicidal thoughts


In [23]:
for i in range(len(onehot_encoded)):
    df_ohe.loc[i] = onehot_encoded[i]

In [24]:
df_ohe.head()

Unnamed: 0,shortness of breath,dizziness,asthenia,fall,syncope,vertigo,sweat,sweating increased,palpitation,nausea,...,feces in rectum,prodrome,hypoproteinemia,alcohol binge episode,abdomen acute,air fluid level,catching breath,large-for-dates fetus,immobile,homicidal thoughts
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
len(df_ohe)

2126

In [26]:
# Disease Dataframe
df_disease = df1['disease']
df_disease.head()

0    hypertensive disease
1    hypertensive disease
2    hypertensive disease
3    hypertensive disease
4    hypertensive disease
Name: disease, dtype: object

In [27]:
# Concatenate OHE Labels with the Disease Column
df_concat = pd.concat([df_disease,df_ohe], axis=1)
df_concat.head()

Unnamed: 0,disease,shortness of breath,dizziness,asthenia,fall,syncope,vertigo,sweat,sweating increased,palpitation,...,feces in rectum,prodrome,hypoproteinemia,alcohol binge episode,abdomen acute,air fluid level,catching breath,large-for-dates fetus,immobile,homicidal thoughts
0,hypertensive disease,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,hypertensive disease,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,hypertensive disease,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,hypertensive disease,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,hypertensive disease,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
df_concat.drop_duplicates(keep='first',inplace=True)

In [29]:
df_concat.head()

Unnamed: 0,disease,shortness of breath,dizziness,asthenia,fall,syncope,vertigo,sweat,sweating increased,palpitation,...,feces in rectum,prodrome,hypoproteinemia,alcohol binge episode,abdomen acute,air fluid level,catching breath,large-for-dates fetus,immobile,homicidal thoughts
0,hypertensive disease,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,hypertensive disease,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,hypertensive disease,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,hypertensive disease,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,hypertensive disease,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [59]:
cols = df_concat.columns
cols

Index(['disease', 'shortness of breath', 'dizziness', 'asthenia', 'fall',
       'syncope', 'vertigo', 'sweat', 'sweating increased', 'palpitation',
       ...
       'feces in rectum', 'prodrome', 'hypoproteinemia',
       'alcohol binge episode', 'abdomen acute', 'air fluid level',
       'catching breath', 'large-for-dates fetus', 'immobile',
       'homicidal thoughts'],
      dtype='object', length=405)

In [54]:
len(cols)

405

In [31]:
cols = cols[1:]

In [55]:
len(cols)

405

In [58]:
len(cols.unique())

405

In [32]:
# Since, every disease has multiple symptoms, combine all symptoms per disease per row
df_concat = df_concat.groupby('disease').sum()
df_concat = df_concat.reset_index()
df_concat[:5]

Unnamed: 0,disease,shortness of breath,dizziness,asthenia,fall,syncope,vertigo,sweat,sweating increased,palpitation,...,feces in rectum,prodrome,hypoproteinemia,alcohol binge episode,abdomen acute,air fluid level,catching breath,large-for-dates fetus,immobile,homicidal thoughts
0,Alzheimer's disease,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,HIV,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Pneumocystis carinii pneumonia,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,accident cerebrovascular,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,acquired immuno-deficiency syndrome,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
len(df_concat)

149

In [34]:
df_concat.to_csv("./dataset/training_dataset.csv", index=False)

In [35]:
# One Hot Encoded Features
X = df_concat[cols]

# Labels
y = df_concat['disease']

## Model Training

In [36]:
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier

In [37]:
# Train Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

In [38]:
len(X_train), len(y_train)

(119, 119)

In [39]:
len(X_test), len(y_test)

(30, 30)

In [40]:
dt = DecisionTreeClassifier()
clf_dt=dt.fit(X, y)

In [41]:
clf_dt.score(X, y)

0.9731543624161074

In [42]:
disease_pred = clf_dt.predict(X_test)

In [43]:
disease_real = y_test.values

In [44]:
for i in range(0, len(disease_real)):
#     print ('Pred: {0}\nActual: {1}\n'.format(disease_pred[i], disease_real[i]))
    if disease_pred[i]!=disease_real[i]:
        print ('Pred: {0}\nActual: {1}\n'.format(disease_pred[i], disease_real[i]))

Pred: malignant neoplasms
Actual: primary malignant neoplasm



In [52]:
# lm.fit(X,y)
pickle.dump(clf_dt,open('model_predict','wb'))
my_model = pickle.load(open('model_predict','rb'))
print(my_model.predict([[['movement_stiffness', 'muscle_weakness', 'painful_walking', 'stiff_neck', 'swelling_joints']]]))

ValueError: could not convert string to float: 'movement_stiffness'

In [50]:
X_test.head(1)

Unnamed: 0,shortness of breath,dizziness,asthenia,fall,syncope,vertigo,sweat,sweating increased,palpitation,nausea,...,feces in rectum,prodrome,hypoproteinemia,alcohol binge episode,abdomen acute,air fluid level,catching breath,large-for-dates fetus,immobile,homicidal thoughts
16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
