In [1]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras.models import save_model, load_model
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, TensorBoard
from keras.utils import np_utils
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder

#### Load Symptom Dataset

In [2]:
data = pd.read_csv('./Data/dataset.csv', sep = ',')

####  Load Symptom Severity 

In [3]:
severity = pd.read_csv('./Data/Symptom-severity.csv')

####  Load Exam Mapping

In [4]:
mapping = pd.read_csv('./Data/map.csv', sep = ';')

# Pre-processing

### One-hot encoding of Symptoms + Mapping the severity of the symptoms

In [5]:
symptomsCols = pd.unique(data.drop('Disease',1).values.flatten())
onlySymptomsCols=[]
for elem in (symptomsCols[symptomsCols==symptomsCols]):
    onlySymptomsCols.append(elem.replace(' ',''))

weightsFromSymptom = dict()

indexing = severity.to_dict()['Symptom']
weights = severity.to_dict()['Weight']
for elem in indexing:
    weightsFromSymptom[indexing[elem].replace(' ','')]=weights[elem]

for elem in onlySymptomsCols:
    if(elem.replace(' ', '') not in indexing.values()):
        weightsFromSymptom[elem.replace(' ','')]=1

symptomsCols = np.insert(onlySymptomsCols, 0, 'Disease')
symptomsCols

dataf = []
for elem in data.values:
    line=[]
    line.append(elem[0])
    elemlist=[]
    oldline = elem.tolist()
    for e in (oldline):
        if(e==e):
            elemlist.append(e.replace(' ', ''))
    del elemlist[0]
    for symptom in onlySymptomsCols:
        if(elemlist.count(symptom)>0):
            weight = weightsFromSymptom[symptom.replace(' ','')]
            line.append(weight)
        else:
            line.append(0)
    dataf.append(line)

processedDataf = pd.DataFrame(data=dataf, columns=symptomsCols)
processedDataf.to_csv("./Data/processed.csv", index=False )
processedDataf

Unnamed: 0,Disease,itching,skinrash,nodalskineruptions,dischromicpatches,continuoussneezing,shivering,chills,wateringfromeyes,stomachpain,...,bladderdiscomfort,foulsmellofurine,continuousfeelofurine,skinpeeling,silverlikedusting,smalldentsinnails,inflammatorynails,blister,redsorearoundnose,yellowcrustooze
0,Fungal infection,1,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Fungal infection,0,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Fungal infection,1,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Fungal infection,1,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Fungal infection,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4915,(vertigo) Paroymsal Positional Vertigo,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4916,Acne,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4917,Urinary tract infection,0,0,0,0,0,0,0,0,0,...,1,1,1,0,0,0,0,0,0,0
4918,Psoriasis,0,1,0,0,0,0,0,0,0,...,0,0,0,1,1,1,1,0,0,0


### One-hot encoding of exams and diseases

In [6]:
mapeamentoCols = pd.unique(mapping.drop('Disease',1).values.flatten())

onlyTestCols=[]
onlyTestCols = mapeamentoCols[mapeamentoCols==mapeamentoCols]
dfCols = np.insert(onlyTestCols, 0, 'Disease')

dfValues = []
for elem in mapping.values:
    line=[]
    line.append(elem[0])
    elemlist= elem.tolist()
    del elemlist[0]
    for test in onlyTestCols:
        if(elemlist.count(test)>0):
            line.append(1)
        else:
            line.append(0)
    dfValues.append(line)

testsDf = pd.DataFrame(data=dfValues, columns=dfCols)
testsDf.to_csv("./Data/mapProcessed.csv", index=False)

testsDf

Unnamed: 0,Disease,Blood Tests,Skin biopsy,Patch test,Complete blood count (CBC),Polymerase chain reaction (PCR),Skin Prick Test (SPT),Intradermal Skin Test,Physical Examination,TSH test,...,Pleural fluid culture,Pulse oximetry,Anti-cyclic citrullinated peptide (anti-CCP),Erythrocyte sedimentation rate (ESR),C-reactive protein (CRP),Antinuclear antibody (ANA),HLA-B27,Stool culture,Antigen Tests,Mantoux tuberculin skin test
0,Drug Reaction,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Malaria,1,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Allergy,1,0,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
3,Hypothyroidism,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,Psoriasis,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
5,GERD,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,Chronic cholestasis,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,hepatitis A,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
8,Osteoarthristis,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
9,(vertigo) Paroymsal Positional Vertigo,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Dataframe with all Symptoms + Exams associated with a Disease

In [7]:
finalDfCols= np.concatenate([onlyTestCols,onlySymptomsCols])

dfValues = []
for elem in processedDataf.values:
    for testLine in testsDf.values:
        if(testLine[0]==elem[0]):
            line= np.concatenate([testLine[1:], elem[1:]])
            dfValues.append(line)

finalDf = pd.DataFrame(data=dfValues, columns=finalDfCols)
finalDf.to_csv("./Data/final.csv", index=False)
finalDf

Unnamed: 0,Blood Tests,Skin biopsy,Patch test,Complete blood count (CBC),Polymerase chain reaction (PCR),Skin Prick Test (SPT),Intradermal Skin Test,Physical Examination,TSH test,T4 test,...,bladderdiscomfort,foulsmellofurine,continuousfeelofurine,skinpeeling,silverlikedusting,smalldentsinnails,inflammatorynails,blister,redsorearoundnose,yellowcrustooze
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4675,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4676,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4677,1,0,0,0,0,0,0,0,0,0,...,1,1,1,0,0,0,0,0,0,0
4678,0,1,0,0,0,0,0,1,0,0,...,0,0,0,1,1,1,1,0,0,0


In [8]:
print(onlyTestCols)
print(len(onlyTestCols))
print(len(onlySymptomsCols))

['Blood Tests' 'Skin biopsy' 'Patch test' 'Complete blood count (CBC)'
 'Polymerase chain reaction (PCR)' 'Skin Prick Test (SPT)'
 'Intradermal Skin Test' 'Physical Examination' 'TSH test' 'T4 test'
 'Thyroid scan' 'Thyroid ultrasound' 'Esophagram' 'Esophageal manometry'
 'pH monitoring' 'Endoscopy' 'Biopsy of upper disgestive system'
 'X-ray of upper digestive system' 'Serum bilirubin test'
 'Serum albumin test' 'Serum alkaline phosphatase test'
 'Serum aminotransferases (transaminases)' 'Prothrombin time (PTT) test'
 'Alanine transaminase (ALT) test' 'Liver Ultrasound' 'Liver Biopsy'
 'MRI Scan' 'CT Scan' 'X-ray' 'Electronystagmography (ENG)'
 'Videonystagmography (VNG)' 'Fasting plasma glucose (FPG) test'
 'Hemoglobin A1C test' 'Random plasma glucose (RPG) test' 'Blood Test'
 'Electrocardiogram (ECG)' 'Echocardiogram' 'Ambulatory monitoring'
 'Urine analysis' 'Upper gastrointestinal endoscopy'
 'Upper gastrointestinal biopsy' 'CT scan' 'Helicobacter pylori tests'
 'Anoscopy' 'Rigid 

In [9]:
print('Number of Symptoms:', len(severity['Symptom']))

for elem in (severity['Symptom'].tolist()) :
        if(elem  not in onlySymptomsCols  ):
            print(elem)

Number of Symptoms: 131
skin_rash
nodal_skin_eruptions
continuous_sneezing
joint_pain
stomach_pain
ulcers_on_tongue
muscle_wasting
burning_micturition
spotting_urination
weight_gain
cold_hands_and_feets
mood_swings
weight_loss
patches_in_throat
irregular_sugar_level
high_fever
sunken_eyes
yellowish_skin
dark_urine
loss_of_appetite
pain_behind_the_eyes
back_pain
abdominal_pain
mild_fever
yellow_urine
yellowing_of_eyes
acute_liver_failure
fluid_overload
swelling_of_stomach
swelled_lymph_nodes
blurred_and_distorted_vision
throat_irritation
redness_of_eyes
sinus_pressure
runny_nose
chest_pain
weakness_in_limbs
fast_heart_rate
pain_during_bowel_movements
pain_in_anal_region
bloody_stool
irritation_in_anus
neck_pain
swollen_legs
swollen_blood_vessels
puffy_face_and_eyes
enlarged_thyroid
brittle_nails
swollen_extremeties
excessive_hunger
extra_marital_contacts
drying_and_tingling_lips
slurred_speech
knee_pain
hip_joint_pain
muscle_weakness
stiff_neck
swelling_joints
movement_stiffness
spinnin

### Split the data

In [10]:
symptomCount = len(onlySymptomsCols)
testCount = len(onlyTestCols)

#print(testCount)
#print(symptomCount)

[train,test] = train_test_split(finalDf,random_state=420)
trainA = np.array(train)
trainAX = trainA[:,testCount:]
trainAY = trainA[:,:testCount]

trainX=train.iloc[:,testCount:]
trainY=train.iloc[:,:testCount]
print(trainY[0:5])

testX=test.iloc[:,testCount:]
testY=test.iloc[:,:testCount]


      Blood Tests  Skin biopsy  Patch test  Complete blood count (CBC)  \
1870            1            0           0                           0   
373             0            1           0                           0   
781             1            0           0                           0   
825             1            1           1                           0   
1667            0            0           0                           0   

      Polymerase chain reaction (PCR)  Skin Prick Test (SPT)  \
1870                                0                      0   
373                                 0                      0   
781                                 0                      0   
825                                 0                      0   
1667                                0                      0   

      Intradermal Skin Test  Physical Examination  TSH test  T4 test  ...  \
1870                      0                     0         0        0  ...   
373             

In [11]:
model = Sequential()
baseNoNeurs=2048+1024
model.add(Dense(baseNoNeurs, input_shape=(symptomCount,)))
model.add(Activation('relu'))                            

model.add(Dense(baseNoNeurs))
model.add(Activation('relu'))

model.add(Dense(baseNoNeurs*4))
model.add(Dropout(0.2))
model.add(Activation('relu'))

model.add(Dense(baseNoNeurs*2))
model.add(Dropout(0.2))
model.add(Activation('relu'))

#Inner
model.add(Dense(baseNoNeurs*1/2))
model.add(Dropout(0.2))
model.add(Activation('relu'))




model.add(Dense(baseNoNeurs*2))
model.add(Activation('sigmoid'))


model.add(Dense(baseNoNeurs))
model.add(Activation('relu'))


model.add(Dense(testCount))
model.add(Activation('softmax'))
# Compile the model.
model.compile(
  optimizer=tf.keras.optimizers.Adagrad(),
  loss=tf.keras.losses.CategoricalCrossentropy(),
  metrics=['accuracy',tf.metrics.CategoricalAccuracy()],
)
print(model.summary())



Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 3072)              405504    
_________________________________________________________________
activation (Activation)      (None, 3072)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 3072)              9440256   
_________________________________________________________________
activation_1 (Activation)    (None, 3072)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 12288)             37761024  
_________________________________________________________________
dropout (Dropout)            (None, 12288)             0         
_________________________________________________________________
activation_2 (Activation)    (None, 12288)             0

In [12]:
modelPath='./models/'+'M_80epHarderLossMin'
checkPointer=ModelCheckpoint(modelPath,monitor='val_accuracy', verbose=2, save_best_only=True,
    save_weights_only=True)

earlyStopper=EarlyStopping(
    monitor='val_loss', min_delta=10, patience=5, restore_best_weights=True, mode='min',verbose=2
)
tbCallBackC = TensorBoard(log_dir=modelPath+'/log', histogram_freq=0, write_graph=True, write_images=True)

In [13]:
##TRAIN
'''model.save(modelPath+"Untrained")
history = model.fit(
              trainX,
              trainY,
              epochs=82,
              batch_size=3,
              verbose=1, validation_data=(testX,testY),
    callbacks=[checkPointer, earlyStopper, tbCallBackC])

model.save(modelPath)'''

'model.save(modelPath+"Untrained")\nhistory = model.fit(\n              trainX,\n              trainY,\n              epochs=82,\n              batch_size=3,\n              verbose=1, validation_data=(testX,testY),\n    callbacks=[checkPointer, earlyStopper, tbCallBackC])\n\nmodel.save(modelPath)'

In [14]:
#model.load_weights(modelPath)

In [15]:
def showHistory(history):
    fig = plt.figure()
    plt.subplot(2,1,1)
    plt.plot(history.history['accuracy'])
    plt.plot(history.history['val_accuracy'])
    plt.title('model accuracy')
    plt.ylabel('accuracy')
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='lower right')

    plt.subplot(2,1,2)
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper right')

    plt.tight_layout()

In [16]:
print(tf.version)
print(tf.executing_eagerly())

<module 'tensorflow._api.v2.version' from '/opt/anaconda3/lib/python3.8/site-packages/tensorflow/_api/v2/version/__init__.py'>
True


In [17]:
def actualAccuracies(model, predictions):
    thresh =(predictions.max()+ predictions.min())/2
    predictions[predictions>thresh]=1
    predictions[predictions<=thresh]=0
    
    correctByLine=[]
    for i in range(len(testY)):
        matches = predicted_classes[i]==testY.to_numpy()[i]
        correctByLine.append(len(matches[matches==True])/len(matches))
    return([len(np.nonzero([x for x in correctByLine if x==1])[0]),
            len(np.nonzero([x for x in correctByLine if x>0.99])[0]),
            len(np.nonzero([x for x in correctByLine if x>0.95])[0]),
            len(np.nonzero([x for x in correctByLine if x>0.90])[0]),
            len(np.nonzero([x for x in correctByLine if x!=1])[0])
           ])

def uniquePredictions(predictions, yTest):
    rawPreds=[]
    for line in predictions:
        rawPreds.append(str(line))
    tPreds=predictions
    threshArray(tPreds)
    threshPreds=[]
    for line in tPreds:
        threshPreds.append(int("".join(str(int(x)) for x in line), 2))
    expected=[]
    for line in yTest:
        expected.append(int("".join(str(int(x)) for x in line), 2))
    return([pd.unique(rawPreds),pd.unique(threshPreds),pd.unique(expected)])
    

In [20]:
def threshArray(array):
    for line in array:
        tresh=sum(line)/len(line)
        print(tresh)
        line[line>=tresh]=1
        line[line<tresh]=0

# Load trained model and evaluate with test dataset

loss_and_metrics = model.evaluate(testX, testY, verbose=2)

print("Test Loss", loss_and_metrics[0])
print("Test Accuracy", loss_and_metrics[1])

# Do a more deep evaluation
print(testX.shape)

# load the model and create predictions on the test set
predicted_classes = model.predict(testX.to_numpy()) #mnist_model.predict_class provides the int decoded value
nontreshPreds = model.predict(testX.to_numpy())

threshArray(predicted_classes)

preds = model.predict(testX.to_numpy())
accs = actualAccuracies(model,model.predict(testX.to_numpy()))
print(preds)
uniqs = uniquePredictions(model.predict(testX.to_numpy()), testY.to_numpy())
                      

rindex=np.random.randint(len(predicted_classes),size=(1,1))[0,0]
print(rindex)

print("Should be:"+str(testY.to_numpy()[rindex]))
print("Predicted:"+str(nontreshPreds[rindex]))

print("Other results:"+str(list(nontreshPreds[15:24])))
# see which we predicted correctly and which not
correctByLine=[]
for i in range(len(testY)):
    matches = predicted_classes[i]==testY.to_numpy()[i]
    correctByLine.append(len(matches[matches==True])/len(matches))
print("Dif:"+str(predicted_classes[rindex]==testY.to_numpy()[rindex]))
                      

print(accs[3]," classified 90% correctly")
print(accs[2]," classified 95% correctly")
print(accs[1]," classified 99% correctly")
print(accs[0]," classified 100% correctly")
print(accs[4]," classified incorrectly (Not 100%)")

print("Unique RAW predictions: ",uniqs[0], "\nUnique Thresholded predictions: ", uniqs[1],"\nUnique expected: ", uniqs[2])


37/37 - 2s - loss: 20.9777 - accuracy: 0.0000e+00 - categorical_accuracy: 0.0000e+00
Test Loss 20.977746963500977
Test Accuracy 0.0
(1170, 131)
0.009803921691890732
0.009803921247914653
0.009803920891820727
0.009803921973798424
0.009803921447646823
0.009803921332372827
0.009803921072150343
0.00980392188591627
0.009803921390580488
0.009803921630259092
0.009803921836839221
0.00980392160971521
0.009803921650802972
0.009803921545800917
0.009803921350634055
0.009803922049125986
0.009803921791186155
0.009803922041136698
0.00980392130726364
0.009803921251338632
0.009803921998907611
0.009803921586888678
0.009803921671346853
0.009803921029921257
0.009803921413407022
0.00980392147960397
0.009803920948887062
0.009803920624750283
0.009803920922636548
0.009803921390580488
0.009803921255903939
0.009803921589171332
0.009803920961441654
0.009803920997964108
0.009803920961441654
0.009803921671346853
0.00980392153438765
0.0098039208906794
0.009803921589171332
0.009803921429385595
0.009803921255903939
0.

0.00980392110068351
0.0098039208735595
0.009803921589171332
0.009803921518409076
0.00980392153438765
0.009803922041136698
0.009803921502430504
0.009803921373460587
0.009803921552648878
0.009803920891820727
0.0098039217717836
0.00980392110068351
0.00980392230934847
0.00980392214728008
0.009803921680477466
0.00980392073203499
0.009803922068528538
0.009803921671346853
0.009803921126934024
0.009803920948887062
0.009803921671346853
0.00980392144878815
0.00980392188591627
0.009803920730893664
0.009803920688664577
0.009803921788903502
0.009803921545800917
0.009803921350634055
0.009803921864231062
0.009803921072150343
0.009803921254762612
0.009803920922636548
0.009803920891820727
0.009803921791186155
0.009803921502430504
0.009803921794610135
0.009803920967148287
0.009803922068528538
0.009803921691890732
0.009803921417972329
0.009803921251338632
0.009803921417972329
0.009803922041136698
0.009803921255903939
0.009803921543518264
0.009803921201120258
0.009803921254762612
0.009803921559496838
0.00

1045
Should be:[1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
Predicted:[0.0033469  0.01360871 0.00322164 0.00623428 0.01231482 0.0047171
 0.00497267 0.01419001 0.00743385 0.00704874 0.00924292 0.01796159
 0.00353292 0.01929647 0.00483012 0.01568426 0.00304215 0.00412158
 0.00442042 0.00682194 0.00514119 0.01894156 0.01014207 0.01335769
 0.01545819 0.00434628 0.00418561 0.01283895 0.0118505  0.01559851
 0.02569182 0.0088515  0.00939123 0.00772923 0.00428319 0.00760485
 0.02895869 0.01016578 0.00972602 0.00603897 0.00918084 0.01708866
 0.0138523  0.00245735 0.01136032 0.00243909 0.01015085 0.0056423
 0.00975166 0.00476508 0.00275425 0.01647249 0.01152375 0.01596796
 0.01294914 0.0087548  0.00281968 0.00504926 0.00403273 0.00405344
 0.01144228 0.00147227 0.00760623 0.00563954 0.00344388 0.0073975
 0.00740019 0.00832978 0.02868843

In [18]:
results = predicted_classes==testY.to_numpy()
#print(results[results==True])
testsNeeded = [];
for line in results:
    #print(line)
    #print(onlyTestCols.shape)
    #print(onlyTestCols[line])
    testsNeeded.append(onlyTestCols[line])
#testsNeeded= np.array(testsNeeded)
print(testsNeeded[0])

NameError: name 'predicted_classes' is not defined