In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from glob import glob
from sklearn.model_selection import train_test_split,KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

In [8]:
# Data locations. Both can be overridden through environment variables so the
# notebook also runs on machines other than the one it was written on; the
# fallbacks are the original hard-coded values.
csvpath   = os.environ.get('NIH_CSV_PATH', 'Data_Entry_2017.csv')
base_dir = os.environ.get('NIH_IMAGE_DIR', r'D:\Downloads\NIH\images-224\3channel')

In [9]:
# Load the metadata table
df = pd.read_csv(csvpath)
# Index every PNG found one directory level below base_dir by its file name
all_image_paths = dict(
    (os.path.basename(png_path), png_path)
    for png_path in glob(os.path.join(base_dir, '*', '*.png'))
)

In [10]:
# data
# Attach the on-disk location of every image. Rows whose file was not found
# end up holding the literal string 'None' because of the str cast.
df['path'] = df['Image Index'].map(all_image_paths.get)
df['path'] = df['path'].astype('str')

# Keep only the leading digits of the age so that both the legacy '058Y'
# format and already-numeric ages parse; this also makes the cell safe to
# re-run (the previous int(x[:-1]) raised once the column held integers).
df['Patient Age'] = (df['Patient Age'].astype(str)
                                      .str.extract(r'(\d+)', expand=False)
                                      .astype(int))
# Indicator columns: 1 for female patients, 1 for the 'AP' view position
df['Gender'] = pd.get_dummies(df['Patient Gender'])['F']
df['View']   = pd.get_dummies(df['View Position'])['AP']

In [11]:
# labels

# Keep only the first finding when several are joined with '|'
df['Finding Labels'] = df['Finding Labels'].apply(lambda raw: raw.split('|')[0])
# One indicator column per distinct, non-empty label
labels = [name for name in np.unique(df['Finding Labels']) if len(name) > 0]
print('Labels ({}.'.format(labels))
for c_label in labels:
    if len(c_label) <= 1:  # labels of at most one character get no column
        continue
    df[c_label] = df['Finding Labels'].map(
        lambda finding: 1.0 if c_label in finding else 0)

Labels (['Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Effusion', 'Emphysema', 'Fibrosis', 'Hernia', 'Infiltration', 'Mass', 'No Finding', 'Nodule', 'Pleural_Thickening', 'Pneumonia', 'Pneumothorax'].


In [12]:
# Plain Python lists are used from here on purely out of preference;
# working directly on the dataframe columns would be equivalent.

imagespath = df['path'].tolist()
labels = df['Finding Labels'].tolist()

In [13]:
# Three metadata attributes are used (age in addition to gender and view),
# since gender and view alone were judged unlikely to add anything useful.
input2 = df[['Patient Age', 'Gender', 'View']].values.tolist()

In [14]:
# Min-max scale the metadata columns before training.
# NOTE(review): the scaler is fitted on every row, i.e. before the train/test
# split performed later in the notebook - confirm this is acceptable.
scaler = MinMaxScaler().fit(input2)
input2scaled = scaler.transform(input2)

In [15]:
# Start with a simple binary target: 0 for 'No Finding', 1 for any disease.
# Once the model is stable this can be extended to a multi-class problem.
Y = tf.keras.utils.to_categorical([int(x != 'No Finding') for x in labels])
# The class balance was checked once on the integer list and found balanced:
#print(Y.count(1), Y.count(0))

In [16]:
# Collapse every finding other than 'No Finding' into a single 'Disease' class
labels = ['No Finding' if x == 'No Finding' else 'Disease' for x in labels]

In [17]:
# Gather image paths, scaled metadata, one-hot targets and the binary label
# names in a single dataframe (kept as a dataframe by preference).
data_frame = pd.DataFrame({
    'path': df['path'],
    'Age': input2scaled[:, 0],
    'Gender': input2scaled[:, 1],
    'View': input2scaled[:, 2],
    'Y0': Y[:, 0],
    'Y1': Y[:, 1],
    'Finding Labels': labels,
})

In [18]:
seed = 7
# Seed every random source the notebook relies on: NumPy and TensorFlow
# (the latter drives weight initialisation of the trainable layers).
np.random.seed(seed)
tf.random.set_seed(seed)

In [19]:
# Hold out 30% of the rows for testing; one split serves both model inputs
# because images and metadata live in the same dataframe.
train_df, test_df = train_test_split(
    data_frame,
    random_state=seed,
    test_size=0.3,
)

In [20]:
# Total number of metadata rows (one per image)
image_count = len(df)
image_count

112120

In [21]:
BATCH_SIZE = 64
IMG_SIZE = (224, 224)
# Step counts must be integers; np.ceil on its own returns a float.
STEPS_PER_EPOCH = int(np.ceil(image_count/BATCH_SIZE))

In [22]:
def get_kfold(dataframe = train_df, BATCH_SIZE = BATCH_SIZE):
    """Return an iterator of shuffled folds whose held-out part is one batch.

    KFold is used as a batcher: the rows are cut into as many folds as there
    are batches, so every held-out index array holds at most BATCH_SIZE rows.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose rows are to be batched.
    BATCH_SIZE : int
        Upper bound on the number of rows per batch.

    Returns
    -------
    generator
        Yields (remaining_positions, batch_positions) pairs of positional
        index arrays, as produced by ``KFold.split``.
    """
    n_rows = dataframe.shape[0]
    # Ceiling division. The previous `rows // BATCH_SIZE + 1` produced one
    # fold too many (and therefore undersized batches) whenever the row count
    # was an exact multiple of BATCH_SIZE.
    n_batches = -(-n_rows // BATCH_SIZE)
    # KFold rejects fewer than two splits.
    kfold = KFold(n_splits=max(2, n_batches), shuffle=True, random_state= 42)
    return kfold.split(X = dataframe)

In [23]:
# Fold iterator over the training rows (get_kfold defaults to train_df)
k_split = get_kfold()

In [24]:
def get_next_batch(dataframe, kfold):
    """Advance `kfold` by one fold and return that fold's held-out rows.

    `kfold` is an iterator of (train_positions, test_positions) pairs; the
    second element of the next pair is used to select rows of `dataframe`
    by position.
    """
    held_out_positions = next(kfold)[1]
    return dataframe.iloc[held_out_positions]

In [25]:
# Rows of the first training fold, used as a single training batch
train_batch = get_next_batch(train_df, k_split)

In [26]:
# Image generator shared by the training and test batches. Every rescaling
# and augmentation option below is currently disabled, so images are passed
# on with their raw pixel values.
# NOTE(review): the ImageNet-pretrained ResNet50 used later presumably expects
# its own preprocess_input to be applied - confirm whether one is needed here.
core_idg = tf.keras.preprocessing.image.ImageDataGenerator(#rescale=1./255,
                                                           #samplewise_center=True, 
                                                           #samplewise_std_normalization=True, 
                                                           #horizontal_flip = True, 
                                                           #vertical_flip = False, 
                                                           #height_shift_range= 0.05, 
                                                           #width_shift_range=0.1, 
                                                           #rotation_range=5, 
                                                           #shear_range = 0.1,
                                                           #fill_mode = 'reflect',
                                                           #zoom_range=0.15,
                                                          )

In [27]:
def prepare_for_training(dataframe, x_col = 'path', mtd_cols = ['Age', 'Gender', 'View'], BATCH_SIZE = BATCH_SIZE):
    """Build the image generator, metadata tensor and one-hot targets for a batch.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Rows of the batch. Must contain `x_col`, 'Finding Labels', every
        column named in `mtd_cols`, and the target columns 'Y0' and 'Y1'.
    x_col : str
        Name of the column holding the image file paths.
    mtd_cols : list of str
        Metadata columns fed to the second model input.
    BATCH_SIZE : int
        Batch size handed to the image generator.

    Returns
    -------
    tuple
        ``(generator, metadata, targets)`` where `metadata` has shape
        ``(rows, len(mtd_cols), 1)`` and `targets` is the ``(rows, 2)``
        array taken from the 'Y0'/'Y1' columns.
    """
    train_gen = core_idg.flow_from_dataframe(dataframe,
                                         directory=None,
                                         # honour the caller's column choice
                                         # (was hard-coded to 'path')
                                         x_col = x_col,
                                         y_col = 'Finding Labels',
                                         class_mode = 'binary',
                                         # Fixed order so the generator's class
                                         # indices agree with Y0/Y1 (index 0 is
                                         # 'No Finding'), instead of depending
                                         # on a module-level list.
                                         classes = ['No Finding', 'Disease'],
                                         target_size = IMG_SIZE,
                                         #color_mode = 'grayscale',
                                         # Keep dataframe order: the metadata
                                         # and targets returned below are in
                                         # that order, so a shuffled generator
                                         # would pair images with the wrong rows.
                                         shuffle = False,
                                         batch_size = BATCH_SIZE)
    # NOTE(review): the generator may drop rows whose file path is invalid,
    # which would desynchronise it from the arrays below - confirm all paths
    # in `dataframe` exist.
    mtd = dataframe[mtd_cols].values
    Y   = dataframe[['Y0', 'Y1']].values
    # Infer the row count so that folds shorter than BATCH_SIZE still work,
    # and take the feature count from mtd_cols instead of a literal 3.
    return train_gen, mtd.reshape(-1, len(mtd_cols), 1), Y

In [28]:
# Image generator, metadata tensor and one-hot targets for the training batch
train_gen, mtd_train, Y_train = prepare_for_training(train_batch)

Found 64 validated image filenames belonging to 2 classes.


In [29]:
# Materialise the first batch of training images (and the generator's labels)
trainx, trainy = next(train_gen)
print('data generation done!')

data generation done!


In [30]:
# Fold iterator over the held-out test rows
k_split_test = get_kfold(test_df)

In [31]:
# Rows of the first test fold, used as a single validation batch
test_batch = get_next_batch(test_df, k_split_test)

In [32]:
# Image generator, metadata tensor and one-hot targets for the validation batch
test_gen, mtd_test, Y_test = prepare_for_training(test_batch)

Found 64 validated image filenames belonging to 2 classes.


In [33]:
# Draw the validation images from the test generator. Reading from train_gen
# here (as before) paired training images with test metadata and targets.
testx, testy = next(test_gen)
print('data generation done!')

data generation done!


In [34]:
# ImageNet-pretrained ResNet50 without its classification head,
# used as the convolutional feature extractor for 224x224 RGB input.
base_model = tf.keras.applications.ResNet50(
    include_top=False,
    weights='imagenet',
    input_shape=(224, 224, 3),
)

In [35]:
# Freeze the pretrained backbone so its weights are not updated during training
base_model.trainable = False

In [36]:
# Image branch: frozen backbone -> global average pooling -> 512-unit projection
image_processor = tf.keras.Sequential([
    base_model,
    tf.keras.layers.GlobalAveragePooling2D(),
    tf.keras.layers.Dense(512),
])

# Metadata branch: each of the 3 scalar attributes is expanded to 10 units,
# then the (3, 10) result is flattened
mtd_processor = tf.keras.Sequential([
    tf.keras.layers.InputLayer(input_shape=(3,1)),
    tf.keras.layers.Dense(10),
    tf.keras.layers.Flatten(),
])

# Short aliases for the two branches
cnn = image_processor
nn = mtd_processor

# Fuse both branches and classify into the two one-hot classes
combined_input = tf.keras.layers.concatenate([cnn.output, nn.output])
out = tf.keras.layers.Dense(64)(combined_input)
out = tf.keras.layers.Dense(2, activation='softmax')(out)
model = tf.keras.models.Model(inputs=[cnn.input, nn.input], outputs=out)

In [37]:
# Compile the model using the chosen loss function.
# We can start by minimizing the distance between predicted and true labels,
# then add a new loss that minimizes the correlation between the two inputs.

In [38]:
# Categorical cross-entropy matches the two-column one-hot targets
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [39]:
# The inputs are in-memory arrays, so batching is controlled with
# `batch_size`; `steps_per_epoch` is intended for generator/dataset inputs
# and is therefore no longer passed.
# NOTE(review): the recorded "Blas GEMM launch failed" error presumably stems
# from GPU/cuBLAS initialisation (e.g. GPU memory held by another process)
# rather than from tensor shapes - confirm on the target machine.
model.fit(
    x=[trainx, mtd_train], y = Y_train,
    validation_data=([testx, mtd_test], Y_test),
    batch_size = BATCH_SIZE,
    epochs = 100)

Train on 64 samples, validate on 64 samples
Epoch 1/100
 1/64 [..............................] - ETA: 4:51

InternalError:  Blas GEMM launch failed : a.shape=(3, 1), b.shape=(1, 10), m=3, n=10, k=1
	 [[node model/dense_1/Tensordot/MatMul (defined at <ipython-input-39-f3c685d15615>:5) ]] [Op:__inference_distributed_function_21587]

Function call stack:
distributed_function


In [41]:
# Sanity check on the target array: one row per sample, one column per class
Y_train.shape

(64, 2)