In [1]:
import tensorflow as tf
import keras

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from glob import glob

from sklearn.model_selection import train_test_split,KFold

from keras_preprocessing.image import ImageDataGenerator

Using TensorFlow backend.


In [2]:
# Dataset locations. The defaults are the original local paths; set the
# NIH_CSV_PATH / NIH_IMAGE_DIR environment variables to run the notebook on
# another machine without editing this cell.
csvpath  = os.environ.get('NIH_CSV_PATH', r'D:\Downloads\NIH\Data_Entry_2017.csv')
base_dir = os.environ.get('NIH_IMAGE_DIR', r'D:\Downloads\NIH\images-224\3channel')

In [3]:
# Load the metadata table from the CSV file configured in csvpath.
df = pd.read_csv(filepath_or_buffer=csvpath)

In [4]:
# Index every PNG found one directory level below base_dir by its file name,
# so a file name can be looked up to get its full path.
all_image_paths = dict(
    (os.path.basename(png_path), png_path)
    for png_path in glob(os.path.join(base_dir, '*', '*.png'))
)

In [5]:
# Attach the on-disk location of each image; file names that are missing from
# the lookup map to None.
df['path'] = df['Image Index'].map(lambda image_name: all_image_paths.get(image_name))

# Drop the last character of each age value and parse the remainder as an int
# (assumes values look like '058Y' -- TODO confirm against the CSV).
df['Patient Age'] = df['Patient Age'].map(lambda age_text: int(age_text[:-1]))

In [6]:
# Keep only the first diagnosis of each '|'-separated label string
# (a string without '|' is returned unchanged by split()[0]).
df['Finding Labels'] = df['Finding Labels'].map(lambda findings: findings.split('|')[0])

In [7]:
# Collect the distinct, non-empty diagnosis names (np.unique returns them sorted).
labels = [name for name in np.unique(df['Finding Labels']) if len(name) != 0]
print('Labels ({}.'.format(labels))

Labels (['Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Effusion', 'Emphysema', 'Fibrosis', 'Hernia', 'Infiltration', 'Mass', 'No Finding', 'Nodule', 'Pleural_Thickening', 'Pneumonia', 'Pneumothorax'].


In [8]:
# One indicator column per diagnosis: 1.0 where the label name occurs in the
# row's 'Finding Labels' string, 0 otherwise.
for c_label in labels:
    if len(c_label) <= 1:  # skip empty / single-character labels
        continue
    df[c_label] = df['Finding Labels'].map(
        lambda row_findings: 1.0 if c_label in row_findings else 0)

In [9]:
# Binary encodings of the categorical metadata: the 'F' indicator column of
# 'Patient Gender' and the 'AP' indicator column of 'View Position'.
df['Gender'] = pd.get_dummies(df['Patient Gender']).loc[:, 'F']
df['View']   = pd.get_dummies(df['View Position']).loc[:, 'AP']

In [10]:
# Store the image path and the label as plain strings.
df['path'] = df['path'].astype(str)
df['Finding Labels'] = df['Finding Labels'].astype(str)

In [11]:
# The original cell evaluated an undefined name (`s`), and `image_count`, which
# a later cell uses to derive STEPS_PER_EPOCH, was never defined anywhere.
# NOTE(review): presumably the count is the number of metadata rows, which
# would match the output recorded for this cell -- confirm.
image_count = len(df)
image_count

112120

In [53]:
BATCH_SIZE = 64        # rows per training batch
IMG_SIZE = (224, 224)  # image size passed to the generator as target_size
# Number of batches needed to cover image_count rows, rounded up.
# np.ceil returns a float, so cast explicitly: a step count must be an integer.
STEPS_PER_EPOCH = int(np.ceil(image_count / BATCH_SIZE))

In [54]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
train, test = train_test_split(
    df.loc[:, ['path', 'Finding Labels', 'Gender', 'View']],
    test_size=0.3,
    random_state=42,
)

In [55]:
def get_kfold(dataframe = None, BATCH_SIZE = 32, x_cols = ['path', 'Gender', 'View'], y_cols = ['path']):
    """Return a shuffled K-fold split generator with roughly batch-sized held-out folds.

    The number of folds is ``len(dataframe) // BATCH_SIZE + 1``, so each
    held-out fold contains at most ``BATCH_SIZE`` rows.

    Parameters
    ----------
    dataframe : DataFrame, optional
        Frame to split. When omitted, the module-level ``train`` frame is
        looked up at call time (previously it was captured once, when the
        function was defined, so re-running the split cell had no effect).
    BATCH_SIZE : int
        Target size of each held-out fold.
    x_cols : list of str
        Columns passed to ``KFold.split`` as ``X``. These were previously
        ignored in favour of a hard-coded list equal to the default.
    y_cols : list of str
        Columns passed to ``KFold.split`` as ``y``. KFold does not stratify,
        so this does not influence which rows land in which fold.

    Returns
    -------
    generator
        Yields ``(train_indices, test_indices)`` pairs of positional indices.
    """
    if dataframe is None:
        dataframe = train
    n_splits = (dataframe.shape[0] // BATCH_SIZE) + 1
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    return kfold.split(X=dataframe[x_cols], y=dataframe[y_cols])

In [64]:
# Build the fold generator over the training rows, sized by the batch size.
k_split = get_kfold(dataframe=train, BATCH_SIZE=BATCH_SIZE)

In [65]:
def get_next_batch(dataframe, kfold):
    """Advance the split iterator and return the rows of its second index set.

    Parameters
    ----------
    dataframe : DataFrame
        Frame the positional indices refer to.
    kfold : iterator
        Iterator yielding index pairs; only element ``[1]`` of each item is used.

    Returns
    -------
    DataFrame
        ``dataframe`` rows selected positionally by that index set.

    Raises
    ------
    StopIteration
        When ``kfold`` has no more items.
    """
    held_out_positions = next(kfold)[1]
    return dataframe.iloc[held_out_positions]

In [72]:
# Take the next held-out fold of the training rows as the current batch.
train_batch = get_next_batch(dataframe=train, kfold=k_split)

In [73]:
# Image generator: per-sample normalisation plus light random augmentation.
core_idg = tf.keras.preprocessing.image.ImageDataGenerator(
    # normalise each image individually
    samplewise_center=True,
    samplewise_std_normalization=True,
    # random flips (horizontal only)
    horizontal_flip=True,
    vertical_flip=False,
    # small random geometric perturbations
    width_shift_range=0.1,
    height_shift_range=0.05,
    rotation_range=5,
    shear_range=0.1,
    zoom_range=0.15,
    # how pixels outside the transformed image are filled
    fill_mode='reflect',
)

In [106]:
def prepare_for_training(dataframe, x_col = 'path', mtd_cols = ['Gender', 'View'], BATCH_SIZE = BATCH_SIZE):
    """Build an image generator and the matching metadata array for one batch.

    The generator delivers all rows of ``dataframe`` as a single batch.

    Parameters
    ----------
    dataframe : DataFrame
        Rows to load; must contain ``x_col``, 'Finding Labels' and ``mtd_cols``.
    x_col : str
        Column holding the image file paths (previously ignored in favour of
        a hard-coded 'path').
    mtd_cols : list of str
        Metadata columns returned alongside the images.
    BATCH_SIZE : int
        Retained for backward compatibility only. The metadata shape now
        follows the number of rows in ``dataframe``, so frames whose row count
        differs from ``BATCH_SIZE`` no longer fail at the reshape.

    Returns
    -------
    tuple
        ``(generator, metadata)`` where ``metadata`` has shape
        ``(len(dataframe), len(mtd_cols), 1)``.
    """
    n_rows = dataframe.shape[0]
    train_gen = core_idg.flow_from_dataframe(dataframe,
                                             directory=None,
                                             x_col=x_col,
                                             y_col='Finding Labels',
                                             class_mode='categorical',
                                             classes=labels,
                                             target_size=IMG_SIZE,
                                             #color_mode = 'grayscale',
                                             batch_size=n_rows,
                                             # Keep the frame's row order: the generator shuffles by
                                             # default, which would misalign the images and labels
                                             # with the metadata rows returned below.
                                             shuffle=False)
    # NOTE(review): if the generator drops rows (e.g. unreadable file names) the
    # image batch will be shorter than the metadata -- verify counts upstream.
    mtd = dataframe[mtd_cols].values
    return train_gen, mtd.reshape(n_rows, len(mtd_cols), 1)
    

In [75]:
# Image generator and metadata array for the current batch of rows.
train_gen, mtd = prepare_for_training(dataframe=train_batch)

Found 64 validated image filenames belonging to 15 classes.


In [76]:
# Draw one batch from the generator and unpack it into two arrays
# (presumably images and one-hot labels, given class_mode='categorical' --
# confirm against the generator setup).
trainx, trainy = next(train_gen)
print('data generation done!')

data generation done!


In [63]:
#trainy = keras.utils.to_categorical(trainy).reshape(BATCH_SIZE,2,15)  # Use only if there are no Dense layers

In [23]:
from keras.layers import *
from keras.models import Sequential
from keras.applications.resnet50 import ResNet50

In [39]:
# ResNet50 backbone with ImageNet weights and without its top classification
# layers, configured for 224x224 three-channel inputs.
base_model = tf.keras.applications.ResNet50(
    include_top=False,
    weights='imagenet',
    input_shape=(224, 224, 3),
)

A local file was found, but it seems to be incomplete or outdated because the auto file hash does not match the original value of 4d473c1dd8becc155b73f8504c6f6626 so we will re-download the data.
Downloading data from https://github.com/keras-team/keras-applications/releases/download/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5


In [25]:
# Freeze the backbone so its weights are not updated during training.
# NOTE(review): this cell's execution count (25) is lower than that of the cell
# that builds base_model (39). On a top-to-bottom run the order is as written,
# but confirm the freeze was actually applied to the current base_model.
base_model.trainable = False

In [78]:
# First define the image model: ResNet50 backbone -> dropout -> global average
# pooling -> dropout -> 512-unit dense layer (no activation specified).
image_processor =  tf.keras.Sequential()
image_processor.add(base_model)
image_processor.add(tf.keras.layers.Dropout(0.2))
image_processor.add(tf.keras.layers.GlobalAveragePooling2D())
image_processor.add(tf.keras.layers.Dropout(0.5))
image_processor.add(tf.keras.layers.Dense(512))
#image_processor.add(keras.layers.Flatten())


# Now we create the metadata model: a (2, 1) input (two metadata values per
# sample), dropout, a 10-unit dense layer, then flattening to a vector.
mtd_processor = tf.keras.Sequential()
mtd_processor.add(tf.keras.layers.InputLayer(input_shape=(2,1)))
mtd_processor.add(tf.keras.layers.Dropout(0.5))
mtd_processor.add(tf.keras.layers.Dense(10))
mtd_processor.add(tf.keras.layers.Flatten())

# Join the image features and the metadata features into one vector per sample.
added = tf.keras.layers.concatenate([image_processor.output, mtd_processor.output])

#added = keras.layers.Add()([image_processor.output, mtd_processor.output])
# Classification head: 15 softmax outputs, one per diagnosis label.
out = tf.keras.layers.Dense(15, activation='softmax')(added)
# Two-input model: [image batch, metadata batch] -> class probabilities.
model = tf.keras.models.Model(inputs = [image_processor.input, mtd_processor.input], outputs = out)

In [79]:
# Multi-class setup: Adam optimiser, categorical cross-entropy loss, and
# accuracy as the reported metric.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [80]:
# Fit on the in-memory batch: image array and metadata array as the two
# inputs, trainy as the targets.
# NOTE(review): steps_per_epoch is combined with array inputs and no explicit
# batch_size -- confirm this yields the intended batching.
model.fit(
    x=[trainx, mtd],
    y=trainy,
    steps_per_epoch=64,
    epochs=100,
)

Train on 64 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
E

<tensorflow.python.keras.callbacks.History at 0x19f61fcbd08>