In [1]:
import os
import glob
import pandas as pd
from PIL import Image
import cv2
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from collections import Counter
from random import sample
import threading
from concurrent import futures

  from ._conv import register_converters as _register_converters


In [2]:
# Seed NumPy's global RNG first so later np.random draws repeat between runs.
np.random.seed(42)

# Dataset root; the three split directories are derived from it.
base_directory = '../chest-xray-pneumonia/chest_xray/chest_xray/'
train_dir, val_dir, test_dir = (
    os.path.join(base_directory, split_name)
    for split_name in ('train', 'val', 'test')
)

In [3]:
def prepare_dataset(data_dir):
    """Build a shuffled filename/label table for one dataset split.

    Collects every ``*.jpeg`` file under ``<data_dir>/NORMAL`` and
    ``<data_dir>/PNEUMONIA``, prints the two file counts, and returns the
    combined table in shuffled order.

    Parameters
    ----------
    data_dir : str
        Directory containing the ``NORMAL`` and ``PNEUMONIA`` sub-folders.

    Returns
    -------
    pandas.DataFrame
        Columns ``filename`` (path as returned by ``glob``) and ``label``
        (``'normal'`` or ``'pneumonia'``), with a fresh 0..n-1 index.
    """
    # glob returns matches in arbitrary (filesystem-dependent) order. Sorting
    # first makes the seeded shuffle below produce the same row order on
    # every machine.
    normal_dir = sorted(glob.glob(os.path.join(data_dir, 'NORMAL', '*.jpeg')))
    infected_dir = sorted(glob.glob(os.path.join(data_dir, 'PNEUMONIA', '*.jpeg')))
    print(len(normal_dir),len(infected_dir))

    df = pd.DataFrame({
        'filename': normal_dir + infected_dir,
        'label': ['normal'] * len(normal_dir) + ['pneumonia'] * len(infected_dir),
    })
    # Fixed random_state keeps the shuffle deterministic; reset_index drops
    # the pre-shuffle positions.
    return df.sample(frac=1, random_state=42).reset_index(drop=True)

In [4]:
training_data = prepare_dataset(train_dir)

# Ten distinct row positions for spot checks. They are drawn from NumPy's
# global RNG because np.random.seed(42) does not seed Python's `random`
# module, so the previous random.sample() call changed on every run.
random_index = np.random.choice(len(training_data), size=10, replace=False)
#training_data.iloc[random_index]

# Preview only the first rows instead of rendering the whole table.
training_data.head(10)

1341 3875


Unnamed: 0,filename,label
0,../chest-xray-pneumonia/chest_xray/chest_xray/...,pneumonia
1,../chest-xray-pneumonia/chest_xray/chest_xray/...,normal
2,../chest-xray-pneumonia/chest_xray/chest_xray/...,pneumonia
3,../chest-xray-pneumonia/chest_xray/chest_xray/...,pneumonia
4,../chest-xray-pneumonia/chest_xray/chest_xray/...,pneumonia
5,../chest-xray-pneumonia/chest_xray/chest_xray/...,pneumonia
6,../chest-xray-pneumonia/chest_xray/chest_xray/...,normal
7,../chest-xray-pneumonia/chest_xray/chest_xray/...,normal
8,../chest-xray-pneumonia/chest_xray/chest_xray/...,normal
9,../chest-xray-pneumonia/chest_xray/chest_xray/...,normal


In [5]:
# Filename/label table for the validation split, built the same way as the
# training table.
validation_dataset = prepare_dataset(data_dir=val_dir)
#validation_dataset.iloc[random_index]
validation_dataset

8 8


Unnamed: 0,filename,label
0,../chest-xray-pneumonia/chest_xray/chest_xray/...,normal
1,../chest-xray-pneumonia/chest_xray/chest_xray/...,normal
2,../chest-xray-pneumonia/chest_xray/chest_xray/...,normal
3,../chest-xray-pneumonia/chest_xray/chest_xray/...,pneumonia
4,../chest-xray-pneumonia/chest_xray/chest_xray/...,pneumonia
5,../chest-xray-pneumonia/chest_xray/chest_xray/...,pneumonia
6,../chest-xray-pneumonia/chest_xray/chest_xray/...,pneumonia
7,../chest-xray-pneumonia/chest_xray/chest_xray/...,pneumonia
8,../chest-xray-pneumonia/chest_xray/chest_xray/...,normal
9,../chest-xray-pneumonia/chest_xray/chest_xray/...,pneumonia


In [6]:
#Obtaining image statistics in parallel
def get_image_shape(index,image,total_images):
    """Return the ``shape`` of the image stored at path ``image``.

    Intended as a worker for ``executor.map``: ``index`` and ``total_images``
    are only used to print a progress line on every 1000th image and on the
    last one.

    Parameters
    ----------
    index : int
        Position of this image in the work list.
    image : str
        Path of the image file to read.
    total_images : int
        Length of the work list.

    Returns
    -------
    tuple
        ``shape`` of the array returned by ``cv2.imread``.

    Raises
    ------
    OSError
        If ``cv2.imread`` cannot read the file (it returns ``None`` then).
    """
    if index % 1000 == 0 or index == total_images - 1:
        print('{}: working on img num: {}'.format(threading.current_thread().name,index))
    pixels = cv2.imread(image)
    # cv2.imread signals failure by returning None rather than raising; fail
    # with the file name instead of an AttributeError on `.shape`.
    if pixels is None:
        raise OSError('cv2.imread could not read image: {}'.format(image))
    return pixels.shape

# One (index, path, total) work item per training image.
training_images_input = [(index,image,len(training_data)) for index,image in enumerate(training_data['filename'])]
print('Starting the computation of image shape')

# The context manager shuts the pool down once all shapes are collected, so
# worker threads do not linger in the kernel after this cell.
with futures.ThreadPoolExecutor(max_workers=None) as executor:
    # zip(*...) turns the list of triples into the three parallel argument
    # sequences that executor.map expects.
    image_dims_computation = executor.map(get_image_shape, *zip(*training_images_input))
    image_dimension_list = list(image_dims_computation)

# Per-axis statistics over all collected shape tuples.
print('Min Dimensions:', np.min(image_dimension_list, axis=0))
print('Max Dimensions:', np.max(image_dimension_list, axis=0))
print('Mean Dimensions:', np.mean(image_dimension_list, axis=0))
print('Median Dimensions:', np.median(image_dimension_list, axis=0))

Starting the computation of image shape
ThreadPoolExecutor-0_0: working on img num: 0
ThreadPoolExecutor-0_32: working on img num: 1000
ThreadPoolExecutor-0_5: working on img num: 2000
ThreadPoolExecutor-0_47: working on img num: 3000
ThreadPoolExecutor-0_56: working on img num: 4000
ThreadPoolExecutor-0_44: working on img num: 5000
ThreadPoolExecutor-0_14: working on img num: 5215
Min Dimensions: [127 384   3]
Max Dimensions: [2663 2916    3]
Mean Dimensions: [ 968.07476994 1320.61081288    3.        ]
Median Dimensions: [ 888. 1284.    3.]


In [7]:
def resize_images(index,image,total_images,target_size=(256,256)):
    """Load one image, convert it to grayscale and resize it.

    Intended as a worker for ``executor.map``: ``index`` and ``total_images``
    are only used to print a progress line on every 1000th image and on the
    last one.

    Parameters
    ----------
    index : int
        Position of this image in the work list.
    image : str
        Path of the image file to read.
    total_images : int
        Length of the work list.
    target_size : tuple of int, optional
        Passed to ``cv2.resize`` as ``dsize``. Defaults to ``(256, 256)``,
        the size that was previously hard-coded.

    Returns
    -------
    numpy.ndarray
        The resized grayscale image as ``float32``. Values are not rescaled.

    Raises
    ------
    OSError
        If ``cv2.imread`` cannot read the file (it returns ``None`` then).
    """
    if index % 1000 == 0 or index == total_images - 1:
        print('{}: working on img num: {}'.format(threading.current_thread().name,index))
    pixels = cv2.imread(image)
    # cv2.imread signals failure by returning None rather than raising; fail
    # here with the file name instead of a cv2 error from cvtColor.
    if pixels is None:
        raise OSError('cv2.imread could not read image: {}'.format(image))
    gray = cv2.cvtColor(pixels,cv2.COLOR_BGR2GRAY)
    resized = cv2.resize(gray,dsize=target_size,interpolation=cv2.INTER_CUBIC)
    return np.array(resized,dtype=np.float32)

# One (index, path, total) work item per training image.
training_images = [(index,img,len(training_data)) for index,img in enumerate(training_data['filename'])]

print('Resizing training images')
# The context manager shuts the pool down once every image has been
# processed, so worker threads do not linger in the kernel after this cell.
with futures.ThreadPoolExecutor(max_workers=None) as executor:
    # zip(*...) turns the list of triples into the three parallel argument
    # sequences that executor.map expects; results come back in input order.
    training_images_resize_computation = executor.map(resize_images, *zip(*training_images))
    train_data = np.array(list(training_images_resize_computation))

Starting the computation of image shape
ThreadPoolExecutor-1_0: working on img num: 0
ThreadPoolExecutor-1_17: working on img num: 1000
ThreadPoolExecutor-1_44: working on img num: 2000
ThreadPoolExecutor-1_11: working on img num: 3000
ThreadPoolExecutor-1_57: working on img num: 4000
ThreadPoolExecutor-1_11: working on img num: 5000
ThreadPoolExecutor-1_37: working on img num: 5215


In [8]:
# Add the trailing channel axis that the network's (256, 256, 1) input needs.
# The ndim guard keeps the cell idempotent: re-running it no longer stacks an
# extra axis onto an array that already has one.
if train_data.ndim == 3:
    train_data = train_data[...,np.newaxis]
train_data.shape

(5216, 256, 256, 1)

In [9]:
# Learn the string -> integer class mapping from the training labels, then
# encode those same labels with it.
label_encoder = LabelEncoder().fit(training_data['label'])
train_labels = label_encoder.transform(training_data['label'])

# Quick sanity check: shape, container type and the first few encoded labels.
(train_labels.shape, type(train_labels), train_labels[0:4])

((5216,), numpy.ndarray, array([1, 0, 1, 1], dtype=int64))

In [10]:
# Apply the same preprocessing as resize_images() to the validation split:
# read, grayscale, resize to 256x256, cast to float32.
validation_data = list()
for file in validation_dataset['filename']:
    image = cv2.imread(file)
    # cv2.imread returns None instead of raising when a file cannot be read.
    if image is None:
        raise OSError('cv2.imread could not read image: {}'.format(file))
    image = cv2.cvtColor(image,cv2.COLOR_BGR2GRAY)
    image = cv2.resize(image,dsize=(256,256),interpolation=cv2.INTER_CUBIC)
    image = np.array(image,dtype=np.float32)
    validation_data.append(image)
validation_data = np.array(validation_data)
# Trailing channel axis to match the network's (256, 256, 1) input.
validation_data = validation_data[...,np.newaxis]

# Encode with the mapping already learned from the training labels. Fitting
# the encoder again here would overwrite that mapping and could assign
# different integers if the validation labels differ from the training ones.
validation_labels = label_encoder.transform(validation_dataset['label'])
validation_data.shape,type(validation_data),validation_labels.shape,type(validation_labels),validation_labels[0:4]

((16, 256, 256, 1),
 numpy.ndarray,
 (16,),
 numpy.ndarray,
 array([0, 0, 0, 1], dtype=int64))

In [15]:
# Single-channel 256x256 input. Named `input_layer` so the builtin `input()`
# is not shadowed in the notebook namespace.
input_layer = tf.keras.layers.Input(shape = (256,256,1))

# Three conv + max-pool stages: filters double (32 -> 64 -> 128) while each
# 2x2 pool halves the spatial size.
convolution_layer_1 = tf.keras.layers.Conv2D(32, kernel_size=(3, 3), activation='relu', padding='same')(input_layer)
pooling_layer_1 = tf.keras.layers.MaxPooling2D(pool_size=(2, 2))(convolution_layer_1)

convolution_layer_2 = tf.keras.layers.Conv2D(64, kernel_size=(3, 3), activation='relu', padding='same')(pooling_layer_1)
pooling_layer_2 = tf.keras.layers.MaxPooling2D(pool_size=(2, 2))(convolution_layer_2)

convolution_layer_3 = tf.keras.layers.Conv2D(128, kernel_size=(3, 3), activation='relu', padding='same')(pooling_layer_2)
pooling_layer_3 = tf.keras.layers.MaxPooling2D(pool_size=(2, 2))(convolution_layer_3)

flattened_output = tf.keras.layers.Flatten()(pooling_layer_3)

# Two 512-unit dense layers, each followed by 30% dropout.
fully_connected_layer_1 = tf.keras.layers.Dense(512, activation='relu')(flattened_output)
dropout_1 = tf.keras.layers.Dropout(rate=0.3)(fully_connected_layer_1)

fully_connected_layer_2 = tf.keras.layers.Dense(512, activation='relu')(dropout_1)
dropout_2 = tf.keras.layers.Dropout(rate=0.3)(fully_connected_layer_2)

# One sigmoid unit, paired with binary_crossentropy for the 0/1 labels.
output = tf.keras.layers.Dense(1, activation='sigmoid')(dropout_2)

model = tf.keras.Model(inputs=input_layer, outputs=output)

# NOTE(review): `lr` / `decay` are kept as originally written; newer Keras
# releases spell these differently - confirm against the installed version.
sgd = tf.keras.optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
# Pass the configured optimizer object. The string 'sgd' builds a default SGD
# and silently ignores the momentum / nesterov / decay settings above.
model.compile(optimizer=sgd,loss='binary_crossentropy',metrics=['accuracy'])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 256, 256, 1)       0         
_________________________________________________________________
conv2d_4 (Conv2D)            (None, 256, 256, 32)      320       
_________________________________________________________________
max_pooling2d_4 (MaxPooling2 (None, 128, 128, 32)      0         
_________________________________________________________________
conv2d_5 (Conv2D)            (None, 128, 128, 64)      18496     
_________________________________________________________________
max_pooling2d_5 (MaxPooling2 (None, 64, 64, 64)        0         
_________________________________________________________________
conv2d_6 (Conv2D)            (None, 64, 64, 128)       73856     
_________________________________________________________________
max_pooling2d_6 (MaxPooling2 (None, 32, 32, 128)       0         
__________

In [None]:
# Halve the learning rate once val_loss has gone 2 epochs without improving,
# never dropping below 1e-6.
reduced_learing_rate = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=2,
    min_lr=1e-6,
)
callbacks = [reduced_learing_rate]

# Train for 25 epochs in mini-batches of 10, evaluating on the validation
# arrays after every epoch.
history = model.fit(
    train_data,
    train_labels,
    batch_size=10,
    epochs=25,
    validation_data=(validation_data, validation_labels),
    callbacks=callbacks,
    verbose=1,
)

Train on 5216 samples, validate on 16 samples
Epoch 1/25
Epoch 2/25