In [1]:
import os
import pre_processing_utils
import numpy as np
import pandas as pd
import tensorflow as tf
import time, sys
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.image import imread
from tensorflow.keras.metrics import AUC as auc
from multiprocessing import Pool
from IPython.display import clear_output

<h2>Data Pre-Processing</h2>

In [2]:
pre_processing_utils.check_data_exists()

test data location = /home/georgeridgway/GitProjects/DL-Project/chest_xray/test
training data location = /home/georgeridgway/GitProjects/DL-Project/chest_xray/train
validation data location = /home/georgeridgway/GitProjects/DL-Project/chest_xray/val


In [3]:
# A smiple progress bar to incidate how much work has been copleted during long run-times
def update_progress(progress, description):
    bar_length = 20
    if isinstance(progress, int):
        progress = float(progress)
    if not isinstance(progress, float):
        progress = 0
    if progress < 0:
        progress = 0
    if progress >= 1:
        progress = 1
    block = int(round(bar_length * progress))
    clear_output(wait = True)
    text = description + " [{0}] {1:.1f}%".format("#" * block + "-" * (bar_length - block), progress * 100)
    print(text)

In [4]:
def perform_pre_processing_tasks(tasks, description):
    update_progress(0, description)
    work_done = 0
    total_work = len(tasks)
    for task in tasks:
        pre_processing_utils.apply_transformation_to_folder(task, -1)
        work_done += 1
        update_progress(work_done / total_work, description)
    update_progress(1, description)

In [5]:
def equalize_data_with_rand_flips(root_path):
    normal_data_path = root_path + pre_processing_utils.normal_path
    normal_files = pre_processing_utils.get_num_files(normal_data_path)

    pneumonia_data_path = root_path + pre_processing_utils.pneumonia_path
    pneumonia_files = pre_processing_utils.get_num_files(pneumonia_data_path)

    diff = abs(normal_files - pneumonia_files)
    print(diff)
    if normal_files < pneumonia_files:
        task = [root_path, pre_processing_utils.normal_path, pre_processing_utils.equalize_data_transformation]
    else:
        task = [root_path, pre_processing_utils.pneumonia_path, pre_processing_utils.equalize_data_transformation]
    pre_processing_utils.apply_transformation_to_folder(task, diff)


In [6]:
def augment_training_data():
    root_data_path = pre_processing_utils.test_path
    # Defining the data paths for the given root dir
    normal_data_path = root_data_path + pre_processing_utils.normal_path
    pneumonia_data_path = root_data_path + pre_processing_utils.pneumonia_path
    # Equalizing the data before applying transformations
    if pre_processing_utils.get_num_files(normal_data_path) != pre_processing_utils.get_num_files(pneumonia_data_path):
        equalize_data_with_rand_flips(root_data_path)
        perform_pre_processing_tasks(pre_processing_utils.get_flip_images_tasks(root_data_path), 'Flipping Images Progress:')
        perform_pre_processing_tasks(pre_processing_utils.get_rotate_images_tasks(root_data_path), 'Rotating Images Progress:')
        perform_pre_processing_tasks(pre_processing_utils.get_scale_images_tasks(root_data_path), 'Scaling Images Progress:')
        perform_pre_processing_tasks(pre_processing_utils.get_translation_image_tasks(root_data_path), 'Translating Images Progress')
    perform_pre_processing_tasks(pre_processing_utils.get_noise_image_tasks(root_data_path), 'Adding Noise to Images Progress')


In [7]:
augment_training_data()

Adding Noise to Images Progress [####################] 100.0%


In [16]:
# We're going to have images of different dimensions, so handle that before using the CNN
image_shape = (300, 300, 3)

In [4]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator
image_gen.flow_from_directory(pre_processing_utils.train)

KeyboardInterrupt: 

In [22]:
image_gen.flow_from_directory(pre_processing_utils.test_path)

NameError: name 'image_gen' is not defined

In [25]:
image_gen.flow_from_directory(pre_processing_utils.validation_path)

NameError: name 'image_gen' is not defined

<h2>Building a Model</h2>

In [30]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv2D, MaxPool2D, Dropout, Flatten
from tensorflow.keras.metrics import Accuracy, AUC

In [33]:
model = Sequential()
model.add(Conv2D(filters=32, kernel_size=(3,3), input_shape=image_shape, activation='relu'))
model.add(MaxPool2D(pool_size=(2,2)))

model.add(Conv2D(filters=64, kernel_size=(3,3), activation='relu'))
model.add(MaxPool2D(pool_size=(2,2)))

model.add(Conv2D(filters=64, kernel_size=(3,3), activation='relu'))
model.add(MaxPool2D(pool_size=(2,2)))

# model.add(Conv2D(filters=128, kernel_size=(3,3), activation='relu'))
# model.add(MaxPool2D(pool_size=(2,2)))

# model.add(Conv2D(filters=256, kernel_size=(3,3), activation='relu'))
# model.add(MaxPool2D(pool_size=(2,2)))

# model.add(Conv2D(filters=512, kernel_size=(3,3), activation='relu'))
# model.add(MaxPool2D(pool_size=(2,2)))

model.add(Flatten())

model.add(Dense(256, activation='relu'))
model.add(Dropout(0.5))

model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam',
             metrics=[AUC()])

In [36]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 298, 298, 32)      896       
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 149, 149, 32)      0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 147, 147, 64)      18496     
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 73, 73, 64)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 71, 71, 64)        36928     
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 35, 35, 64)        0         
_________________________________________________________________
flatten (Flatten)            (None, 78400)             0

<h2>Regularization</h2>

In [39]:
from tensorflow.keras.callbacks import EarlyStopping

In [42]:
early_stop = EarlyStopping(monitor='val_loss', patience=2)
batch_size = 64

In [44]:
train_image_gen = image_gen.flow_from_directory(train_path,
                                               target_size = image_shape[:2],
                                               batch_size = batch_size,
                                               class_mode = 'binary')

NameError: name 'image_gen' is not defined

In [46]:
test_image_gen = image_gen.flow_from_directory(test_path,
                                               target_size=image_shape[:2],
                                               batch_size = batch_size,
                                               class_mode='binary',
                                               shuffle=False) # Don't want to shuffle test data and lose labels

NameError: name 'image_gen' is not defined

In [None]:
test_image_gen.class_indices

<h3>Running the Model</h3>

In [107]:
# Running without early stop for now - not enough examples for 'val_loss'?
results = model.fit(train_image_gen, epochs=20,
                             validation_data=test_image_gen,
                    callbacks=[early_stop]
                   )

NameError: name 'train_image_gen' is not defined

In [108]:
metrics = pd.DataFrame(model.history.history)

AttributeError: 'Sequential' object has no attribute 'history'

In [109]:
metrics[['accuracy', 'val_accuracy']].plot()

NameError: name 'metrics' is not defined

In [110]:
metrics[['auc', 'val_auc']].plot()

NameError: name 'metrics' is not defined

In [111]:
metrics[['loss', 'val_loss']].plot()

NameError: name 'metrics' is not defined

<p>We see that the model is overfitting significantly.</p>

In [112]:
model.save('image_classifier_accuracy.h5')