In [2]:
import pandas as pd
import numpy as np
import os

import pickle
from datetime import datetime


# Visualizations
import seaborn as sns
import matplotlib.pyplot as pd


# preprocessing
from tensorflow.keras.preprocessing import image
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.image import rgb_to_grayscale


# Reshaping 
from tensorflow import reshape
from tensorflow.image import resize_with_pad


# Modelling 
from tensorflow.keras import models
from tensorflow.keras import layers
from tensorflow.keras.optimizers import Adam

# metrics
from tensorflow.keras.metrics import Recall, AUC


np.random.seed(42)

### Load Train Data From the Current Directory

In [2]:
# # Get current working directory
# PATH = os.getcwd() + '/../../src/data/chest_xray/'


# # importing training normal data
# norm_train_path = PATH+'train/NORMAL/'
# norm_train_batch = os.listdir(norm_train_path) 

# norm_train = []
# norm_errors = []
# for image_name in (norm_train_batch[:10]): 
#     img_path = norm_train_path + image_name 
#     try:
#         x = image.load_img(img_path) 
#         # preprocessing if required 
#         norm_train.append(x) 
#     except:
#         norm_errors.append(image_name)
    
    
# # importing training pnuemonia data
# pnue_train_path = PATH + 'train/PNEUMONIA/'
# pnue_train_batch = os.listdir(pnue_train_path)

# pnue_train = []
# pnue_errors = []
# for image_name in pnue_train_batch[:10]:
#     img_path = pnue_train_path + image_name
#     try:
#         x = image.load_img(img_path)
#         pnue_train.append(x)
#     except:
#         pnue_errors.append(image_name)
    


# Define an import function

In [17]:
def import_image(PATH, image_name):
    """
    PATH --> str: Relative path to image directoy
    image_name --> str: Name of the image to load
    
    Returns:
    PIL image
    
    """
    
    # create path to file
    img_path = PATH + "/" + image_name
    
    # load file and return pil
    return image.load_img(img_path) 

def grayscale_and_resize(PIL, shape=(256,256), padding=False, grayscale=True):
    """
    This is the preprocessing function that will take the raw jpeg, gray scale it, resize it and 
    turn it into an array
    
    
    PIL --> PIL object
    shape --> tuple: size of the final array
    padding --> bool: if True, will use tf.resize_with_pad
    """
    if padding:
        gray_image = rgb_to_grayscale(PIL)
        resized_image_arr = resize_with_pad(gray_image, target_height=shape[0], target_width=shape[1])
    else:
        if grayscale:
            resized_image_arr = img_to_array(PIL.convert(mode = 'L').resize(shape))
        else:
            resized_image_arr = img_to_array(PIL.resize(shape))
    
    return resized_image_arr


def import_image_to_array(
         RELPATH,
         dir_names = ['train', 'test', 'val'],
         sub_dir_names = ['NORMAL', 'PNEUMONIA'],
         padding=False,
         shape=(256,256),
         grayscale=True,
         test=False
):
    """
    This function loads all train, test and validation data into a dictionary of images
    
    Padding currently only returns a grayscale image.
    
    =====================================================================================
    RELPATH --> str: The relative path to the cwd to the directory containing image directories
    eg '../../src/data/chest_xray'
    =====================================================================================
    dir_names --> list, str: The names of the subdirectories containing the images
    eg ['train', 'test', 'val'] <-- default
    =====================================================================================
    sub_dir_names --> list -> str: names of the subdirectory containg postivie and negative cases
    eg ['NORMAL', 'PNEUMONIA'] <-- default
    =====================================================================================
    padding  --> bool: Whether you want the reshaping to be padded or not
    
    =====================================================================================
    shape --> tuple-> int: The final shape of the tensor array
    =====================================================================================  
    grayscale --> Bool: if True, images will be reduced to grayscale (x,x,1) else (x,x,3)
    
    returns
    
    dict --> str:list -> tuple -> (tf.array, bool)
    A dictionary where the keys are the dir_names and the values are lists containing tuple where 
    the first index is the tf.array and the second is a boolian, True if class is pnuemonia, false otherwise.
    """
    # test relative path works!! 
    PATH = os.getcwd() + RELPATH
    
    try:
        os.listdir(PATH)
        print("Your relative directory is good, proceeding to import files...", end="\n\n")
    except Exception as e:
        print(str(e))
        print(f"Your relative path directory is not pointing to the correct location. Double check your input \n")
        print("Terminating Program", end='\n')
        print("=======================================================================================")
        return False
    
    
    # instantiate a dict object and populate the keys
    image_dict = {}
    for name in dir_names:
        image_dict[name] = []
        
        print(f"Loading images from {name}", end='\n')
        
        
        # For each subdirectory, get all of the images and append to dictionary
        for sub_dir in sub_dir_names:
            subPATH = PATH + name + "/" + sub_dir
            # list of all image names in the subdirectory
            image_batch = os.listdir(subPATH)
            
            for image in image_batch:
                # import the image in pil format
                pil = import_image(subPATH, image)
                # gray scale and reshape the image turning it into an array
                gray_resized_pil = grayscale_and_resize(pil, shape=shape, padding=padding, grayscale=grayscale)
                
                # center the pixels
                centered_array = gray_resized_pil/255
                
                # append to the image_dict with class flag
                flag = 1
                if sub_dir == 'NORMAL':
                    flag = 0
                
                image_dict[name].append((image, centered_array, flag))
                
            
                # if this is just a test case, break out of this loop so we get one from each class
                if test == True:
                    break
            
            print(f"Finished loading images from {sub_dir}", end="\n")

        print()
    
    return image_dict               

In [18]:
images = import_image_to_array('/../../src/data/chest_xray/',test=False, grayscale=False, shape=(224,224))

Your relative directory is good, proceeding to import files...

Loading images from train
Finished loading images from NORMAL
Finished loading images from PNEUMONIA

Loading images from test
Finished loading images from NORMAL
Finished loading images from PNEUMONIA

Loading images from val
Finished loading images from NORMAL
Finished loading images from PNEUMONIA



In [24]:
images['test'][300][0]

'person109_bacteria_526.jpeg'

In [34]:
test_image_name = [i[0] for i in images['test']]
test_image_class = [i[2] for i in images['test']]
is_bacterial = [1 if i[0].find("bacteria") != -1 else 0 for i in images['test']]
test_df = pd.DataFrame([test_image_name, test_image_class, is_bacterial]).T
test_df.columns = ['image_name', 'pnuemonia', "bacterial"]

test_df

Unnamed: 0,image_name,pnuemonia,bacterial
0,IM-0031-0001.jpeg,0,0
1,IM-0025-0001.jpeg,0,0
2,NORMAL2-IM-0272-0001.jpeg,0,0
3,NORMAL2-IM-0102-0001.jpeg,0,0
4,NORMAL2-IM-0229-0001.jpeg,0,0
...,...,...,...
619,person120_bacteria_572.jpeg,1,1
620,person171_bacteria_826.jpeg,1,1
621,person109_bacteria_512.jpeg,1,1
622,person83_bacteria_410.jpeg,1,1


In [None]:
def 

Check for class imbalance in the data. 

In [131]:
y_train = np.array([i[1] for i in images['train']])
X_train = np.array([i[0] for i in images['train']])  


X_test = np.array([i[0] for i in images['test']])
y_test = np.array([i[1] for i in images['test']])


X_val = np.array([i[0] for i in images['val']])
y_val = np.array([i[1] for i in images['val']])

In [132]:
X_train[0].shape

(224, 224, 3)

In [7]:
# calculate inverse frequency of each class 
pnue_frequency = sum(y_train)/len(y_train)
inv_pnue_frequency = 1/pnue_frequency

normal_frequency = (len(y_train)-sum(y_train))/len(y_train)
inv_normal_frequency = 1/normal_frequency

# save weights as a dictionary to be used in the model
weights = {
    0: inv_normal_frequency,
    1: inv_pnue_frequency
}

# Construct Sequential model

In [26]:
cnn = models.Sequential()
adam = Adam()
recall = Recall()
AUC = AUC()
# Input layer conv
cnn.add(layers.Conv2D(64, (3, 3), activation='relu', input_shape=(256, 256,  1)))
cnn.add(layers.MaxPooling2D((2, 2)))

# First hidden layer conv
cnn.add(layers.Conv2D(32, (3, 3), activation='relu'))
cnn.add(layers.MaxPooling2D((2, 2)))
cnn.add(layers.Dropout(0.2))
cnn.add(layers.Flatten())

# Added first/MASSIVE dense layer
cnn.add(layers.Dense(1024, activation='relu'))
cnn.add(layers.Dropout(.5))

# Add Second Layer dense 
cnn.add(layers.Dense(32, activation='relu'))
cnn.add(layers.Dropout(.1))

# Add third Layer dense
cnn.add(layers.Dense(16, activation='relu'))
cnn.add(layers.Dropout(.1))

# Added output layer 
cnn.add(layers.Dense(1, activation='sigmoid'))
cnn.compile(loss='binary_crossentropy',
              optimizer= adam,
              metrics=['acc', recall, AUC])

In [27]:
cnn1 = cnn.fit(X_train, y_train,
               epochs=10,
               batch_size=50,
               validation_data = (X_val, y_val), 
               class_weight=weights,
               verbose=True)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# Evaluation

In [107]:
hist_cnn = cnn.history
loss_values = hist_cnn['loss']
val_loss_values = hist_cnn['val_loss']
auc_values = hist_cnn['auc'] 
val_auc_values = hist_cnn['val_auc']
acc_values = hist_cnn['acc'] 
val_acc_values = hist_cnn['val_acc']
recall_values = hist_cnn['recall'] 
val_recall_values = hist_cnn['val_recall']
epochs = range(1, len(loss_values) + 1)


plt.subplots(2,2, figsize=(15, 12))
plt.subplot(121)
plt.plot(epochs, loss_values, 'g.', label='Training loss')
plt.plot(epochs, val_loss_values, 'g', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()


plt.subplot(122)
plt.plot(epochs, auc_values, 'r.', label='Training auc')
plt.plot(epochs, val_auc_values, 'r', label='Validation auc')
plt.title('Training and validation AUC')
plt.xlabel('Epochs')
plt.ylabel('AUC')


plt.subplot(221)
plt.plot(epochs, acc_values, 'r.', label='Training Accuracy')
plt.plot(epochs, val_acc_values, 'r', label='Validation Accuracy')
plt.title('Training and validation Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')


plt.subplot(222)
plt.plot(epochs, recall_values, 'r.', label='Training Recall')
plt.plot(epochs, val_recall_values, 'r', label='Validation Recall')
plt.title('Training and validation Recall')
plt.xlabel('Epochs')
plt.ylabel('Recall')
plt.legend()
plt.show()

TypeError: 'History' object is not subscriptable

In [96]:
def get_labels(fit_model, X, threshold):
    return [1 if x >= threshold else 0 for x in fit_model.predict(X)]

In [99]:
def get_false_positive(true, prediction):
    return [1 if (x == 0 and y == 1) else 0 for x,y in zip(true,prediction)]

In [103]:
def get_true_positive(true, prediction):
    return [1 if (x == 1 and y == 1) else 0 for x,y in zip(true,prediction)]

In [105]:
predictions = get_labels(cnn1, X_test, 0.5)
fp = get_false_positive(y_test, predictions)
tp = get_true_positive(y_test, predictions)

In [102]:
sum(fp)

168

In [28]:
validation1_1 = cnn.evaluate(X_test, y_test)



In [294]:
# validation1_3 = cnn.evaluate(X_test, y_test)

In [12]:
validation1_ = cnn.evaluate(X_train, y_train)



In [20]:
today = str(datetime.today()).split()[0]
directory = "../../src/models/"
model_id = "tim-2"
file = directory+today+model_id+".HDF5"
cnn.save(file)

Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
INFO:tensorflow:Assets written to: ../../src/models/2020-12-02tim-1.HDF5/assets


In [72]:
cnn1 = models.load_model("../../src/models/2020-12-02tim-1.HDF5/")

cnn1.evaluate(X_test, y_test)



[0.8882023692131042, 0.7307692170143127, 1.0, 0.9111714363098145]

In [75]:
cnn1.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_2 (Conv2D)            (None, 254, 254, 64)      640       
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 127, 127, 64)      0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 125, 125, 32)      18464     
_________________________________________________________________
max_pooling2d_3 (MaxPooling2 (None, 62, 62, 32)        0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 62, 62, 32)        0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 123008)            0         
_________________________________________________________________
dense_3 (Dense)              (None, 32)               

In [301]:
cnn.evaluate(X_test, y_test)



[1.7555465698242188,
 0.7788461446762085,
 0.9871794581413269,
 0.8070019483566284]

In [302]:
cnn.summary()

Model: "sequential_25"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_46 (Conv2D)           (None, 254, 254, 64)      640       
_________________________________________________________________
max_pooling2d_39 (MaxPooling (None, 127, 127, 64)      0         
_________________________________________________________________
conv2d_47 (Conv2D)           (None, 125, 125, 32)      18464     
_________________________________________________________________
max_pooling2d_40 (MaxPooling (None, 62, 62, 32)        0         
_________________________________________________________________
flatten_17 (Flatten)         (None, 123008)            0         
_________________________________________________________________
dense_50 (Dense)             (None, 32)                3936288   
_________________________________________________________________
dropout_39 (Dropout)         (None, 32)              

# Best Model

In [None]:
cnn = models.Sequential()
adam = Adam()
recall = Recall()
AUC = AUC()

# Input layer conv
cnn.add(layers.Conv2D(64, (3, 3), activation='relu', input_shape=(256, 256,  1)))
cnn.add(layers.MaxPooling2D((2, 2)))

# First hidden layer conv
cnn.add(layers.Conv2D(32, (3, 3), activation='relu'))
cnn.add(layers.MaxPooling2D((2, 2)))
cnn.add(layers.Dropout(0.2))
cnn.add(layers.Flatten())

# Added first dense layer
cnn.add(layers.Dense(32, activation='relu'))
cnn.add(layers.Dropout(.2))

# Add Second Layer
cnn.add(layers.Dense(16, activation='relu'))
cnn.add(layers.Dropout(.1))
cnn.add(layers.Dense(1, activation='sigmoid'))
cnn.compile(loss='binary_crossentropy',
              optimizer= adam,
              metrics=['acc', recall, AUC])

cnn1 = cnn.fit(X_train, y_train,
               epochs=7,
               batch_size=50,
               validation_data = (X_val, y_val), 
               class_weight=weights,
               verbose=True)

# Second model

I added a second hidden layer and I added dropout = 0.1 to the input layer

In [291]:
cnn2 = models.Sequential()
adam2 = Adam()
recall = Recall()
AUC = AUC()

# Input Layer conv
cnn2.add(layers.Conv2D(64, (3, 3), activation='relu', input_shape=(256, 256,1)))
cnn2.add(layers.MaxPooling2D((2, 2)))
cnn2.add(layers.Dropout(0.1))

# First hidden Layer conv
cnn2.add(layers.Conv2D(32, (3, 3), activation='relu'))
cnn2.add(layers.MaxPooling2D((2, 2)))
cnn2.add(layers.Dropout(0.5))

# Second hidden Layer conv
cnn2.add(layers.Conv2D(16, (3, 3), activation='relu'))
cnn2.add(layers.MaxPooling2D((2, 2)))
cnn2.add(layers.Dropout(0.5))
cnn2.add(layers.Flatten())

# Added first Layer dense
cnn2.add(layers.Dense(32, activation='relu'))
cnn2.add(layers.Dropout(.5))

# Add Second Layer dense
cnn2.add(layers.Dense(16, activation='relu'))
cnn2.add(layers.Dropout(.5))

# Output Layer dense
cnn2.add(layers.Dense(1, activation='sigmoid'))
cnn2.compile(loss='binary_crossentropy',
              optimizer= adam2,
              metrics=['acc', recall, AUC])

history2 = cnn2.fit(X_train, y_train,
               epochs=5,
               batch_size=50,
               validation_data = (X_test, y_test), 
               class_weight=weights,
               verbose=True)

Epoch 1/5
  1/105 [..............................] - ETA: 0s - loss: 1.4348 - acc: 0.5000 - recall_13: 0.5135 - auc_3: 0.4553

KeyboardInterrupt: 

In [None]:
today = str(datetime.today()).split()[0]
directory = "../../src/models/"
model_id = "tim-2"
file = directory+today+model_number+".HDF5"
cnn2.save(file)

In [None]:
validation2_ = cnn2.evaluate(X_val, y_val)

In [None]:
validation2_2 = cnn2.evaluate(X_test, y_test)

In [None]:
validation2_3 = cnn2.evaluate(X_train, y_train)

In [None]:
cnn3 = models.Sequential()
adam3 = Adam()
recall3 = Recall()
AUC3 = AUC()
# Input Layer conv
cnn3.add(layers.Conv2D(64, (3, 3), activation='relu', input_shape=(256, 256,1)))
cnn3.add(layers.MaxPooling2D((2, 2)))
cnn3.add(layers.Dropout(0.1))

# First hidden Layer conv
cnn3.add(layers.Conv2D(32, (3, 3), activation='relu'))
cnn3.add(layers.MaxPooling2D((2, 2)))
cnn3.add(layers.Dropout(0.5))

# Second hidden Layer conv
cnn3.add(layers.Conv2D(16, (3, 3), activation='relu'))
cnn3.add(layers.MaxPooling2D((2, 2)))
cnn3.add(layers.Dropout(0.5))
cnn3.add(layers.Flatten())

# Added first Layer dense
cnn3.add(layers.Dense(32, activation='relu'))
cnn3.add(layers.Dropout(.5))

# Add Second Layer dense
cnn3.add(layers.Dense(16, activation='relu'))
cnn3.add(layers.Dropout(.5))

# Output Layer dense
cnn3.add(layers.Dense(1, activation='sigmoid'))
cnn3.compile(loss='binary_crossentropy',
              optimizer= adam2,
              metrics=['acc', recall3, AUC3])

history3 = cnn3.fit(X_train, y_train,
               epochs=10,
               batch_size=50,
               validation_data = (X_val, y_val), 
               class_weight=weights,
               verbose=True)

In [None]:
validation3_1 = cnn3.evaluate(X_val, y_val)

In [None]:
validation3_2 = cnn3.evaluate(X_test, y_test)

In [None]:
validation3_3 = cnn3.evaluate(X_train, y_train)

In [None]:
today = str(datetime.today()).split()[0]
directory = "../../src/models/"
model_id = "tim-3"
file = directory+today+model_number+model_id+".HDF5"
cnn3.save(file)