In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import random

import os
from os import path

import matplotlib.pyplot as plt

import glob
from PIL import Image

# Loading image paths

In [2]:
def get_images_path_list(dir_path):
    """Return the paths of all ``*.jpeg`` files located directly in ``dir_path``.

    The search is not recursive, and the result keeps whatever order
    ``glob.glob`` produces. An empty list is returned when nothing matches.
    """
    jpeg_pattern = f"{dir_path}/*.jpeg"
    return glob.glob(jpeg_pattern)


ROOT_DIR = '/kaggle/input/chest-xray-pneumonia/chest_xray/chest_xray'

# Dedicated, seeded generator so the balanced subsets are identical on every
# "Restart & Run All" without touching the global `random` state.
SAMPLING_SEED = 42
sampling_rng = random.Random(SAMPLING_SEED)


def balance_classes(normal_paths, pneumonia_paths):
    """Randomly downsample both classes to the size of the smaller one.

    The path lists are sorted before sampling so that the result depends only
    on the seed and not on the order in which the file system listed the files.

    Returns:
        A ``(normal_paths, pneumonia_paths)`` tuple of equally long lists.
    """
    n_per_class = min(len(normal_paths), len(pneumonia_paths))
    balanced_normal = sampling_rng.sample(sorted(normal_paths), n_per_class)
    balanced_pneumonia = sampling_rng.sample(sorted(pneumonia_paths), n_per_class)
    return balanced_normal, balanced_pneumonia


train_normal = get_images_path_list(path.join(ROOT_DIR, 'train', 'NORMAL'))
print("len(train_normal): ", len(train_normal))
train_pneumonia = get_images_path_list(path.join(ROOT_DIR, 'train', 'PNEUMONIA'))
print("len(train_pneumonia): ", len(train_pneumonia))
#balance train
train_normal, train_pneumonia = balance_classes(train_normal, train_pneumonia)
print("len(train_normal) balanced: ", len(train_normal))
print("len(train_pneumonia) balanced: ", len(train_pneumonia))

test_normal = get_images_path_list(path.join(ROOT_DIR, 'test', 'NORMAL'))
print("len(test_normal): ", len(test_normal))
test_pneumonia = get_images_path_list(path.join(ROOT_DIR, 'test', 'PNEUMONIA'))
print("len(test_pneumonia): ", len(test_pneumonia))
#balance test
test_normal, test_pneumonia = balance_classes(test_normal, test_pneumonia)
print("len(test_normal) balanced: ", len(test_normal))
print("len(test_pneumonia) balanced: ", len(test_pneumonia))

# The validation split is used as-is (no balancing).
val_normal = get_images_path_list(path.join(ROOT_DIR, 'val', 'NORMAL'))
print("len(val_normal): ", len(val_normal))
val_pneumonia = get_images_path_list(path.join(ROOT_DIR, 'val', 'PNEUMONIA'))
# Report the pneumonia count itself (the normal count was printed here before).
print("len(val_pneumonia): ", len(val_pneumonia))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Importing TensorFlow and Keras

In [3]:
import tensorflow as tf
print(tf.__version__)  # show which TensorFlow version this run used


# NOTE(review): these names come from the standalone `keras` package while the
# rest of the notebook uses `tf.keras` (applications, optimizers, callbacks) —
# confirm both resolve to the same Keras implementation in this environment.
from keras.models import Sequential, Model
from keras.layers import GlobalAveragePooling2D, Dense, Dropout, Flatten, Conv2D, MaxPooling2D, Input

# Helper Functions

In [4]:
# Source https://www.tensorflow.org/tutorials/load_data/tfrecord

# The following functions can be used to convert a value to a type compatible
# with tf.train.Example.

def _image_feature(value):
    """Returns a bytes_list holding the JPEG encoding of an image.

    ``value`` is passed to ``tf.io.encode_jpeg``; the encoded bytes are what
    gets stored in the feature.
    """
    jpeg_bytes = tf.io.encode_jpeg(value).numpy()
    encoded = tf.train.BytesList(value=[jpeg_bytes])
    return tf.train.Feature(bytes_list=encoded)

def _bytes_feature(value):
  """Returns a bytes_list from a string / byte (or an eager tensor holding one)."""
  eager_tensor_type = type(tf.constant(0))
  # BytesList won't unpack a string from an EagerTensor, so pull the raw value out first.
  raw_value = value.numpy() if isinstance(value, eager_tensor_type) else value
  return tf.train.Feature(bytes_list=tf.train.BytesList(value=[raw_value]))

def _float_feature(value):
  """Wraps a single float / double in a tf.train.Feature float_list."""
  wrapped = tf.train.FloatList(value=[value])
  return tf.train.Feature(float_list=wrapped)

def _int64_feature(value):
  """Wraps a single bool / enum / int / uint in a tf.train.Feature int64_list."""
  wrapped = tf.train.Int64List(value=[value])
  return tf.train.Feature(int64_list=wrapped)

# Creating TFRecord

In [5]:
tfrecords_dir = "tfrecords_dir"

# Create the TFRecords output folder on first run only; later runs reuse it silently.
tfrecords_dir_missing = not os.path.exists(tfrecords_dir)
if tfrecords_dir_missing:
    os.makedirs(tfrecords_dir)
    print("{0} directory created to save TFRecords".format(tfrecords_dir))

In [6]:
def create_example(image, label):
    """Build a ``tf.train.Example`` with an integer 'label' and a JPEG 'image'.

    The string ``"normal"`` maps to class 0; any other label value maps to
    class 1.
    """
    class_id = 1
    if label == "normal":
        class_id = 0

    features = tf.train.Features(
        feature={
            'label': _int64_feature(class_id),
            'image': _image_feature(image),
        }
    )
    return tf.train.Example(features=features)

In [9]:
#serialize images from set (train, test, val) with their labels to a TFRecord file.
def write_serialized(imgs_labels, set_name):
    """Serialize every image of a split, with its label, into one TFRecord file.

    Args:
        imgs_labels: sequence of groups, one per class; ``group[0]`` is the
            list of image paths and ``group[1]`` the label passed on to
            ``create_example``.
        set_name: split name used to build the output file name
            ``file_<set_name>.tfrec`` inside ``tfrecords_dir``.

    Prints how many examples were written once the file is closed.
    """
    tfrecord_filename = path.join(tfrecords_dir, "file_{0}.tfrec".format(set_name))
    total_written = 0

    with tf.io.TFRecordWriter(tfrecord_filename) as writer:
        for group in imgs_labels:
            imgs_paths, label = group[0], group[1]
            for img_path in imgs_paths:
                # Decode the file so create_example receives an image tensor.
                img = tf.io.decode_jpeg(tf.io.read_file(img_path))
                serialized = create_example(img, label).SerializeToString()
                writer.write(serialized)
                total_written += 1

    print("Written {0} examples to {1}".format(total_written, tfrecord_filename))

In [10]:
#Write the train, test and val data to their TFRecord files (in that order)
split_paths = {
    "train": (train_pneumonia, train_normal),
    "test": (test_pneumonia, test_normal),
    "val": (val_pneumonia, val_normal),
}

for split_name, (pneumonia_paths, normal_paths) in split_paths.items():
    # Pneumonia group first, then normal, for every split.
    imgs_labels = [
        [pneumonia_paths, "pneumonia"],
        [normal_paths, "normal"]
    ]
    write_serialized(imgs_labels, split_name)

In [12]:
#parse example [image + label] from TFRecord
def parse_tfrecord_fn(example):
    """Parse one serialized example into a dict with 'label' and 'image'.

    'label' is read as a scalar int64; 'image' is read as a scalar string and
    then decoded from JPEG with 3 channels.
    """
    schema = {
        "label": tf.io.FixedLenFeature([], tf.int64),
        "image": tf.io.FixedLenFeature([], tf.string),
    }
    parsed = tf.io.parse_single_example(example, schema)
    parsed["image"] = tf.io.decode_jpeg(parsed["image"], channels=3)
    return parsed

In [13]:
#Explore one sample from the generated TFRecord
raw_dataset = tf.data.TFRecordDataset(path.join(tfrecords_dir, "file_train.tfrec"))
# Count the records by streaming over them; building a list would keep every
# serialized image in memory at once just to take its length.
print("Dataset size: ", sum(1 for _ in raw_dataset))
parsed_dataset = raw_dataset.map(parse_tfrecord_fn)


# Show the label, shape and picture of the first parsed example.
for features in parsed_dataset.take(1):
    print("label: {0}".format("Normal" if features["label"] == 0 else "Pneumonia"))
    print(f"Image shape: {features['image'].shape}")
    plt.figure(figsize=(7, 7))
    plt.imshow(features["image"].numpy())
    plt.show()

# Train a simple model using the generated TFRecords

In [52]:
#resize image to the correct input shape expected by the ML model
def prepare_sample(features):
    """Turn a parsed example into an ``(image, label)`` pair for the model.

    The image is resized to 224x224; pixel values are not rescaled here.
    """
    target_size = (224, 224)
    resized_image = tf.image.resize(features["image"], size=target_size)
    label = features["label"]
    return resized_image, label


def get_dataset(filenames, batch_size):
    """Build a shuffled, batched ``tf.data`` pipeline from TFRecord files.

    Args:
        filenames: TFRecord file path(s) to read.
        batch_size: number of samples per batch; the shuffle buffer holds
            ``batch_size * 100`` samples.

    Returns:
        A dataset yielding ``(image, label)`` batches produced by
        ``parse_tfrecord_fn`` followed by ``prepare_sample``.
    """
    # Resolve the tuning constant here instead of relying on a notebook-level
    # AUTOTUNE variable that is only assigned in a later cell.
    autotune = tf.data.AUTOTUNE
    dataset = (
        tf.data.TFRecordDataset(filenames, num_parallel_reads=autotune)
        .map(parse_tfrecord_fn, num_parallel_calls=autotune)  # decode each record
        .map(prepare_sample, num_parallel_calls=autotune)  # resize and split into (image, label)
        # The TFRecord files are written one class after the other, so the
        # samples must be shuffled or each batch would contain a single class.
        .shuffle(batch_size * 100)
        .batch(batch_size)
        # Prepare upcoming batches while the current one is being consumed.
        # batch prefetch Stackoverflow https://stackoverflow.com/a/67361329/11292753
        .prefetch(autotune)
    )
    return dataset

# get pretrained VGG16 model using imagenet weights
def load_vgg16_pretrained_model(input_shape):
    """Return VGG16 (ImageNet weights) truncated at ``block5_pool``.

    Args:
        input_shape: input shape passed to ``tf.keras.applications.VGG16``.

    Returns:
        A model mapping the VGG16 input to the ``block5_pool`` output, with
        its first 17 layers frozen.
    """
    base_model = tf.keras.applications.VGG16(weights='imagenet', input_shape=input_shape)
    transfer_layer = base_model.get_layer('block5_pool')
    pretrained_model = Model(inputs = base_model.input, outputs = transfer_layer.output)

    # consider which layers you want to freeze before you compile
    for layer in pretrained_model.layers[:17]:
        layer.trainable = False

    # Hand the truncated model back to the caller (previously nothing was
    # returned, so callers received None).
    return pretrained_model

# get pretrained Xception model using imagenet weights
def load_xception_pretrained_model(input_shape):
    """Return Xception (ImageNet weights, no classification top).

    Every layer except the last 10 is frozen; the last 10 keep their default
    trainable state.
    """
    base_model = tf.keras.applications.Xception(
        weights='imagenet',
        include_top=False,
        input_shape=input_shape,
    )

    # freeze all layers but last 10
    for frozen_layer in base_model.layers[:-10]:
        frozen_layer.trainable = False

    return base_model

#build our model based on pretrained model
def make_model(exponential_decay, print_summary = False):
    """Build and compile the binary pneumonia classifier on top of Xception.

    Args:
        exponential_decay: when truthy, Adam uses an exponentially decaying
            learning rate starting at 0.01; otherwise a constant 1e-5.
        print_summary: when True, print ``model.summary()`` before returning.

    Returns:
        The compiled model. It takes 224x224x3 images with raw (unscaled)
        pixel values, as produced by ``prepare_sample``, and outputs a single
        sigmoid probability.
    """
    input_shape = (224, 224, 3)

    base_model = load_xception_pretrained_model(input_shape)

    # The ImageNet weights of Xception expect inputs scaled by
    # xception.preprocess_input. The scaling is done inside the model so the
    # data pipeline and any prediction code can keep passing raw pixel values.
    inputs = Input(shape=input_shape)
    x = tf.keras.applications.xception.preprocess_input(inputs)
    x = base_model(x)
    x = Flatten()(x) #flatten all the output to FC layer
    x = Dense(512, activation='relu')(x) # Relu Dense of 512 units
    x = Dropout(0.3)(x) # use Dropout to decrease overfitting
    x = Dense(256, activation='relu')(x)
    predictions = Dense(1, activation='sigmoid')(x) # Sigmoid activation: [0,1]
    model = Model(inputs=inputs, outputs=predictions)

    # Optional schedule: the learning rate is multiplied by 0.96 after every
    # 20 optimizer steps (staircase), starting from initial_learning_rate.
    # It applies to all trainable weights alike.
    initial_learning_rate = 0.01
    lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate, decay_steps=20, decay_rate=0.96, staircase=True
    )

    loss = 'binary_crossentropy'  #either pneumonia or normal
    metrics = ['binary_accuracy']

    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5 if not exponential_decay else lr_schedule),
        loss=loss,
        metrics=metrics,
    )

    if print_summary:
        model.summary()

    return model

In [53]:
#create checkpoint and early-stopping callbacks that can be passed to model.fit
# NOTE(review): the training cell currently has its `callbacks=` argument
# commented out, so neither callback is active — confirm this is intended.
checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(
    filepath="pneumonia_model.h5",
    monitor='val_loss',
    save_best_only=True,
)

# NOTE(review): patience=100 is larger than the 50 epochs configured in the
# training cell, so this callback could not stop a run early — verify.
early_stopping_cb = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=100,
    restore_best_weights=True,
)

model = make_model(print_summary=False, exponential_decay=True)

# Train our model

In [54]:
train_filenames = tf.io.gfile.glob(f"{tfrecords_dir}/file_train.tfrec")
val_filenames = tf.io.gfile.glob(f"{tfrecords_dir}/file_val.tfrec")

batch_size = 32
epochs = 50
steps_per_epoch = 15

# From Stack Overflow: tf.data builds a performance model of the input pipeline
# and runs an optimization algorithm to find a good allocation of its CPU
# budget across all parameters specified as AUTOTUNE
# Stack Overflow https://stackoverflow.com/a/59493168/11292753
AUTOTUNE = tf.data.AUTOTUNE
# fit() is asked for steps_per_epoch * epochs batches in total. A single pass
# over the finite TFRecord dataset can hold fewer than that, in which case
# Keras stops training early ("input ran out of data"). Repeating the training
# dataset lets every epoch draw its full number of steps.
train_dataset = get_dataset(train_filenames, batch_size).repeat()
val_dataset = get_dataset(val_filenames, 8)

# debugging useful
# image_batch = next(iter(train_dataset))
# imgs_tensor = tf.constant(image_batch[0])
# labels_tensor=tf.constant(image_batch[1])

history = model.fit(
    train_dataset,
    epochs=epochs,
    steps_per_epoch=steps_per_epoch,
    validation_data=val_dataset,
    #callbacks=[checkpoint_cb, early_stopping_cb],
    verbose=1,
)

In [55]:
def plot_history_acc(history, loss_metric_type):
    """Plot training and validation accuracy per epoch.

    Args:
        history: object whose ``history`` dict holds the per-epoch logs.
        loss_metric_type: ``"binary"`` selects the ``binary_accuracy`` logs;
            any other value selects the ``categorical_accuracy`` logs.
    """
    logs = history.history
    epoch_axis = np.arange(0, len(logs["loss"]))
    metric_name = "binary_accuracy" if loss_metric_type == "binary" else "categorical_accuracy"

    plt.style.use("ggplot")
    plt.figure()
    plt.plot(epoch_axis, logs[metric_name], label="train_acc")
    plt.plot(epoch_axis, logs["val_" + metric_name], label="val_acc")

    plt.title("Training Accuracy on Dataset")
    plt.xlabel("Epoch #")
    plt.ylabel("Accuracy")
    plt.legend(loc="lower left")

def plot_history_loss(history):
    """Plot training and validation loss per epoch from ``history.history``."""
    logs = history.history
    epoch_axis = np.arange(0, len(logs["loss"]))

    plt.style.use("ggplot")
    plt.figure()
    for log_key, curve_label in (("loss", "train_loss"), ("val_loss", "val_loss")):
        plt.plot(epoch_axis, logs[log_key], label=curve_label)

    plt.title("Training Loss on Dataset")
    plt.xlabel("Epoch #")
    plt.ylabel("Loss")
    plt.legend(loc="lower left")

In [56]:
#plot the per-epoch training/validation accuracy chart
loss_metric_type = 'binary' # 'binary' selects the binary_accuracy logs in plot_history_acc
plot_history_acc(history, loss_metric_type)

In [57]:
#plot the per-epoch training/validation loss chart
plot_history_loss(history)

# Evaluate model

In [64]:
# Build the evaluation pipeline from the test TFRecord file.
test_filenames = tf.io.gfile.glob(f"{tfrecords_dir}/file_test.tfrec")
# batch_size is the notebook-level value assigned in the training cell.
test_dataset = get_dataset(test_filenames, batch_size)

# Evaluate with the loss and metrics the model was compiled with.
model.evaluate(test_dataset)

# Prediction

In [63]:
def show_batch_predictions(image_batch):
    """Plot every image of one batch together with the model's prediction.

    Each subplot is titled ``"<predicted class> - <T|F>"`` where T/F tells
    whether the predicted class matches the true label.

    Args:
        image_batch: an ``(images, labels)`` pair; labels use 0 for normal
            and 1 for pneumonia. Images are expected to hold raw 0-255 pixel
            values (they are divided by 255 for display only).

    Uses the notebook-level ``model`` for prediction.
    """
    img_tensor = tf.constant(image_batch[0])
    labels_tensor = tf.constant(image_batch[1])

    # debugging: print correct labels: 1= pneumonia, 0 = normal
    print("correct labels:", labels_tensor)

    # Use the real batch length: a final or smaller batch has fewer than 32 images.
    num_images = int(img_tensor.shape[0])
    if num_images == 0:
        return

    # One forward pass for the whole batch instead of one per image.
    probabilities = model.predict(img_tensor)

    num_cols = 4
    # Keep the original 8x4 grid, growing it only for batches above 32 images.
    num_rows = max(8, (num_images + num_cols - 1) // num_cols)

    plt.figure(figsize=(14, 14))
    for n in range(num_images):
        plt.subplot(num_rows, num_cols, n + 1)
        plt.imshow(img_tensor[n] / 255.0)

        # The model outputs one sigmoid probability per image; threshold it
        # at 0.5 to get the predicted class (0 = normal, 1 = pneumonia).
        probability = float(probabilities[n][0])
        predicted_label = 0 if probability < 0.5 else 1

        str_label = "normal" if predicted_label == 0 else "pneumonia"
        # Compare class with class (not the raw probability with the label).
        correct = "T" if predicted_label == int(labels_tensor[n]) else "F"
        plt.title("{0} - {1}".format(str_label, correct))
        plt.axis("off")


# Take the first batch of the test dataset and plot the model's predictions for it.
image_batch = next(iter(test_dataset))
show_batch_predictions(image_batch)

# Save model

In [60]:
export_dir= "export_dir"

if not os.path.exists(export_dir):
    os.makedirs(export_dir)  # creating the folder that will hold the exported model
    print("{0} directory created to save exported model".format(export_dir))

# Build the target path from export_dir so the folder name is defined in one place.
# NOTE(review): with save_format=None the on-disk format is chosen by Keras from
# the path; confirm that the ".pb" name matches what actually gets written.
export_path = path.join(export_dir, "pneumonia.pb")

tf.keras.models.save_model(
    model,
    export_path,
    overwrite=True,
    include_optimizer=True,
    save_format=None,
    signatures=None,
    options=None
)

print('\nSaved model')

# Download model

In [None]:
from IPython.display import FileLink

# Zip the export_dir folder with the shell `zip` command, then show a link to the archive.
!zip -r model.zip export_dir
FileLink(r'model.zip')