In [1]:

import random
import cv2
from keras import backend as K
from keras.preprocessing import image
from sklearn.metrics import roc_auc_score, roc_curve
from tensorflow.compat.v1.logging import INFO, set_verbosity

random.seed(a=None, version=2)

set_verbosity(INFO)



def load_image(img, image_dir, df, preprocess=True, H=320, W=320):
    """Load and preprocess image."""
    img_path = image_dir + img
    mean, std = get_mean_std_per_batch(img_path, df, H=H, W=W)
    x = image.load_img(img_path, target_size=(H, W))
    if preprocess:
        x -= mean
        x /= std
        x = np.expand_dims(x, axis=0)
    return x




caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [2]:

!pip install -q efficientnet
import efficientnet.tfkeras as efn


import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm

from keras.preprocessing.image import ImageDataGenerator
from keras.applications.densenet import DenseNet121
from keras.layers import Dense, GlobalAveragePooling2D
from keras.models import Model

from keras.models import load_model


# from tensorflow.keras.applications import DenseNet121
import tensorflow as tf
import tensorflow.keras.layers as L
# import tensorflow.keras.layers as Layers

[0m

In [3]:
from tensorflow.keras.optimizers import Adam

In [4]:
try:
    # TPU detection. No parameters necessary if TPU_NAME environment variable is
    # set: this is always the case on Kaggle.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()

print("REPLICAS: ", strategy.num_replicas_in_sync)

REPLICAS:  1


<a name='2'></a>
## 2. Load the Datasets

For this assignment, we will be using the [ChestX-ray8 dataset](https://arxiv.org/abs/1705.02315) which contains 108,948 frontal-view X-ray images of 32,717 unique patients. 
- Each image in the data set contains multiple text-mined labels identifying 14 different pathological conditions. 
- These in turn can be used by physicians to diagnose 8 different diseases. 
- We will use this data to develop a single model that will provide binary classification predictions for each of the 14 labeled pathologies. 
- In other words it will predict 'positive' or 'negative' for each of the pathologies.
 
This dataset has been annotated by consensus among four different radiologists for 5 of our 14 pathologies:
- `Consolidation`
- `Edema`
- `Effusion`
- `Cardiomegaly`
- `Atelectasis`

In [None]:
train_df_main = pd.read_csv('../train_df.csv')
# valid_df = pd.read_csv("nih/valid-small.csv")
# test_df = pd.read_csv("nih/test.csv")
labels = train_df_main.columns[2:-4]
labels

In [7]:
from sklearn.model_selection import train_test_split
train_df, discard = train_test_split(train_df_main, test_size = 0.7, random_state = 1993)

train_and_valid_set, test_set = train_test_split(train_df, test_size = 0.2, random_state = 1993)
train_set, valid_set = train_test_split(train_and_valid_set, test_size = 0.2, random_state = 1993)

<a name='2-2'></a>
### 2.2 Preparing Images
With our dataset splits ready, we can now proceed with setting up our model to consume them. 
- For this we will use the off-the-shelf [ImageDataGenerator](https://keras.io/preprocessing/image/) class from the Keras framework, which allows us to build a "generator" for images specified in a dataframe. 
- This class also provides support for basic data augmentation such as random horizontal flipping of images.
- We also use the generator to transform the values in each batch so that their mean is $0$ and their standard deviation is 1. 
    - This will facilitate model training by standardizing the input distribution. 
- The generator also converts our single channel X-ray images (gray-scale) to a three-channel format by repeating the values in the image across all channels.
    - We will want this because the pre-trained model that we'll use requires three-channel inputs.

Since it is mainly a matter of reading and understanding Keras documentation, we have implemented the generator for you. There are a few things to note: 
1. We normalize the mean and standard deviation of the data
3. We shuffle the input after each epoch.
4. We set the image size to be **320px by 320px

In [9]:
def get_train_generator(df, image_dir, x_col, y_cols, shuffle=True, batch_size=8, seed=1, target_w = 320, target_h = 320):
    
    print("getting train generator...")
    # normalize images
    image_generator = ImageDataGenerator(
        samplewise_center=True,
        samplewise_std_normalization= True, 
        shear_range=0.1,
        zoom_range=0.15,
        rotation_range=5,
        width_shift_range=0.1,
        height_shift_range=0.05,
        horizontal_flip=True, 
        vertical_flip = False, 
        rescale=1.0/255.0,
        fill_mode = 'reflect')
    
    
    # flow from directory with specified batch size
    # and target image size
    generator = image_generator.flow_from_dataframe(
            dataframe=df,
            directory=None,
            x_col=x_col,
            y_col=y_cols,
            class_mode="raw",
            batch_size=batch_size,
            shuffle=shuffle,
            seed=seed,
            target_size=(target_w,target_h))
    
    return generator



In [10]:
def get_test_and_valid_generator(valid_df, test_df, train_df, image_dir, x_col, y_cols, sample_size=100, batch_size=8, seed=1, target_w = 320, target_h = 320):

    # get generator to sample dataset
    raw_train_generator = ImageDataGenerator().flow_from_dataframe(
        dataframe=train_df, 
        directory=image_dir, 
        x_col="FilePath", 
        y_col=labels, 
        class_mode="raw", 
        batch_size=sample_size, 
        shuffle=True, 
        target_size=(target_w, target_h))
    
    # get data sample
    batch = raw_train_generator.next()
    data_sample = batch[0]

    # use sample to fit mean and std for test set generator
    image_generator = ImageDataGenerator(
        featurewise_center=True,
        featurewise_std_normalization= True)
    
    # fit generator to sample from training data
    image_generator.fit(data_sample)

    # get test generator
    valid_generator = image_generator.flow_from_dataframe(
            dataframe=valid_df,
            directory=image_dir,
            x_col=x_col,
            y_col=y_cols,
            class_mode="raw",
            batch_size=batch_size,
            shuffle=False,
            seed=seed,
            target_size=(target_w,target_h))

    test_generator = image_generator.flow_from_dataframe(
            dataframe=test_df,
            directory=image_dir,
            x_col=x_col,
            y_col=y_cols,
            class_mode="raw",
            batch_size=batch_size,
            shuffle=False,
            seed=seed,
            target_size=(target_w,target_h))
    return valid_generator, test_generator

In [11]:
train_generator = get_train_generator(df = train_set,
                                      image_dir = None, 
                                      x_col = "FilePath",
                                      y_cols = labels, 
                                      batch_size=BATCH_SIZE,
                                      target_w = IMAGE_SIZE[0], 
                                      target_h = IMAGE_SIZE[1] 
                                      )

valid_generator, test_generator= get_test_and_valid_generator(valid_df = valid_set, 
                                                              test_df = test_set, 
                                                              train_df = train_set,
                                                              image_dir = None, 
                                                              x_col = "FilePath", 
                                                              y_cols = labels,
                                                              batch_size = BATCH_SIZE,
                                                              target_w = IMAGE_SIZE[0], 
                                                              target_h = IMAGE_SIZE[1])

getting train generator...
Found 21476 validated image filenames.
getting train and valid generators...
Found 21476 validated image filenames.
Found 5370 validated image filenames.
Found 6712 validated image filenames.


In [None]:

def get_label(y):
    """
    Returns the appended label list of the given set. 
    
    y(list) the one hot vector list containing the label encoding. 
    """
    ret_labels = []
    i = 0
    for idx in y:
        if idx:
            ret_labels.append(labels[i])
        i += 1
    if not ret_labels:
        return 'No Label'
    else:
        return '|'.join(ret_labels)

#get one batch of images from the imageset    
x, y = train_generator.__getitem__(0)

<a name=''>

In [17]:
# with strategy.scope():
#     dnet121 = DenseNet121(input_shape=(*IMAGE_SIZE, 3),
#                           weights='imagenet',
#                           include_top=False )
#     dnet121.trainable = True

#     model_dnet121 = tf.keras.Sequential([ dnet121, 
#                                          Layers.GlobalAveragePooling2D(), 
#                                          Layers.Dense(len(labels), activation ='sigmoid') ])

#     model_dnet121.compile(optimizer='adam',
#                            loss = get_weighted_loss(pos_weights, neg_weights), 
#                            metrics = ['accuracy'] )`

#     model_dnet121.summary()

# history = model_dnet121.fit_generator(train_generator, 
#                               validation_data=valid_generator,
#                               steps_per_epoch=100, 
#                               validation_steps=25, 
#                               epochs = 3)

In [18]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Conv2D, BatchNormalization, ReLU, Concatenate, GlobalAveragePooling2D, Dense
from tensorflow.keras.models import Model


In [19]:
with strategy.scope():
  
    def conv_block(x, growth_rate):
        x1 = BatchNormalization()(x)
        x1 = ReLU()(x1)
        x1 = Conv2D(filters=growth_rate, kernel_size=(3, 3), padding='same')(x1)
        x = Concatenate()([x, x1])
        return x

    def dense_block(x, num_layers, growth_rate):
        for _ in range(num_layers):
            x = conv_block(x, growth_rate)
        return x

    def transition_block(x, reduction):
        x = BatchNormalization()(x)
        x = ReLU()(x)
        x = Conv2D(int(tf.keras.backend.int_shape(x)[-1] * reduction), kernel_size=(1, 1), padding='same')(x)
        x = tf.keras.layers.AveragePooling2D((2, 2), strides=(2, 2))(x)
        return x

    def CustomNet121(input_shape=(224, 224, 3), num_classes=1000, growth_rate=32, num_blocks=[6, 12, 24, 16], reduction=0.5):
        inputs = Input(shape=input_shape)
        x = Conv2D(64, kernel_size=(7, 7), strides=(2, 2), padding='same')(inputs)
        x = BatchNormalization()(x)
        x = ReLU()(x)
        x = tf.keras.layers.MaxPooling2D(pool_size=(3, 3), strides=(2, 2), padding='same')(x)

        num_features = 64
        for i, num_layers in enumerate(num_blocks):
            x = dense_block(x, num_layers, growth_rate)
            num_features += num_layers * growth_rate
            if i != len(num_blocks) - 1:
                x = transition_block(x, reduction)

        x = BatchNormalization()(x)
        x = ReLU()(x)
        x = GlobalAveragePooling2D()(x)
        x = Dense(num_classes, activation='softmax')(x)

        model = Model(inputs, x, name='CustomNet-121')
        return model
    model = CustomNet121(input_shape = (224,224,3),num_classes=len(labels))
from tensorflow.keras.callbacks import ModelCheckpoint

# Define the checkpoint callback
checkpoint = ModelCheckpoint(
    'best_weights.h5',
    monitor='val_accuracy',  # Metric to monitor
    verbose=1,
    save_best_only=True,
    mode='max'  # Mode of the monitored metric (e.g., max for accuracy, min for loss)
)
    
    
model.compile(
    optimizer=tf.keras.optimizers.Adam( learning_rate=1e-4, amsgrad=False), 
    loss = 'binary_crossentropy',
    metrics = ['binary_accuracy']
)
#model.summary()


In [22]:
checkpoint = ModelCheckpoint(
    monitor='val_accuracy',  # Metric to monitor
    verbose=1,
    save_best_only=True,
    mode='max'  # Mode of the monitored metric (e.g., max for accuracy, min for loss)
)

In [23]:
def build_lrfn(lr_start=0.002, lr_max=0.010, 
               lr_min=0, lr_rampup_epochs=8, 
               lr_sustain_epochs=0, lr_exp_decay=.8):

    def lrfn(epoch):
        if epoch < lr_rampup_epochs:
            lr = (lr_max - lr_start) / lr_rampup_epochs * epoch + lr_start
        elif epoch < lr_rampup_epochs + lr_sustain_epochs:
            lr = lr_max
        else:
            lr = (lr_max - lr_min) *\
                 lr_exp_decay**(epoch - lr_rampup_epochs\
                                - lr_sustain_epochs) + lr_min
        return lr
    return lrfn

lrfn = build_lrfn()
lr_schedule = tf.keras.callbacks.LearningRateScheduler(lrfn, verbose=True)

In [24]:
history = model.fit_generator(train_generator, 
                              validation_data=valid_generator,
                              steps_per_epoch=len(train_generator), 
                              validation_steps=len(valid_generator), 
                              epochs = 40,
                              callbacks=[checkpoint,lr_schedule]
                             )

  history = model.fit_generator(train_generator,



Epoch 1: LearningRateScheduler setting learning rate to 0.002.
Epoch 1/40

Epoch 2: LearningRateScheduler setting learning rate to 0.003.
Epoch 2/40

Epoch 3: LearningRateScheduler setting learning rate to 0.004.
Epoch 3/40

Epoch 4: LearningRateScheduler setting learning rate to 0.005.
Epoch 4/40

Epoch 5: LearningRateScheduler setting learning rate to 0.006.
Epoch 5/40

Epoch 6: LearningRateScheduler setting learning rate to 0.007.
Epoch 6/40

Epoch 7: LearningRateScheduler setting learning rate to 0.008.
Epoch 7/40

Epoch 8: LearningRateScheduler setting learning rate to 0.009000000000000001.
Epoch 8/40

Epoch 9: LearningRateScheduler setting learning rate to 0.01.
Epoch 9/40

Epoch 10: LearningRateScheduler setting learning rate to 0.008.
Epoch 10/40

Epoch 11: LearningRateScheduler setting learning rate to 0.006400000000000001.
Epoch 11/40

Epoch 12: LearningRateScheduler setting learning rate to 0.005120000000000001.
Epoch 12/40

Epoch 13: LearningRateScheduler setting learning 

In [27]:
model.save('chest_xray.h5')


In [28]:
train_df

Unnamed: 0,Image Index,Patient ID,Cardiomegaly,Emphysema,Effusion,Hernia,Infiltration,Mass,Nodule,Atelectasis,Pneumothorax,Pleural_Thickening,Pneumonia,Fibrosis,Edema,Consolidation,FilePath
91774,00022961_007.png,22961,0,0,1,0,0,0,0,0,0,0,0,0,0,0,../input/data/images_010/images/00022961_007.png
47366,00012061_001.png,12061,0,0,0,0,1,0,0,0,0,1,0,0,0,0,../input/data/images_006/images/00012061_001.png
22850,00006049_001.png,6049,0,0,0,0,0,0,0,0,0,0,0,0,0,0,../input/data/images_003/images/00006049_001.png
8739,00002312_004.png,2312,0,0,0,0,0,0,0,0,0,0,0,0,0,0,../input/data/images_002/images/00002312_004.png
43397,00011236_000.png,11236,0,0,0,0,1,0,0,1,0,0,0,0,0,0,../input/data/images_005/images/00011236_000.png
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23837,00006302_000.png,6302,0,0,0,0,0,0,0,0,0,0,0,0,0,0,../input/data/images_003/images/00006302_000.png
41185,00010695_010.png,10695,0,1,0,0,0,0,0,0,0,0,0,0,0,0,../input/data/images_005/images/00010695_010.png
87985,00021824_000.png,21824,0,0,0,0,0,0,0,0,0,0,0,0,0,0,../input/data/images_010/images/00021824_000.png
46777,00011966_007.png,11966,0,0,0,0,0,0,0,0,0,0,1,0,0,0,../input/data/images_006/images/00011966_007.png
