In [5]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [6]:
import re 
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from kaggle_datasets import KaggleDatasets
from sklearn.model_selection import train_test_split

In [7]:
# Connect to a TPU when one can be resolved; otherwise fall back to the default
# tf.distribute strategy so the remaining cells run unchanged.
# NOTE(review): tf.distribute.experimental.TPUStrategy is the older spelling of
# tf.distribute.TPUStrategy — confirm the installed TF version before switching.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Device:', tpu.master())
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
except Exception as tpu_error:
    # Catch Exception instead of a bare `except:` so KeyboardInterrupt/SystemExit
    # still propagate, and report why the TPU path was skipped instead of hiding it.
    print('TPU not available, using default strategy:', tpu_error)
    strategy = tf.distribute.get_strategy()
print('Number of replicas:', strategy.num_replicas_in_sync)

print(tf.__version__)

In [8]:
# Notebook-wide settings read by the cells below.
A = tf.data.experimental.AUTOTUNE          # passed to the tf.data map/prefetch calls
GP = KaggleDatasets().get_gcs_path()       # root path under which chest_xray/* is globbed
BS = strategy.num_replicas_in_sync * 16    # batch size: 16 times the replica count
IS = [180, 180]                            # target size handed to tf.image.resize
E = 25                                     # number of epochs passed to model.fit

### Load the data

In [9]:
# Pool every file path from the train and val folders into one list.
fn = tf.io.gfile.glob(str(GP + '/chest_xray/train/*/*'))
fn.extend(tf.io.gfile.glob(str(GP + '/chest_xray/val/*/*')))

# Re-split the pooled paths 80/20. A fixed random_state keeps the split (and
# therefore the class counts, class weights and metrics below) reproducible
# across "Restart & Run All".
rf, vf = train_test_split(fn, test_size = 0.2, random_state = 42)

In [29]:
# Count training paths per class via a substring match on each path.
cn = sum(1 for path in rf if 'NORMAL' in path)
cp = sum(1 for path in rf if 'PNEUMONIA' in path)

print('Normal image count : ',cn)
print('Pneumonia image count : ',cp)

In [11]:
# Wrap the training / validation path lists as tf.data datasets of strings.
rl, vl = (tf.data.Dataset.from_tensor_slices(paths) for paths in (rf, vf))

# Peek at a few training paths as a sanity check.
for sample_path in rl.take(5):
    print(sample_path.numpy())

In [12]:
# Number of elements in each path dataset; used later for class weights and steps per epoch.
rc, vc = (tf.data.experimental.cardinality(paths).numpy() for paths in (rl, vl))

print('Training images count : ',rc)
print('Validation images count : ',vc)

In [30]:
# Class names = last path component of each folder directly under chest_xray/train.
# Decode the bytes returned by .numpy() rather than slicing the repr of the
# bytes object (str(b'...')[2:-1]), which mangles any non-ASCII or escaped character.
cna = np.array([tf.strings.split(i, os.path.sep)[-1].numpy().decode('utf-8')
                for i in tf.io.gfile.glob(str(GP+'/chest_xray/train/*'))])
cna

In [14]:
def get_label(fp):
    """Return a boolean tensor: True when the second-to-last path component is 'PNEUMONIA'.

    Args:
        fp: string tensor holding a file path, split on ``os.path.sep``.
    """
    parent_folder = tf.strings.split(fp, os.path.sep)[-2]
    is_pneumonia = parent_folder == 'PNEUMONIA'
    return is_pneumonia

In [15]:
def decode_img(i):
    """Decode JPEG bytes into a 3-channel float32 image resized to ``IS``.

    Args:
        i: scalar string tensor with the encoded JPEG contents.

    Returns:
        The result of ``tf.image.resize`` applied with the notebook-level size ``IS``.
    """
    decoded = tf.image.decode_jpeg(i, channels=3)
    as_float = tf.image.convert_image_dtype(decoded, tf.float32)
    return tf.image.resize(as_float, IS)

In [16]:
def process_path(f):
    """Turn a file path into an ``(image, label)`` pair.

    The label comes from ``get_label`` and the image from ``decode_img``
    applied to the file's raw contents.
    """
    label = get_label(f)
    image = decode_img(tf.io.read_file(f))
    return image, label

In [17]:
# Map every path to an (image, label) pair, letting tf.data pick the parallelism.
rd, vd = (
    rl.map(process_path, num_parallel_calls=A),
    vl.map(process_path, num_parallel_calls=A),
)

In [18]:
# Sanity check: print the shape and label of a few mapped training examples.
for sample_image, sample_label in rd.take(3):
    print('Image shape : ',sample_image.numpy().shape)
    print('Label : ',sample_label.numpy())

In [41]:
# Build the batched evaluation dataset from the files under chest_xray/test.
el = tf.data.Dataset.list_files(str(GP + '/chest_xray/test/*/*'))
ec = tf.data.experimental.cardinality(el).numpy()
ed = el.map(process_path, num_parallel_calls=A).batch(BS)

# Display the number of test files.
ec

### Visualize the dataset

In [20]:
def prepare_for_training(ds, cache = True, sbs = 1000):
    """Cache, shuffle, repeat, batch and prefetch a dataset for ``model.fit``.

    Args:
        ds: the ``tf.data`` dataset to wrap.
        cache: a string is forwarded to ``ds.cache(...)`` as its argument;
            any other truthy value calls ``ds.cache()`` with no argument;
            a falsy value skips caching.
        sbs: shuffle buffer size.

    Returns:
        The dataset after ``shuffle -> repeat -> batch(BS) -> prefetch(A)``.
        ``repeat()`` is called without a count, so callers must bound
        iteration themselves (e.g. with ``steps_per_epoch``).
    """
    if cache:
        ds = ds.cache(cache) if isinstance(cache, str) else ds.cache()

    return (
        ds.shuffle(buffer_size=sbs)
        .repeat()
        .batch(BS)
        .prefetch(buffer_size=A)
    )

In [21]:
# Wrap both datasets with the cache/shuffle/repeat/batch/prefetch pipeline.
# NOTE: rd and vd are rebound from themselves, so this cell is not idempotent —
# running it a second time feeds the already-batched datasets back through
# prepare_for_training. Re-run the cell that first creates rd/vd before re-running this one.
rd = prepare_for_training(rd)
vd = prepare_for_training(vd)

# Pull one batch of images and labels for the visualisation cells below.
ib, lb = next(iter(rd))

In [22]:
def show_batch(ib, lb):
    """Plot up to the first 15 images of a batch, titled with their class.

    Args:
        ib: indexable batch of images, each accepted by ``plt.imshow``.
        lb: indexable batch of labels; a truthy label is titled 'PNEUMONIA',
            a falsy one 'NORMAL'.
    """
    plt.figure(figsize = (10,10))
    # Cap the count at the batch length so a batch with fewer than 15 images
    # no longer raises IndexError.
    for n in range(min(15, len(ib), len(lb))):
        plt.subplot(5,5,n+1)
        plt.imshow(ib[n])
        plt.title('PNEUMONIA' if lb[n] else 'NORMAL')
        plt.axis('off')

In [23]:
# Convert the sampled batch tensors to NumPy arrays before plotting them.
show_batch(ib.numpy(),lb.numpy())

### Build the CNN

In [24]:
def conv_block(f):
    """Return a Sequential block: two separable 3x3 convs, batch norm, max pool.

    Args:
        f: number of filters for both separable convolutions.
    """
    layers = tf.keras.layers
    # Layers are created in the same order as they are stacked.
    separable_pair = [
        layers.SeparableConvolution2D(f, 3, activation='relu', padding='same')
        for _ in range(2)
    ]
    return tf.keras.Sequential(
        separable_pair + [layers.BatchNormalization(), layers.MaxPool2D()]
    )

In [25]:
def dense_block(u,dr):
    """Return a Sequential block: Dense(relu) -> BatchNormalization -> Dropout.

    Args:
        u: number of units in the Dense layer.
        dr: rate passed to the Dropout layer.
    """
    layers = tf.keras.layers
    return tf.keras.Sequential([
        layers.Dense(u, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(dr),
    ])

In [26]:
def build_model():
    """Assemble the (uncompiled) CNN used for binary classification.

    The input shape is ``(IS[0], IS[1], 3)``; the output is a single
    sigmoid unit. Layers are instantiated in stacking order before the outer
    Sequential is created.
    """
    layers = tf.keras.layers

    # Plain convolutional stem.
    stem = [
        tf.keras.Input(shape=(IS[0], IS[1], 3)),
        layers.Conv2D(16, 3, activation='relu', padding='same'),
        layers.Conv2D(16, 3, activation='relu', padding='same'),
        layers.MaxPool2D(),
    ]

    # Separable-conv blocks with growing filter counts; dropout after the last two.
    feature_blocks = [
        conv_block(32),
        conv_block(64),
        conv_block(128),
        layers.Dropout(0.2),
        conv_block(256),
        layers.Dropout(0.2),
    ]

    # Fully connected head ending in one sigmoid output.
    head = [
        layers.Flatten(),
        dense_block(512, 0.7),
        dense_block(128, 0.5),
        dense_block(64, 0.3),
        layers.Dense(1, activation='sigmoid'),
    ]

    return tf.keras.Sequential(stem + feature_blocks + head)

### Correct the data imbalance

In [31]:
# Natural log of the pneumonia-to-normal count ratio in the training split.
# NOTE(review): this value is only displayed here; no other cell visible in this
# notebook reads initial_bias (build_model does not take it) — confirm whether
# it was meant to initialise the output layer's bias.
initial_bias = np.log([cp / cn])
initial_bias

In [32]:
# Per-class weights: (1 / class count) * (total training count) / 2, so the
# rarer class receives the larger weight.
w0, w1 = ((1/count)*(rc)/2.0 for count in (cn, cp))

# Keyed by class id: 0 pairs with the NORMAL count, 1 with the PNEUMONIA count.
cw = {0: w0, 1: w1}

for template, weight in (('weight for class 0 : {:.2f}', w0),
                         ('weight for class 1 : {:.2f}', w1)):
    print(template.format(weight))

### Train the model

In [33]:
# Create and compile the model inside the distribution strategy's scope.
with strategy.scope():
    model = build_model()

    # Metrics reported during fit/evaluate, in this order after the loss.
    M = [
        'accuracy',
        tf.keras.metrics.Precision(name='precision'),
        tf.keras.metrics.Recall(name='recall'),
    ]

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=M)

model.summary()

### Fine-tune the model

In [34]:
# Checkpoint callback: writes to 'xray_model.h5', keeping only the best model seen
# (save_best_only=True). No `monitor` is passed, so the Keras default is used.
ccb = tf.keras.callbacks.ModelCheckpoint('xray_model.h5', save_best_only = True)
# Early stopping with a patience of 10 epochs; restores the best weights when it
# stops. No `monitor` is passed here either, so the Keras default is used.
esc = tf.keras.callbacks.EarlyStopping(patience = 10, restore_best_weights = True)

In [43]:
import tensorflow as tf

def edi(lr,s):
    """Build an exponential-decay schedule function.

    Args:
        lr: value returned at epoch 0.
        s: number of epochs over which the value shrinks by a factor of 10.

    Returns:
        A function ``edf(e)`` computing ``lr * 0.1 ** (e / s)``.
    """
    def edf(e):
        decades_elapsed = e / s
        return lr * 0.1 ** decades_elapsed
    return edf

# Schedule: edf(e) = 0.01 * 0.1 ** (e / 20), i.e. 0.01 at epoch 0, shrinking
# tenfold every 20 epochs.
edf = edi(0.01,20)
# Callback that calls edf with the epoch index to set the learning rate.
ls = tf.keras.callbacks.LearningRateScheduler(edf)

In [37]:
# rd and vd were built with repeat(), so the number of batches per epoch must be
# given explicitly: one pass over each split at batch size BS.
history = model.fit(
    rd,
    epochs=E,
    steps_per_epoch=rc // BS,
    validation_data=vd,
    validation_steps=vc // BS,
    class_weight=cw,
    callbacks=[ccb, esc, ls],
)

### Data Visualization

In [38]:
# One panel per tracked quantity, each showing the training and validation curves.
fig, ax = plt.subplots(1, 4, figsize=(20, 3))
ax = ax.ravel()

for panel, metric_name in zip(ax, ['precision', 'recall', 'accuracy', 'loss']):
    panel.plot(history.history[metric_name])
    panel.plot(history.history['val_' + metric_name])
    panel.set_title('Model {}'.format(metric_name))
    panel.set_xlabel('epochs')
    panel.set_ylabel(metric_name)
    panel.legend(['train', 'val'])

### Predict and evaluate results

In [44]:
# Evaluate on the batched test dataset. The unpacking assumes the returned values
# follow the compile order: loss, then 'accuracy', 'precision', 'recall'.
loss, acc, prec, rec = model.evaluate(ed)

In [45]:
# Report each test-set value rounded to two decimals.
for template, value in (('loss : {:.2f}', loss),
                        ('accuracy : {:.2f}', acc),
                        ('precision : {:.2f}', prec),
                        ('recall : {:.2f}', rec)):
    print(template.format(value))