## Defining configuration

In [1]:
import tarfile
from google.colab import drive

drive.mount('/content/drive')
tar = tarfile.open("/content/drive/My Drive/Colab Notebooks/datasets/data-covid-py.tar.gz", "r")
tar.extractall('/content')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
import os
config = {
  'PATHS': {
    'RAW_DATA': os.path.join(os.getcwd(), 'data'),
    'COVID_CHEST_XRAY_DATA': os.path.join(os.getcwd(), 'data', 'covid-chestxray-dataset'),
    'CHEST_XRAY_8_DATA': os.path.join(os.getcwd(), 'data', 'ChestX-ray8'),
    'PROCESSED_DATA': os.path.join(os.getcwd(), 'data', 'processed'),
    'TRAIN_SET': os.path.join(os.getcwd(), 'data', 'processed', 'train_set.csv'),
    'VAL_SET': os.path.join(os.getcwd(), 'data', 'processed', 'val_set.csv'),
    'TEST_SET': os.path.join(os.getcwd(), 'data', 'processed', 'test_set.csv'),
    'IMAGES': os.path.join(os.getcwd(), 'data', 'documents', 'generated_images'),
    'LOGS': os.path.join(os.getcwd(), 'results', 'logs'),
    'MODEL_WEIGHTS': os.path.join(os.getcwd(), 'results', 'models'),
    #'MODEL_TO_LOAD': os.path.join(os.getcwd(), 'results', 'models', '.h5'),
    'OUTPUT_CLASS_INDICES': os.path.join(os.getcwd(), 'data', 'interpretability', 'output_class_indices.pkl')
  },
  'DATA': {
    'IMG_DIM': [
      224,
      224
    ],
    'VIEW': 'PA',
    'VAL_SPLIT_PERCENT': 0.08,
    'TEST_SPLIT_PERCENT': 0.1,
    'NUM_CHEST_XRAY_8_IMAGES': 1000,
    'CLASSES': [
      'COVID-19',
      'OTHER'
    ]
  },
  'TRAIN': {
    'BATCH_SIZE': 32,
    'EPOCHS': 200,
    'THRESHOLDS': 0.5,
    'PATIENCE_FOR_EARLY_STOPPING': 7,
    'METRIC_PREFERENCE': [
      'auc',
      'recall',
      'precision',
      'loss'
    ],
    'NUM_RUNS': 10,
    'NUM_GPUS': 0
  },
  'NN': {
    'KERNEL_SIZE': '(3,3)',
    'STRIDES': '(1,1)',
    'INIT_FILTERS': 16,
    'FILTER_EXP_BASE': 3,
    'MAXPOOL_SIZE': '(2,2)',
    'CONV_BLOCKS': 3,
    'NODES_DENSE0': 128,
    'LR': 1e-05,
    'OPTIMIZER': 'adam',
    'DROPOUT': 0.4,
    'L2_LAMBDA': 0.0001
  }
}

## Preprocessing
### Importing dependencies

In [0]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from pathlib import Path
import re

for path in config['PATHS']:
    if not bool(re.match('^.*\.[a-zA-Z0-9]{1,4}$', config['PATHS'][path])):
        Path(config['PATHS'][path]).mkdir(parents=True, exist_ok=True)
    else:
        splitted_path = config['PATHS'][path][:config['PATHS'][path].rfind('/')]
        Path(splitted_path).mkdir(parents=True, exist_ok=True)

In [0]:
covid_chest_xray_path = config['PATHS']['COVID_CHEST_XRAY_DATA']
chest_xray_8_path = config['PATHS']['CHEST_XRAY_8_DATA']

covid_chest_xray_df = pd.read_csv(os.path.join(covid_chest_xray_path, 'metadata.csv'))
covid_chest_xray_df['filename'] = [os.path.join(covid_chest_xray_path, 'images', row) for row in covid_chest_xray_df['filename'].astype(str)]

covid_views_cxrs_df = covid_chest_xray_df['view'].str.match(config['DATA']['VIEW'])
covid_pos_df = covid_chest_xray_df['finding'].str.match('COVID-19')
covid_df = covid_chest_xray_df[covid_pos_df & covid_views_cxrs_df] 

chest_xray_8_df = pd.read_csv(os.path.join(chest_xray_8_path, 'subset.csv'))
num_chest_xray_8_imgs = config['DATA']['NUM_CHEST_XRAY_8_IMAGES']
chest_xray_8_normal_df = chest_xray_8_df[chest_xray_8_df['Finding Labels'].str.match('No Finding')]
chest_xray_8_pneum_df = chest_xray_8_df[chest_xray_8_df['Finding Labels'].str.match('(?!No Finding)')]

chest_xray_8_normal_sample_df = chest_xray_8_normal_df.sample(frac = num_chest_xray_8_imgs / chest_xray_8_normal_df.shape[0], random_state=num_chest_xray_8_imgs)

chest_xray_8_pneum_sample_df = chest_xray_8_pneum_df.sample(frac = num_chest_xray_8_imgs / chest_xray_8_pneum_df.shape[0], random_state=num_chest_xray_8_imgs)

chest_xray_8_df = pd.concat([chest_xray_8_normal_sample_df, chest_xray_8_pneum_sample_df], axis=0)

chest_xray_8_df['filename'] = [os.path.join(chest_xray_8_path, row) for row in chest_xray_8_df['Image Index'].astype(str)]

In [5]:
covid_df['label'] = 'COVID-19'
chest_xray_8_df['label'] = 'OTHER'

file_df = pd.concat(
        [covid_df[['filename', 'label']],
        chest_xray_8_df[['filename', 'label']]], axis=0)         

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [0]:
validation_split_size = config['DATA']['VAL_SPLIT_PERCENT']
test_split_size = config['DATA']['TEST_SPLIT_PERCENT']
file_df_train, file_df_test = train_test_split(file_df, test_size=test_split_size, stratify=file_df['label'])
relative_validation_split_size = validation_split_size / (1 - test_split_size)
file_df_train, file_df_val = train_test_split(file_df_train, test_size=relative_validation_split_size,
                                                    stratify=file_df_train['label'])

if not os.path.exists(config['PATHS']['PROCESSED_DATA']):
    os.makedirs(config['PATHS']['PROCESSED_DATA'])
file_df_train.to_csv(config['PATHS']['TRAIN_SET'])
file_df_val.to_csv(config['PATHS']['VAL_SET'])
file_df_test.to_csv(config['PATHS']['TEST_SET'])

In [0]:
import pandas as pd
import os
import datetime
import dill
import numpy as np
from math import ceil
from tensorflow.keras.metrics import CategoricalAccuracy, Precision, Recall, AUC
from tensorflow.keras.models import save_model
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import tensorflow_addons as tfa
from tensorflow_addons.metrics import F1Score

In [0]:
cur_date = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
log_dir = os.path.join(config['PATHS']['LOGS'], 'training', cur_date)
if not os.path.exists(os.path.join(config['PATHS']['LOGS'], 'training')):
    os.makedirs(os.path.join(config['PATHS']['LOGS'], 'training'))

data = {}
data['TRAIN'] = pd.read_csv(config['PATHS']['TRAIN_SET'])
data['VAL'] = pd.read_csv(config['PATHS']['VAL_SET'])
data['TEST'] = pd.read_csv(config['PATHS']['TEST_SET'])

early_stopping = EarlyStopping(
    monitor='val_loss',
    verbose=1, 
    patience=config['TRAIN']['PATIENCE_FOR_EARLY_STOPPING'], 
    mode='min', 
    restore_best_weights=False)
callbacks = [early_stopping]

In [34]:
tensorboard = TensorBoard(log_dir=log_dir, histogram_freq=1)
callbacks.append(tensorboard)

train_img_gen = ImageDataGenerator(rotation_range=10, samplewise_std_normalization=True, samplewise_center=True)
val_img_gen = ImageDataGenerator(samplewise_std_normalization=True, samplewise_center=True)
test_img_gen = ImageDataGenerator(samplewise_std_normalization=True, samplewise_center=True)

img_shape = tuple(config['DATA']['IMG_DIM'])

class_mode = 'categorical'
train_generator = train_img_gen.flow_from_dataframe(
    dataframe=data['TRAIN'],
    x_col="filename",
    y_col='label',
    target_size=img_shape,
    batch_size=config['TRAIN']['BATCH_SIZE'],
    class_mode=class_mode,
    validate_filenames=False)
val_generator = val_img_gen.flow_from_dataframe(
    dataframe=data['VAL'],
    x_col="filename",
    y_col='label',
    target_size=img_shape,
    batch_size=config['TRAIN']['BATCH_SIZE'],
    class_mode=class_mode,
    validate_filenames=False)
test_generator = test_img_gen.flow_from_dataframe(
    dataframe=data['TEST'],
    x_col="filename",
    y_col='label',
    target_size=img_shape,
    batch_size=config['TRAIN']['BATCH_SIZE'],
    class_mode=class_mode,
    validate_filenames=False,
    shuffle=False)

dill.dump(test_generator.class_indices, open(config['PATHS']['OUTPUT_CLASS_INDICES'], 'wb+'))

histogram = np.bincount(np.array(train_generator.labels).astype(int))

class_multiplier_list = [min(histogram) / max(histogram)]
class_multiplier_list.insert(int(histogram[0] > histogram[1]), 1.0)

class_multiplier = [
        class_multiplier_list[config['DATA']['CLASSES'].index(c)]
            for c in test_generator.class_indices
]

weights = [(1.0 / len(histogram)) * sum(histogram) / histogram[i] for i in range(len(histogram))]

class_weight = {i: class_multiplier[i] for i in range(len(histogram))}  

Found 1764 non-validated image filenames belonging to 2 classes.
Found 173 non-validated image filenames belonging to 2 classes.
Found 216 non-validated image filenames belonging to 2 classes.


In [29]:
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense, Dropout, Input, LeakyReLU, Activation, GlobalAveragePooling2D, GlobalMaxPooling2D
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.initializers import Constant
from tensorflow.keras.applications import ResNet101V2
from tensorflow.keras.utils import multi_gpu_model

covid_class_idx = test_generator.class_indices['COVID-19']   
thresholds = 1.0 / len(config['DATA']['CLASSES'])
metrics = ['accuracy', CategoricalAccuracy(name='c_accuracy'),
    Precision(name='precision', thresholds=thresholds, class_id=covid_class_idx),
    Recall(name='recall', thresholds=thresholds, class_id=covid_class_idx),
    AUC(name='auc'),
    F1Score(name='f1score', threshold=thresholds, num_classes=len(config['DATA']['CLASSES']))]

input_shape = config['DATA']['IMG_DIM'] + [3]
num_gpus = config['TRAIN']['NUM_GPUS']

model_config = config['NN']

nodes_dense0 = model_config['NODES_DENSE0']
lr = model_config['LR']
dropout = model_config['DROPOUT']
l2_lambda = model_config['L2_LAMBDA']

if model_config['OPTIMIZER'] == 'sgd':
    optimizer = SGD(learning_rate=lr)
else:
    optimizer = Adam(learning_rate=lr)

histogram = np.bincount([config['DATA']['CLASSES'].index(label) for label in data['TRAIN']['label'].astype(str)])
output_bias = np.log([histogram[i] / (np.sum(histogram) - histogram[i]) for i in range(histogram.shape[0])])

# Set output bias
if output_bias is not None:
    output_bias = Constant(output_bias)
print("MODEL CONFIG: ", model_config)

X_input = Input(input_shape, name='input_img')
base_model = ResNet101V2(include_top=False, weights='imagenet', input_shape=input_shape, input_tensor=X_input)
base_model.trainable = False
X = base_model.output

# Add custom top
X = GlobalMaxPooling2D()(X)
X = Dropout(dropout)(X)
X = Dense(nodes_dense0, kernel_initializer='he_uniform', activity_regularizer=l2(l2_lambda))(X)
X = LeakyReLU()(X)
X = Dense(len(config['DATA']['CLASSES']), bias_initializer=output_bias)(X)
Y = Activation('softmax', dtype='float32', name='output')(X)

model = Model(inputs=X_input, outputs=Y)
model.summary()

if num_gpus >= 2:
    model = multi_gpu_model(model, gpus=num_gpus)

model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=metrics)

model_path = os.path.join(config['PATHS']['MODEL_WEIGHTS'], '{}{}{}'.format(model, cur_date, '.h5'))
save_model(model, model_path)

MODEL CONFIG:  {'KERNEL_SIZE': '(3,3)', 'STRIDES': '(1,1)', 'INIT_FILTERS': 16, 'FILTER_EXP_BASE': 3, 'MAXPOOL_SIZE': '(2,2)', 'CONV_BLOCKS': 3, 'NODES_DENSE0': 128, 'LR': 1e-05, 'OPTIMIZER': 'adam', 'DROPOUT': 0.4, 'L2_LAMBDA': 0.0001}
Model: "model_8"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_img (InputLayer)          [(None, 512, 512, 3) 0                                            
__________________________________________________________________________________________________
conv1_pad (ZeroPadding2D)       (None, 518, 518, 3)  0           input_img[0][0]                  
__________________________________________________________________________________________________
conv1_conv (Conv2D)             (None, 256, 256, 64) 9472        conv1_pad[0][0]                  
_____________________________________________________

In [30]:
steps_per_epoch = ceil(train_generator.n / train_generator.batch_size)
val_steps = ceil(val_generator.n / val_generator.batch_size)
history = model.fit(train_generator, steps_per_epoch=steps_per_epoch, epochs=config['TRAIN']['EPOCHS'],
                                validation_data=val_generator, validation_steps=val_steps, callbacks=callbacks,
                                verbose=True, class_weight=class_weight)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 00018: early stopping


In [0]:
test_results = model.evaluate_generator(test_generator, verbose=1)
test_metrics = {}
test_summary_str = [['**Metric**', '**Value**']]
for metric, value in zip(model.metrics_names, test_results):
    test_metrics[metric] = value
    print(metric, ' = ', value)
    test_summary_str.append([metric, str(value)])