## What are we predicting?

In this competition, you’ll detect the presence and position of catheters and lines on chest x-rays. Use machine learning to train and test your model on 40,000 images to categorize a tube that is poorly placed.

## Evaluation criteria?

Submissions are evaluated on area under the ROC curve between the predicted probability and the observed target.
To calculate the final score, AUC is computed separately for each of the 11 label columns; the score is then the average of these individual per-column AUCs.

## Train vs Test?

This is a code-only competition, so there is also a hidden test set (approximately 4x larger, with ~14k images).

train.csv contains image IDs, binary labels, and patient IDs.

TFRecords are available for both train and test. (They are also available for the hidden test set.)

train_annotations.csv contains segmentation annotations for the training samples that have them; these are provided solely as additional information.

## Similar Dataset & Competitions?

[RSNA Pneumonia Detection Challenge](https://www.kaggle.com/c/rsna-pneumonia-detection-challenge/notebooks)

[SIIM-ACR Pneumothorax Segmentation](https://www.kaggle.com/c/siim-acr-pneumothorax-segmentation)

[XRay Lung Segmentation](https://www.kaggle.com/c/xray-lung-segmentation/data)

[RSNA Pneumonia Detection Challenge](https://www.kaggle.com/c/rsna-pneumonia-detection-challenge/notebooks)

Have more?

In [None]:
import tensorflow as tf
from tensorflow.keras.applications.densenet import DenseNet121
from tensorflow.keras.applications.inception_v3 import InceptionV3
from tensorflow.keras.applications.xception import Xception
from tensorflow.keras.applications.nasnet import NASNetMobile, NASNetLarge
from tensorflow.keras.applications.inception_resnet_v2 import InceptionResNetV2

# base_model = InceptionResNetV2(include_top=False, weights='imagenet', input_shape=(256, 256, 3))
# x = base_model.output
# x = tf.keras.layers.GlobalAveragePooling2D()(x)
# output = tf.keras.layers.Dense(len(labels), activation="sigmoid")(x)
# model = tf.keras.Model(base_model.input, output)
# model.compile(optimizer=tf.keras.optimizers.RMSprop(), loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
import os

# import efficientnet.tfkeras as efn
import numpy as np
import pandas as pd
from kaggle_datasets import KaggleDatasets
from sklearn.model_selection import train_test_split
import tensorflow as tf

## Helper functions and configurations

The following functions are defined below (unhide to see):
```python
auto_select_accelerator()

build_decoder(with_labels=True, target_size=(256, 256), ext='jpg')

build_augmenter(with_labels=True)

build_dataset(paths, labels=None, bsize=32, cache=True,
              decode_fn=None, augment_fn=None,
              augment=True, repeat=True, shuffle=1024, 
              cache_dir="")
```

In [None]:
def auto_select_accelerator():
    """Choose a ``tf.distribute`` strategy, preferring a TPU when one resolves.

    The TPU cluster is resolved, connected to and initialised, and a TPU
    strategy is built from it. If any of those steps raises ``ValueError``,
    the strategy returned by ``tf.distribute.get_strategy()`` is used
    instead. The replica count of the chosen strategy is printed either way.

    Returns:
        The selected distribution strategy.
    """
    try:
        resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
        tf.config.experimental_connect_to_cluster(resolver)
        tf.tpu.experimental.initialize_tpu_system(resolver)
        strategy = tf.distribute.experimental.TPUStrategy(resolver)
        print("Running on TPU:", resolver.master())
    except ValueError:
        # TPU setup raised ValueError: fall back to the current default strategy.
        strategy = tf.distribute.get_strategy()

    replica_count = strategy.num_replicas_in_sync
    print(f"Running on {replica_count} replicas")

    return strategy


def build_decoder(with_labels=True, target_size=(256, 256), ext='jpg'):
    """Build a function that reads an image file and decodes it to a tensor.

    The returned function reads the file at ``path``, decodes it as a
    3-channel PNG or JPEG, casts it to ``float32``, divides by 255 and
    resizes it to ``target_size``.

    Args:
        with_labels: When true, the returned function takes ``(path, label)``
            and returns ``(image, label)``; otherwise it takes ``path`` only
            and returns the image.
        target_size: Size passed to ``tf.image.resize``.
        ext: Image format, one of ``'png'``, ``'jpg'`` or ``'jpeg'``.

    Returns:
        The decoding function described above.

    Raises:
        ValueError: If ``ext`` is not a supported extension. The check runs
            when the decoder is built, so a bad extension is reported here
            rather than later while the function is mapped over a dataset.
    """
    if ext not in ('png', 'jpg', 'jpeg'):
        raise ValueError("Image extension not supported")

    def decode(path):
        file_bytes = tf.io.read_file(path)
        if ext == 'png':
            img = tf.image.decode_png(file_bytes, channels=3)
        else:
            # Only 'jpg' / 'jpeg' can reach this branch after the check above.
            img = tf.image.decode_jpeg(file_bytes, channels=3)

        # Scale pixel values by 1/255 before resizing.
        img = tf.cast(img, tf.float32) / 255.0
        img = tf.image.resize(img, target_size)

        return img

    def decode_with_labels(path, label):
        # The label is passed through untouched.
        return decode(path), label

    return decode_with_labels if with_labels else decode


def build_augmenter(with_labels=True):
    """Build a function that randomly flips an image.

    The returned function applies ``tf.image.random_flip_left_right``
    followed by ``tf.image.random_flip_up_down``.

    Args:
        with_labels: When true, the returned function takes ``(img, label)``
            and returns ``(augmented_img, label)``; otherwise it takes and
            returns the image only.

    Returns:
        The augmentation function described above.
    """
    def flip_randomly(img):
        mirrored = tf.image.random_flip_left_right(img)
        return tf.image.random_flip_up_down(mirrored)

    def flip_randomly_keep_label(img, label):
        # Only the image is altered; the label is forwarded as-is.
        return flip_randomly(img), label

    if with_labels:
        return flip_randomly_keep_label
    return flip_randomly


def build_dataset(paths, labels=None, bsize=32, cache=True,
                  decode_fn=None, augment_fn=None,
                  augment=True, repeat=True, shuffle=1024,
                  cache_dir=""):
    """Assemble a batched, prefetched ``tf.data.Dataset`` from image paths.

    Pipeline order: slice -> decode -> cache -> augment -> repeat ->
    shuffle -> batch -> prefetch. Each optional stage is skipped when its
    flag is falsy.

    Args:
        paths: Image paths to slice into the dataset.
        labels: Optional labels sliced alongside ``paths``. When ``None`` the
            dataset yields images only.
        bsize: Batch size passed to ``Dataset.batch``.
        cache: When truthy, cache the decoded elements.
        decode_fn: Function mapped over the slices to load images. Defaults
            to ``build_decoder(labels is not None)``.
        augment_fn: Function mapped after caching. Defaults to
            ``build_augmenter(labels is not None)``.
        augment: When truthy, apply ``augment_fn``.
        repeat: When truthy, repeat the dataset indefinitely.
        shuffle: Shuffle buffer size; a falsy value disables shuffling.
        cache_dir: Location passed to ``Dataset.cache``. When non-empty and
            caching is enabled, it is created if missing.

    Returns:
        The assembled dataset.
    """
    # Use the same truthiness test as the caching step below, so that any
    # value of `cache` that enables caching also creates the directory.
    if cache and cache_dir != "":
        os.makedirs(cache_dir, exist_ok=True)

    has_labels = labels is not None

    if decode_fn is None:
        decode_fn = build_decoder(has_labels)

    if augment_fn is None:
        augment_fn = build_augmenter(has_labels)

    AUTO = tf.data.experimental.AUTOTUNE
    slices = (paths, labels) if has_labels else paths

    dset = tf.data.Dataset.from_tensor_slices(slices)
    dset = dset.map(decode_fn, num_parallel_calls=AUTO)
    if cache:
        # Cached before augmentation so the random flips are not frozen
        # into the cached elements.
        dset = dset.cache(cache_dir)
    if augment:
        dset = dset.map(augment_fn, num_parallel_calls=AUTO)
    if repeat:
        dset = dset.repeat()
    if shuffle:
        dset = dset.shuffle(shuffle)

    return dset.batch(bsize).prefetch(AUTO)

In [None]:
# Kaggle competition slug, used for the GCS lookup on the last line of this cell.
COMPETITION_NAME = "ranzcr-clip-catheter-line-classification"
# TPU strategy when TPU setup succeeds, otherwise the current default strategy.
strategy = auto_select_accelerator()
# Batch size scales with the number of replicas (16 per replica).
BATCH_SIZE = strategy.num_replicas_in_sync * 16
# GCS location of the competition data as reported by kaggle_datasets.
GCS_DS_PATH = KaggleDatasets().get_gcs_path(COMPETITION_NAME)

## Preparing dataset

In [None]:
# Both CSVs are read from the local Kaggle input mount.
load_dir = f"/kaggle/input/{COMPETITION_NAME}/"
df = pd.read_csv(load_dir + 'train.csv')
sub_df = pd.read_csv(load_dir + 'sample_submission.csv')

# Image paths point at the GCS copy of the data. For local files, swap the
# GCS_DS_PATH + "/train/" (or "/test/") prefix for load_dir + "train/"
# (or "test/").
paths = GCS_DS_PATH + "/train/" + df['StudyInstanceUID'] + '.jpg'
test_paths = GCS_DS_PATH + "/test/" + sub_df['StudyInstanceUID'] + '.jpg'

# Multi-label targets: every submission column after the first one.
label_cols = sub_df.columns[1:]
labels = df[label_cols].values

In [None]:
# Hold out 20% of the training images for validation (fixed seed).
# NOTE(review): the split is per image; train.csv is described as also
# holding patient IDs, so one patient's images may land in both subsets.
# Confirm whether a grouped split is wanted.
train_paths, valid_paths, train_labels, valid_labels = train_test_split(
    paths,
    labels,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Candidate square image sizes; index 6 (528 px) is the one used below.
# NOTE(review): these values look like the EfficientNet B0-B7 input sizes,
# presumably left over from an EfficientNet variant. Confirm.
IMSIZE = (224, 240, 260, 300, 380, 456, 528, 600)

decoder = build_decoder(
    with_labels=True,
    target_size=(IMSIZE[6], IMSIZE[6]),
)
test_decoder = build_decoder(
    with_labels=False,
    target_size=(IMSIZE[6], IMSIZE[6]),
)

# Training stream: keeps the build_dataset defaults for cache/augment/repeat/shuffle.
train_dataset = build_dataset(
    paths=train_paths,
    labels=train_labels,
    bsize=BATCH_SIZE,
    decode_fn=decoder,
)

# Validation stream: single pass, fixed order, no augmentation.
valid_dataset = build_dataset(
    paths=valid_paths,
    labels=valid_labels,
    bsize=BATCH_SIZE,
    decode_fn=decoder,
    augment=False,
    repeat=False,
    shuffle=False,
)

# Test stream: unlabeled, not cached, single pass, fixed order, no augmentation.
test_dataset = build_dataset(
    paths=test_paths,
    bsize=BATCH_SIZE,
    cache=False,
    decode_fn=test_decoder,
    augment=False,
    repeat=False,
    shuffle=False,
)

## Modeling

In [None]:
# One sigmoid output per label column.
n_labels = labels.shape[1]

# Build and compile inside the strategy scope.
with strategy.scope():
    model = tf.keras.Sequential([
        # ImageNet-pretrained backbone without its classification head.
        InceptionResNetV2(
            input_shape=(IMSIZE[6], IMSIZE[6], 3),
            weights='imagenet',
            include_top=False),
        tf.keras.layers.GlobalAveragePooling2D(),
        tf.keras.layers.Dense(n_labels, activation='sigmoid')
    ])
    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        # The metric name is pinned so the validation metric is always
        # reported as 'val_auc'. Without an explicit name, re-running this
        # cell in the same session produces an auto-suffixed name (e.g.
        # 'auc_1'), and callbacks monitoring 'val_auc' no longer find it.
        metrics=[tf.keras.metrics.AUC(multi_label=True, name='auc')])
    model.summary()

In [None]:
# Number of full batches in one pass over the training paths.
steps_per_epoch = len(train_paths) // BATCH_SIZE

# Keep only the weights with the highest validation AUC seen so far.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath='model.h5',
    monitor='val_auc',
    mode='max',
    save_best_only=True,
)

# Reduce the learning rate after 3 epochs without a validation AUC
# improvement, never going below 1e-6.
lr_reducer = tf.keras.callbacks.ReduceLROnPlateau(
    monitor="val_auc",
    mode='max',
    patience=3,
    min_lr=1e-6,
)

In [None]:
# steps_per_epoch is passed explicitly to set the length of each epoch.
# Validation runs on valid_dataset after each epoch.
history = model.fit(
    x=train_dataset,
    validation_data=valid_dataset,
    steps_per_epoch=steps_per_epoch,
    epochs=30,
    callbacks=[checkpoint, lr_reducer],
    verbose=2,
)

## Submission

In [None]:
# Persist the per-epoch training history (one column per logged quantity).
hist_df = pd.DataFrame(data=history.history)
hist_df.to_csv(path_or_buf='history.csv')