This kernel is based primarily on:

- Competition: https://www.kaggle.com/c/siim-covid19-detection/overview
- Starter kernel: https://www.kaggle.com/ayuraj/train-covid-19-detection-using-yolov5
- Primary dataset: https://www.kaggle.com/c/siim-covid19-detection/data
- Resized dataset: https://www.kaggle.com/xhlulu/siim-covid19-resized-to-1024px-jpg (others [here](https://www.kaggle.com/c/siim-covid19-detection/discussion/239918))
- Detection model: YOLOv5 - https://github.com/ultralytics/yolov5
- Tracking: Weights and Biases (integrated with YOLOv5)

Notes:

- RUN THE EXPERIMENT ON A GPU INSTANCE! This is going to take a sweet, sweet while either way.

# Setup and Variables

In [None]:
# WANT_WANDB is derived from RUN_ON_ORIGINAL in the imports cell further down.
# Are we training on the original dataset (and thus need to resize it, etc)?
RUN_ON_ORIGINAL = False

# Run only inference tasks? (You still need the separate notebook for now.)
INFERENCE_ONLY = False

# Kaggle input locations of the YOLOv5 checkout and of the competition data.
YOLOV5_REPO = '/kaggle/input/ultralyticsyolov5a'
KAGGLE_DATASET = '/kaggle/input/siim-covid19-detection'

# Training settings; these are interpolated into the train.py command line.
PROJECT_NAME = 'kaggle-siim-covid'
WEIGHTS_FILE = 'yolov5s.pt'
IMG_SIZE = 1024
BATCH_SIZE = 16
EPOCHS = 10

# Pick the data source to train on:
# locally converted originals live in /kaggle/tmp, otherwise the pre-resized
# dataset whose name matches IMG_SIZE is used.
KAGGLE_RESIZED = '/kaggle/tmp' if RUN_ON_ORIGINAL else f'/kaggle/input/siim-covid19-resized-to-{IMG_SIZE}px-jpg'
TRAIN_PATH = KAGGLE_RESIZED + '/train/'

In [None]:
# All copies below land in the notebook's working directory.
%cd /kaggle/working

# GDCM:
# Installed offline from an attached dataset.
# NOTE(review): presumably needed so pydicom can decode compressed pixel data - confirm.
!cp /kaggle/input/gdcm-conda-install/gdcm.tar .
!tar -xvzf gdcm.tar
!conda install --offline ./gdcm/gdcm-2.8.9-py37h71b2a6d_0.tar.bz2

# YOLOv5:
# Copy the repository into a local 'yolov5' directory, then drop the pretrained weights into it.
!cp -r /kaggle/input/ultralyticsyolov5a yolov5
!cp /kaggle/input/ultralyticsyolov5aweights/* yolov5/

# If you want to supply your own pretrained model, dump it into a
# dataset and copy it over to the YOLOv5 directory here.
#!cp /kaggle/input/some-stuff/best.pt yolov5/

# Are we training on the original dataset (and thus need to resize it, etc)?
# NOTE(review): no visible cell reads TRAIN_ON_ORIGINAL; the flag that is
# actually consulted elsewhere in this notebook is RUN_ON_ORIGINAL.
TRAIN_ON_ORIGINAL = False

In [None]:
import os, pathlib

# If we're training on original data, we don't want to save it into W&B for now.
WANT_WANDB = not RUN_ON_ORIGINAL
if WANT_WANDB:
    from kaggle_secrets import UserSecretsClient

    # Build the secrets client once and reuse it for the lookup (the original
    # created a client, ignored it, and constructed a second one).
    user_secrets = UserSecretsClient()
    # Expose the Kaggle secret "WANDB_KEY" under the name W&B looks for;
    # wandb.login() below is expected to read it from the environment.
    os.environ['WANDB_API_KEY'] = user_secrets.get_secret("WANDB_KEY")

    import wandb
    wandb.login()

import gc
import cv2
import numpy as np
import pandas as pd
from PIL import Image
from tqdm.auto import tqdm
from shutil import copyfile
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

import pydicom
from pydicom.pixel_data_handlers.util import apply_voi_lut

import torch
# Report the torch version and the name of CUDA device 0, or 'CPU' when CUDA is unavailable.
print(f"Using torch {torch.__version__} ({torch.cuda.get_device_properties(0).name if torch.cuda.is_available() else 'CPU'})")

In [None]:
# Custom IPython magic modelled on writefile: the cell text is run through
# str.format with the notebook globals before it is written to disk.
from IPython.core.magic import register_line_cell_magic


@register_line_cell_magic
def writetemplate(line, cell):
    """Write ``cell`` to the file named by ``line`` after ``str.format`` substitution.

    Placeholders such as ``{NAME}`` in the cell text are filled from ``globals()``.
    """
    namespace = globals()
    with open(line, 'w') as target:
        target.write(cell.format(**namespace))

# Convert data

Original data is in the DICOM format which is hard to work with. Convert it to something we can use first (if needed).

In [None]:
def read_xray(path, voi_lut=True, fix_monochrome=True):
    """Read a DICOM image and return its pixel data rescaled to ``uint8``.

    Original from: https://www.kaggle.com/raddar/convert-dicom-to-np-array-the-correct-way

    Args:
        path: Path of the DICOM file to read.
        voi_lut: When true, pass the raw pixels through ``apply_voi_lut``.
            VOI LUT (if available by DICOM device) is used to transform raw
            DICOM data to a "human-friendly" view.
        fix_monochrome: When true, invert images whose
            ``PhotometricInterpretation`` is ``"MONOCHROME1"``; depending on
            this value the X-ray may otherwise look inverted.

    Returns:
        A ``numpy`` array of dtype ``uint8`` whose values are stretched so the
        minimum maps to 0 and the maximum to 255. A constant image yields all
        zeros.
    """
    # dcmread is the supported reader; read_file is a deprecated alias of it.
    dicom = pydicom.dcmread(path)

    data = dicom.pixel_array
    if voi_lut:
        data = apply_voi_lut(data, dicom)

    if fix_monochrome and dicom.PhotometricInterpretation == "MONOCHROME1":
        data = np.amax(data) - data

    data = data - np.min(data)
    peak = np.max(data)
    # A constant image has peak == 0 after the shift; dividing would produce
    # NaNs that turn into garbage when cast to uint8, so leave the zeros as is.
    if peak > 0:
        data = data / peak

    return (data * 255).astype(np.uint8)

def resize(array, size, keep_ratio=False, resample=Image.LANCZOS):
    """Turn ``array`` into a PIL image scaled to fit ``size`` pixels.

    Original from: https://www.kaggle.com/xhlulu/vinbigdata-process-and-resize-to-image

    Args:
        array: Pixel array accepted by ``Image.fromarray``.
        size: Edge length of the target square, in pixels.
        keep_ratio: When true, shrink in place with ``thumbnail`` so the
            aspect ratio is kept; otherwise force a ``size`` x ``size`` image.
        resample: Resampling filter handed to PIL.

    Returns:
        The resulting ``PIL.Image.Image``.
    """
    picture = Image.fromarray(array)
    target = (size, size)

    if not keep_ratio:
        return picture.resize(target, resample)

    picture.thumbnail(target, resample)
    return picture

def convert_dataset():
    """Convert the competition DICOM files to resized JPEGs plus a ``meta.csv``.

    For each split (only ``test`` when ``INFERENCE_ONLY`` is set, otherwise
    ``test`` and ``train``) every ``.dcm`` file found below
    ``KAGGLE_DATASET/<split>`` is read with ``read_xray``, resized to
    ``IMG_SIZE`` x ``IMG_SIZE`` and saved as
    ``KAGGLE_RESIZED/<split>/<image_id>.jpg``.

    The pre-resize array dimensions of every image are written to
    ``KAGGLE_RESIZED/meta.csv`` with the columns ``image_id``, ``dim0``,
    ``dim1`` and ``split``.
    """
    records = []

    # NOTE: For inference only, all you need is test:
    valid_splits = ['test'] if INFERENCE_ONLY else ['test', 'train']

    for split in valid_splits:
        save_dir = f'{KAGGLE_RESIZED}/{split}/'
        os.makedirs(save_dir, exist_ok=True)

        for dirname, _, filenames in tqdm(os.walk(f'{KAGGLE_DATASET}/{split}')):
            for file in filenames:
                # Split off the real extension instead of replacing every
                # 'dcm' substring of the name, and skip non-DICOM files.
                image_id, extension = os.path.splitext(file)
                if extension.lower() != '.dcm':
                    continue

                # set keep_ratio=True to have original aspect ratio
                xray = read_xray(os.path.join(dirname, file))
                im = resize(xray, size=IMG_SIZE)
                im.save(os.path.join(save_dir, image_id + '.jpg'))

                # Keep the original dimensions: they are needed later to
                # rescale the bounding boxes.
                records.append((image_id, xray.shape[0], xray.shape[1], split))

    df = pd.DataFrame(records, columns=['image_id', 'dim0', 'dim1', 'split'])
    df.to_csv(KAGGLE_RESIZED + '/meta.csv', index=False)


# The conversion is only needed when training from the original DICOM data.
if RUN_ON_ORIGINAL:
    convert_dataset()

# Prepare Dataset

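YOLOv5 requires its dataset in the YOLO format: images and per-image label `.txt` files in parallel directories, described by a dataset `.yaml` file (all of which are created below).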

In [None]:
# Everything is done from /kaggle directory.
%cd /kaggle

# Attribute names under which the four study-label columns are read from
# DataFrame.itertuples() rows. The commented-out list holds what look like the
# real CSV headers; presumably they surface as _2.._5 because names containing
# spaces are replaced by positional names - TODO confirm against the CSV.
#df_labels_long = ["Negative for Pneumonia", "Typical Appearance", "Indeterminate Appearance", "Atypical Appearance"]
df_labels_long = ["_2", "_3", "_4", "_5"]
# Class names; the list position is used as the class id in the label files.
# Entry 0 ("none") is the fallback when no study-label column is positive.
# NOTE(review): "indterminate" is misspelled, but the string is written to
# data.yaml, so it is left untouched here.
df_labels = ["none", "negative", "typical", "indterminate", "atypical"]
# Filled below: study id (text before the first underscore) -> class name.
image_label_map = {}

def get_label_from_row(row):
    """Return the class name for one study-level row.

    The first attribute listed in ``df_labels_long`` whose value on ``row`` is
    positive selects the entry of ``df_labels`` one position further along;
    ``df_labels[0]`` is returned when none of them is positive.
    """
    for position, column in enumerate(df_labels_long, start=1):
        if getattr(row, column) > 0:
            return df_labels[position]
    return df_labels[0]

# Build the study id -> class name lookup from the study-level annotations.
study_df = pd.read_csv(KAGGLE_DATASET + '/train_study_level.csv')
for study in study_df.itertuples():
    study_key = study.id.split('_')[0]
    image_label_map[study_key] = get_label_from_row(study)

# Load image level csv file
df = pd.read_csv(KAGGLE_DATASET + '/train_image_level.csv')

# Modify values in the id column (keep only the text before the first underscore)
df['id'] = [raw_id.split('_')[0] for raw_id in df['id']]
# Add absolute path
df['path'] = [TRAIN_PATH + image_id + '.jpg' for image_id in df['id']]
# Get image level labels (the class of the study each image belongs to)
df['image_level'] = [image_label_map[study_uid] for study_uid in df['StudyInstanceUID']]

df.head(5)

In [None]:
# Load meta.csv file
# Original dimensions are required to scale the bounding box coordinates appropriately.
meta_df = pd.read_csv(KAGGLE_RESIZED + '/meta.csv')

# Keep only the training rows. Columns are selected and renamed by name rather
# than by position, so a different column order in meta.csv cannot silently
# swap the id and dimension columns.
train_meta_df = (
    meta_df.loc[meta_df['split'] == 'train', ['image_id', 'dim0', 'dim1']]
    .rename(columns={'image_id': 'id'})
)

train_meta_df.head(2)

In [None]:
# Merge both the dataframes
# Left join on 'id': every annotation row is kept, and rows without a
# matching meta.csv entry end up with NaN in dim0/dim1.
df = df.merge(train_meta_df, on='id',how="left")
df.head(2)

In [None]:
# Create train and validation split.
# 80/20 split with a fixed seed, stratified on the image_level class.
train_df, valid_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df.image_level.values)

# assign() returns new frames, so tagging the split does not write into the
# slices handed back by train_test_split (which raises SettingWithCopyWarning).
train_df = train_df.assign(split='train')
valid_df = valid_df.assign(split='valid')

# Recombine with a fresh 0..n-1 index; later cells iterate over it by position.
df = pd.concat([train_df, valid_df]).reset_index(drop=True)

In [None]:
# Sanity check: row counts of the combined frame and of each split.
print(f'Size of dataset: {len(df)}, training images: {len(train_df)}. validation images: {len(valid_df)}')

## 🍚 Prepare Required Folder Structure

The required folder structure for the dataset directory is: 

```
/parent_folder
    /dataset
         /images
             /train
             /valid
         /labels
             /train
             /valid
    /yolov5
```

Note that I have named the directory `covid`.

In [None]:
os.makedirs('/kaggle/working/covid/images/train', exist_ok=True)
os.makedirs('/kaggle/working/covid/images/valid', exist_ok=True)

os.makedirs('/kaggle/working/covid/labels/train', exist_ok=True)
os.makedirs('/kaggle/working/covid/labels/valid', exist_ok=True)

! ls /kaggle/working/covid/images

In [None]:
# Copy each image into the folder of its split; anything that is not
# 'train' goes to the validation folder.
for row in tqdm(df.itertuples(index=False), total=len(df)):
    if row.split == 'train':
        destination = f'/kaggle/working/covid/images/train/{row.id}.jpg'
    else:
        destination = f'/kaggle/working/covid/images/valid/{row.id}.jpg'
    copyfile(row.path, destination)

## 🍜 Create `.YAML` file

The `data.yaml` file is the dataset configuration file that defines:

1. an "optional" download command/URL for auto-downloading, 
2. a path to a directory of training images (or path to a *.txt file with a list of training images), 
3. a path to a directory of validation images (or path to a *.txt file with a list of validation images), 
4. the number of classes, 
5. a list of class names.

> 📍 Important: In this competition, each image can either belong to `opacity` or `none` image-level labels. This notebook, however, tags every box with the study-level class of its image, so the number of classes, `nc`, is 5 (the entries of `df_labels`). YOLOv5 automatically handles the images without any bounding box coordinates. 

> 📍 Note: The `data.yaml` is created in the `yolov5/data` directory as required. 

In [None]:
# Create .yaml file 
import yaml

data_yaml = dict(
    train = '/kaggle/working/covid/images/train',
    val = '/kaggle/working/covid/images/valid',
    nc = 5,
    names = df_labels
)

# Note that I am creating the file in the yolov5/data/ directory.
with open('/kaggle/working/yolov5/data/data.yaml', 'w') as outfile:
    yaml.dump(data_yaml, outfile, default_flow_style=True)
    
%cat /kaggle/working/yolov5/data/data.yaml

## 🍮 Prepare Bounding Box Coordinates for YOLOv5

For every image with **bounding box(es)** a `.txt` file with the same name as the image will be created in the format shown below:

* One row per object. <br>
* Each row is in `class x_center y_center width height` format. <br>
* Box coordinates must be in normalized xywh format (from 0 - 1). Boxes given in pixels are normalized by dividing `x_center` and `width` by image width, and `y_center` and `height` by image height. <br>
* Class numbers are zero-indexed (start from 0). <br>

> 📍 Note: We don't have to remove the images without bounding boxes from the training or validation sets. 

In [None]:
# Get the raw bounding box by parsing the row value of the label column.
# Ref: https://www.kaggle.com/yujiariyasu/plot-3positive-classes
def get_bbox(row):
    """Parse ``row.label`` into a list of ``[xmin, ymin, xmax, ymax]`` boxes.

    The label string is a whitespace-separated sequence of six-token groups:
    a name, a second token that is ignored here, and four coordinates.

    Groups whose name is ``none`` are skipped. They are the placeholder entry
    of images without any finding (``none 1 0 0 1 1``) and must not be turned
    into a training box. A trailing incomplete group is ignored.

    Args:
        row: Object with a string attribute ``label``.

    Returns:
        List of four-float lists, one per real box; empty when there is none.
    """
    tokens = row.label.split()

    bboxes = []
    # Step through complete six-token groups only.
    for start in range(0, len(tokens) - 5, 6):
        if tokens[start] == 'none':
            continue
        bboxes.append([float(value) for value in tokens[start + 2:start + 6]])

    return bboxes

# Scale the bounding boxes according to the size of the resized image. 
def scale_bbox(row, bboxes, img_size=None):
    """Rescale pixel boxes from the original image to the resized square image.

    Args:
        row: Object with ``dim0`` and ``dim1`` attributes holding the original
            array dimensions (``dim1`` scales x, ``dim0`` scales y).
        bboxes: Iterable of ``[xmin, ymin, xmax, ymax]`` in original pixels.
        img_size: Edge length of the resized image. Defaults to the
            notebook-level ``IMG_SIZE`` when omitted.

    Returns:
        List of ``[xmin, ymin, xmax, ymax]`` integer boxes in resized pixels.
        Each coordinate is rounded to 4 decimals and then truncated by
        ``int()``, i.e. effectively rounded towards zero.
    """
    if img_size is None:
        img_size = IMG_SIZE

    # Get scaling factor
    scale_x = img_size / row.dim1
    scale_y = img_size / row.dim0

    def to_pixel(value, scale):
        return int(np.round(value * scale, 4))

    scaled_bboxes = []
    for bbox in bboxes:
        xmin, ymin, xmax, ymax = bbox[:4]
        scaled_bboxes.append([
            to_pixel(xmin, scale_x),
            to_pixel(ymin, scale_y),
            to_pixel(xmax, scale_x),
            to_pixel(ymax, scale_y),
        ])

    return scaled_bboxes

# Convert the bounding boxes in YOLO format.
def get_yolo_format_bbox(img_w, img_h, bboxes):
    """Convert pixel ``[xmin, ymin, xmax, ymax]`` boxes to normalized YOLO xywh.

    Args:
        img_w: Image width in pixels, used to normalize x and width.
        img_h: Image height in pixels, used to normalize y and height.
        bboxes: Iterable of ``[xmin, ymin, xmax, ymax]`` pixel boxes.

    Returns:
        List of ``[x_center, y_center, width, height]`` with every value
        divided by the matching image dimension. Boxes whose width or height
        is not positive are dropped, since they describe no area.
    """
    yolo_boxes = []
    for bbox in bboxes:
        xmin, ymin, xmax, ymax = bbox[:4]
        w = xmax - xmin
        h = ymax - ymin
        if w <= 0 or h <= 0:
            continue

        # Use the exact centre; rounding it to a whole pixel would shift
        # odd-sized boxes by half a pixel.
        xc = xmin + w / 2
        yc = ymin + h / 2

        yolo_boxes.append([xc / img_w, yc / img_h, w / img_w, h / img_h])

    return yolo_boxes

In [None]:
# Prepare the txt files for bounding box
for row in tqdm(df.itertuples(index=False), total=len(df)):
    # Image-level label (class name of the study the image belongs to)
    label = row.image_level

    # Rows labelled 'none' get no label file at all.
    if label == 'none':
        continue

    # Label file lives next to the split the image was assigned to.
    if row.split == 'train':
        file_name = f'/kaggle/working/covid/labels/train/{row.id}.txt'
    else:
        file_name = f'/kaggle/working/covid/labels/valid/{row.id}.txt'

    # Parse the raw boxes, rescale them to the resized image and convert
    # them to the normalized YOLOv5 layout.
    raw_boxes = get_bbox(row)
    resized_boxes = scale_bbox(row, raw_boxes)
    yolo_bboxes = get_yolo_format_bbox(IMG_SIZE, IMG_SIZE, resized_boxes)

    # One line per box: class id followed by the four box values.
    with open(file_name, 'w') as f:
        for box in yolo_bboxes:
            fields = [df_labels.index(label)] + box
            f.write(' '.join(str(field) for field in fields))
            f.write('\n')

# Train

In [None]:
# Run YOLOv5 training from inside the copied repository. The {NAME}
# placeholders are filled in from the notebook variables of the same name.
%cd /kaggle/working/yolov5/
!python train.py --img {IMG_SIZE} \
                 --batch {BATCH_SIZE} \
                 --epochs {EPOCHS} \
                 --data data.yaml \
                 --weights {WEIGHTS_FILE} \
                 --save_period 1 \
                 --project {PROJECT_NAME}

The best model is automatically uploaded to W&B where it can be downloaded and uploaded into a Kaggle dataset (for use in a separate inference notebook).