This notebook is a work in progress and will be updated.

Credits to other notebooks:
* [Visual In-Depth EDA – VinBigData Competition Data](https://www.kaggle.com/dschettler8845/visual-in-depth-eda-vinbigdata-competition-data) by @dschettler8845
* [SIIM COVID-19 Detection - a simple EDA 🦠🩺](https://www.kaggle.com/tanlikesmath/siim-covid-19-detection-a-simple-eda) by @tanlikesmath

# Load Libraries and Data

In [None]:
from ast import literal_eval
import cv2
from fastai.vision.all import *
from fastai.medical.imaging import *
from matplotlib import pyplot as plt
import numpy as np # linear algebra
import os
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from pathlib import Path
import pydicom
from pydicom.pixel_data_handlers.util import apply_voi_lut
import seaborn as sns
from warnings import simplefilter

In [None]:
# Use the "fivethirtyeight" matplotlib style for every figure in the notebook
plt.style.use('fivethirtyeight')
# Suppress all warnings so that they do not clutter the cell outputs
simplefilter('ignore')

In [None]:
def dicom2array(path, voi_lut=True, fix_monochrome=True):
    """ Convert a dicom file to an 8-bit numpy array

    Args:
        path (str or path-like): Path to the dicom file to be converted
        voi_lut (bool): Whether or not to apply the VOI LUT to the pixel data
        fix_monochrome (bool): Whether or not to invert MONOCHROME1 images

    Returns:
        np.uint8 array of the pixel data, min-max scaled to [0, 255].
        A constant image is returned as all zeros.

    """
    # `dcmread` is the supported reader; `read_file` was a deprecated alias
    # that is no longer available in recent pydicom releases
    dicom = pydicom.dcmread(path)

    # VOI LUT (if available by DICOM device) is used to
    # transform raw DICOM data to "human-friendly" view
    if voi_lut:
        data = apply_voi_lut(dicom.pixel_array, dicom)
    else:
        data = dicom.pixel_array

    # MONOCHROME1 stores the image inverted; flip it around its maximum
    if fix_monochrome and dicom.PhotometricInterpretation == "MONOCHROME1":
        data = np.amax(data) - data

    # Shift to zero, then scale to [0, 1] unless the image is constant
    # (dividing by a zero peak would produce NaNs and an undefined uint8 cast)
    data = data - np.min(data)
    peak = np.max(data)
    if peak > 0:
        data = data / peak
    data = (data * 255).astype(np.uint8)
    return data

def plot_imgs(imgs, cols=4, size=7, is_rgb=True, title="", cmap='gray', img_size=(500,500)):
    """ Show a list of images on a grid

    Args:
        imgs (list of arrays): Images to show, in reading order
        cols (int): Number of grid columns
        size (number): Size of one grid cell in inches
        is_rgb (bool): Currently unused; kept so existing calls keep working
        title (str): Title placed above the whole figure
        cmap (str): Matplotlib colormap passed to `imshow`
        img_size (tuple or None): If not None, every image is resized to
            this size with `cv2.resize` before being shown

    Returns:
        None, the figure is shown with `plt.show()`
    """
    # Ceiling division: a completely filled last row must not add an empty
    # extra row. Keep at least one row so the figure height is never zero.
    rows = max(1, -(-len(imgs) // cols))
    fig = plt.figure(figsize=(cols*size, rows*size))
    for i, img in enumerate(imgs):
        if img_size is not None:
            img = cv2.resize(img, img_size)
        ax = fig.add_subplot(rows, cols, i+1)
        ax.imshow(img, cmap=cmap)
    plt.suptitle(title)
    plt.show()
    
def get_image_id(path):
    """ Function to return the image-id from a path

    Args:
        path (str or path-like): Path or bare file name of an image file

    Returns:
        The file name without its directory part and without its last
        extension, e.g. "train/abc.dicom" -> "abc"
    """
    # `os.fspath` accepts pathlib.Path objects as well as strings, and
    # `basename` also copes with a bare file name that has no directory part
    filename = os.path.basename(os.fspath(path))
    return filename.rsplit(".", 1)[0]

def create_fractional_bbox_coordinates(row):
    """ Express the bbox coordinates of a DF row as fractions of the image size

    Args:
        row: Mapping-like row with the keys "x_min", "x_max", "y_min",
            "y_max", "img_width" and "img_height"

    Returns:
        Tuple (frac_x_min, frac_x_max, frac_y_min, frac_y_max): x values
        divided by the image width, y values divided by the image height
    """
    horizontal = tuple(row[key]/row["img_width"] for key in ("x_min", "x_max"))
    vertical = tuple(row[key]/row["img_height"] for key in ("y_min", "y_max"))
    return horizontal + vertical

def draw_bboxes(img, tl, br, rgb, label="", label_location="tl", opacity=0.1, line_thickness=0):
    """ Draw a tinted, optionally outlined and labelled, bounding box on an image

    The image is modified in place and is also returned.

    Args:
        img (np.ndarray): 3-channel uint8 image (height x width x 3) to draw on
        tl (sequence of 2 ints): (x, y) of the top-left corner of the box
        br (sequence of 2 ints): (x, y) of the bottom-right corner of the box
        rgb (tuple of 3 ints): Colour used for the tint, the outline and the text
        label (str or int, optional): Text to write next to the box. Strings
            are upper-cased with spaces replaced by "_"; anything else is
            formatted as "CLASS_XX". "" or None draws no text.
        label_location (str, optional): Box corner the text is anchored to,
            one of "tl", "tr", "bl" or "br"
        opacity (float, optional): Weight of the tint when blended with the image
        line_thickness (int, optional): Outline thickness; 0 draws no outline

    Returns:
        The annotated image

    Raises:
        KeyError: If `label_location` is not one of the four corner names
    """
    # Plain Python ints: bbox values often arrive as numpy scalars, which are
    # not reliably accepted by OpenCV as point coordinates
    x0, y0 = int(tl[0]), int(tl[1])
    x1, y1 = int(br[0]), int(br[1])

    # Clip the tinted region to the image so that a box reaching outside the
    # frame cannot cause a shape mismatch between the region and the tint
    img_height, img_width = img.shape[:2]
    cx0, cy0 = max(x0, 0), max(y0, 0)
    cx1, cy1 = min(x1, img_width), min(y1, img_height)
    if cx1 > cx0 and cy1 > cy0:
        region = img[cy0:cy1, cx0:cx1, :]
        tint = np.uint8(np.ones(region.shape)*rgb)
        # NOTE(review): the trailing 1.0 is addWeighted's additive `gamma`
        # term; it is kept exactly as before. Confirm that it is intended.
        img[cy0:cy1, cx0:cx1, :] = cv2.addWeighted(region, 1-opacity, tint, opacity, 1.0)

    if line_thickness > 0:
        img = cv2.rectangle(img, (x0, y0), (x1, y1), rgb, line_thickness)

    # The integer class 0 is falsy, so test for "no label" explicitly
    # instead of relying on truthiness
    if label is not None and label != "":
        # DEFAULTS
        FONT = cv2.FONT_HERSHEY_SIMPLEX
        FONT_SCALE = 1.666
        FONT_THICKNESS = 3
        FONT_LINE_TYPE = cv2.LINE_AA

        if isinstance(label, str):
            LABEL = label.upper().replace(" ", "_")
        else:
            LABEL = f"CLASS_{label:02}"

        text_width, text_height = cv2.getTextSize(LABEL, FONT, FONT_SCALE, FONT_THICKNESS)[0]

        # Anchor corner of the text and the shift that keeps the text
        # 10 px away from that corner
        anchors = {"tl": (x0, y0), "br": (x1, y1), "tr": (x1, y0), "bl": (x0, y1)}
        offsets = {
            "tl": (0, -10), "br": (-text_width, text_height+10),
            "tr": (-text_width, -10), "bl": (0, text_height+10)
        }
        anchor_x, anchor_y = anchors[label_location]
        shift_x, shift_y = offsets[label_location]
        text_origin = (int(anchor_x+shift_x), int(anchor_y+shift_y))
        img = cv2.putText(img, LABEL, text_origin,
                          FONT, FONT_SCALE, rgb, FONT_THICKNESS, FONT_LINE_TYPE)

    return img

In [None]:
class TrainData():
    """ Look up and plot the bounding-box annotations stored in a DataFrame

    The DataFrame is read both by column name (`image_id`, `class_id`,
    `rad_id`) and by position: column 0 is the image id, column 1 the class
    id, column 2 the radiologist id (a string whose first character is
    dropped before it is converted to int) and all remaining columns form
    the bbox. When drawing, the first two bbox values are used as the
    top-left corner and the last two as the bottom-right corner.
    """

    # Rows of this class are never drawn and their bbox values are stored
    # without the integer cast (presumably NaN placeholders; confirm
    # against the data)
    NO_BBOX_CLASS_ID = 14

    def __init__(self, df, train_dir, cmap="Spectral", class_names=None):
        """ Store the data and pre-compute the annotations of every image

        Args:
            df (pd.DataFrame): Annotation table, see the class docstring
            train_dir (str or path-like): Directory holding "<image_id>.dicom"
            cmap (str, optional): Seaborn palette name for the box colours
            class_names (mapping or sequence, optional): Lookup from class id
                to the text drawn next to a box. When omitted, a notebook-level
                `int_2_str` lookup is used if one exists, otherwise the
                integer class id itself is used as the label.
        """
        # Initialize
        self.df = df
        self.train_dir = train_dir
        self.class_names = class_names

        # Visualization: 15 palette colours as 0-255 integer tuples, with the
        # colour at position 8 removed
        self.cmap = cmap
        self.pal = [tuple([int(x) for x in np.array(c)*(255,255,255)]) for c in sns.color_palette(cmap, 15)]
        self.pal.pop(8)

        self.img_annotations = self.get_annotations(get_all=True)

    def get_annotations(self, get_all=False, image_ids=None, class_ids=None, rad_ids=None, index=None):
        """ Collect the annotations of the selected rows, grouped by image

        Args:
            get_all (bool, optional): Use every row; all filters are ignored
            image_ids (list of strs, optional): Keep rows with these image ids
            class_ids (list of ints, optional): Keep rows with these class ids
            rad_ids (list of strs, optional): Keep rows with these radiologist ids
            index (int, list or slice, optional): Positional row selection,
                applied after the other filters

        Returns:
            Dict mapping each image id to a list of dicts with the keys
            `img_path`, `image_id`, `class_id`, `rad_id` and `bbox`

        Raises:
            ValueError: If neither `get_all` nor any filter is given
        """
        if not get_all and image_ids is None and class_ids is None and rad_ids is None and index is None:
            raise ValueError("Expected one of the following arguments to be passed:" \
                             "\n\t\t– `get_all`, `image_id`, `class_id`, `rad_id`, or `index`")
        # Initialize
        tmp_df = self.df.copy()

        if not get_all:
            if image_ids is not None:
                tmp_df = tmp_df[tmp_df.image_id.isin(image_ids)]
            if class_ids is not None:
                tmp_df = tmp_df[tmp_df.class_id.isin(class_ids)]
            if rad_ids is not None:
                tmp_df = tmp_df[tmp_df.rad_id.isin(rad_ids)]
            if index is not None:
                # A scalar position would make `.iloc` return a Series;
                # wrap it so that a one-row DataFrame is kept
                if not isinstance(index, slice) and np.ndim(index) == 0:
                    index = [index]
                tmp_df = tmp_df.iloc[index]

        annotations = {image_id:[] for image_id in tmp_df.image_id.to_list()}
        for row in tmp_df.to_numpy():
            image_id, class_id = row[0], int(row[1])

            # Catch to convert float array to integer array
            bbox = row[3:]
            if class_id != self.NO_BBOX_CLASS_ID:
                bbox = bbox.astype(np.int32)

            # Update annotations dictionary
            annotations[image_id].append(dict(
                img_path=os.path.join(self.train_dir, image_id+".dicom"),
                image_id=image_id,
                class_id=class_id,
                rad_id=int(row[2][1:]),
                bbox=bbox,
            ))
        return annotations

    def _label_for(self, class_id):
        """ Return the text drawn for `class_id`, or the id itself if no name is known """
        names = self.class_names
        if names is None:
            # Keep working with a notebook-level `int_2_str` lookup if one exists
            names = globals().get("int_2_str")
        if names is None:
            return class_id
        try:
            return names[class_id]
        except (KeyError, IndexError):
            return class_id

    def get_annotated_image(self, image_id, annots=None, plot=False, plot_size=(18,25), plot_title=""):
        """ Load one image and draw all of its bounding boxes on it

        Args:
            image_id (str): Image to look up when `annots` is a dict
            annots (dict or list, optional): Either an annotation dict as
                returned by `get_annotations` or the annotation list of one
                image. Defaults to the annotations of the whole DataFrame.
            plot (bool, optional): Also show the annotated image
            plot_size (tuple, optional): Figure size used when `plot` is True
            plot_title (str, optional): Title used when `plot` is True

        Returns:
            The annotated RGB image
        """
        if annots is None:
            annots = self.img_annotations

        image_annots = annots if isinstance(annots, list) else annots[image_id]

        img = cv2.cvtColor(dicom2array(image_annots[0]["img_path"]),cv2.COLOR_GRAY2RGB)
        for ann in image_annots:
            if ann["class_id"] != self.NO_BBOX_CLASS_ID:
                img = draw_bboxes(img,
                                ann["bbox"][:2], ann["bbox"][-2:],
                                rgb=self.pal[ann["class_id"]],
                                label=self._label_for(ann["class_id"]),
                                opacity=0.08, line_thickness=4)
        if plot:
            # Plotted with matplotlib directly; the notebook defines no
            # `plot_image` helper
            plt.figure(figsize=plot_size)
            plt.imshow(img)
            plt.title(plot_title)
            plt.axis(False)
            plt.show()

        return img

    def _plot_annotations(self, annotations, n, height_multiplier, verbose):
        """ Plot the first `n` annotated images of `annotations` on a two-column grid """
        # Ceiling division so that odd `n` (and n=1) still get enough rows
        n_rows = max(1, -(-n // 2))

        plt.figure(figsize=(20, height_multiplier*n))
        for i, (image_id, annots) in enumerate(annotations.items()):
            if i >= n:
                break
            if verbose:
                print(".", end="")
            plt.subplot(n_rows,2,i+1)
            plt.imshow(self.get_annotated_image(image_id, annots))
            plt.axis(False)
            plt.title(f"Image ID – {image_id}")
        plt.tight_layout(rect=[0, 0.03, 1, 0.97])
        plt.show()

    def plot_image_ids(self, image_id_list, height_multiplier=6, verbose=True):
        """ Plot the annotated images with the given image ids

        Args:
            image_id_list (list of strs): Image ids to plot
            height_multiplier (number, optional): Figure height per image
            verbose (bool, optional): Print one "." per plotted image
        """
        annotations = self.get_annotations(image_ids=image_id_list)
        self._plot_annotations(annotations, len(image_id_list), height_multiplier, verbose)

    def plot_classes(self, class_list, n=4, height_multiplier=6, verbose=True):
        """ Plot up to `n` annotated images that have a row of the given classes

        Args:
            class_list (list of ints): Class ids to select
            n (int, optional): Maximum number of images to plot
            height_multiplier (number, optional): Figure height per image
            verbose (bool, optional): Print one "." per plotted image
        """
        annotations = self.get_annotations(class_ids=class_list)
        self._plot_annotations(annotations, n, height_multiplier, verbose)

    def plot_radiologists(self, rad_id_list, n=4, height_multiplier=6, verbose=True):
        """ Plot up to `n` annotated images that have a row of the given radiologists

        Args:
            rad_id_list (list of strs): Radiologist ids to select
            n (int, optional): Maximum number of images to plot
            height_multiplier (number, optional): Figure height per image
            verbose (bool, optional): Print one "." per plotted image
        """
        annotations = self.get_annotations(rad_ids=rad_id_list)
        self._plot_annotations(annotations, n, height_multiplier, verbose)

In [None]:
# Competition input files
data_dir = Path('../input/siim-covid19-detection')
train_image_level_file = data_dir.joinpath('train_image_level.csv')
train_study_level_file = data_dir.joinpath('train_study_level.csv')
sample_file = data_dir.joinpath('sample_submission.csv')

# Output locations (the build directory is created if it is missing)
build_dir = Path('./build')
build_dir.mkdir(exist_ok=True, parents=True)
submission_file = 'submission.csv'

# Column names used throughout the notebook
id_col, image_label_col = 'id', 'label'
image_class_col, bbox_col = 'image_class', 'boxes'

In [None]:
# Load the image-level training table, then show its shape and first rows
trn_image = pd.read_csv(train_image_level_file)
print(trn_image.shape)
trn_image.head()

In [None]:
# Load the study-level training table, then show its shape and first rows
trn_study = pd.read_csv(train_study_level_file)
print(trn_study.shape)
trn_study.head()

# EDA

## Image Level Label

In [None]:
# Raw `boxes` value of the first row
trn_image.loc[0, bbox_col]

In [None]:
# Raw `label` value of the first row
trn_image.loc[0, image_label_col]

The `boxes` and `label` columns are stored as strings (`str`), so we need to parse them into lists.

In [None]:
def _parse_boxes(value):
    """ Decode one `boxes` cell into a list; missing values become [] """
    # Already-decoded lists pass through, so re-running this cell is harmless
    if isinstance(value, list):
        return value
    if pd.isnull(value):
        return []
    return literal_eval(value)

trn_image[bbox_col] = trn_image[bbox_col].apply(_parse_boxes)

# Number of boxes per image: summary statistics and histogram
n_boxes = trn_image[bbox_col].apply(len)
print(n_boxes.describe())
n_boxes.hist(bins=10, width=.2)

In [None]:
# First space-separated token of every label string: counts and histogram
first_label = trn_image[image_label_col].str.split(' ').str[0]
print(first_label.value_counts())
first_label.hist(bins=2, width=.2)

In [None]:
# Keep every 6th space-separated token of the label string, starting with the
# first one. Values that are not strings (missing values, or lists produced by
# an earlier run of this cell) are left untouched, so re-running is harmless.
trn_image[image_label_col] = trn_image[image_label_col].apply(
    lambda label: label.split(' ')[::6] if isinstance(label, str) else label
)
trn_image[image_label_col].head()

In [None]:
# One row per list element: explode the boxes (keeping the original row
# number as an `index` column) and the labels, then put them side by side
boxes_long = trn_image[[bbox_col]].reset_index().explode(bbox_col)
labels_long = trn_image[[image_label_col]].reset_index(drop=True).explode(image_label_col)
trn_bbox = pd.concat([boxes_long, labels_long], axis=1)
print(trn_bbox.shape)
trn_bbox.head(10)

In [None]:
# Label of every exploded row: counts and histogram
bbox_labels = trn_bbox[image_label_col]
print(bbox_labels.value_counts())
bbox_labels.hist(bins=2, width=.2)

Looks good! We have 7,853 bounding boxes from 4,294 images labeled as `opacity`. 

## Study Level Label

Next, let's look at the study level data.

There are four classes: `negative`, `typical`, `indeterminate`, and `atypical`. Let's check their distributions.

In [None]:
# Summary statistics of the study-level table
trn_study.describe()

Let's check whether it's multiclass (classes are mutually exclusive) or multilabel (classes can overlap).

In [None]:
# Number of positive labels per study. `numeric_only=True` keeps non-numeric
# columns out of the row sum; without it recent pandas versions raise a
# TypeError when a string column is present.
trn_study.sum(axis=1, numeric_only=True).describe()

It's multiclass, i.e., each sample (study) belongs to exactly one class.

## Images

In [None]:
# Convert the first four DICOM files found under the train folder and plot them
dicom_paths = get_dicom_files(data_dir / 'train')
imgs = list(map(dicom2array, dicom_paths[:4]))
plot_imgs(imgs)

To be continued