In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from glob import glob
from tqdm.notebook import tqdm
!conda install gdcm -c conda-forge -y
import pydicom
from pydicom.pixel_data_handlers.util import apply_voi_lut
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
from skimage import exposure
import cv2
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Root directory of the dataset; the glob patterns and CSV paths in later cells are built from it.
dataset_dir = '../input/siim-covid19-detection'

In [None]:
def dicom2array(path, voi_lut=True, fix_monochrome=True):
    """Read a DICOM file and return its pixels as an 8-bit array.

    Args:
        path: Path of the DICOM file to read.
        voi_lut: When True, pass the raw pixel array through ``apply_voi_lut``
            (VOI LUT, if available from the DICOM device, transforms raw data
            into a "human-friendly" view). When False, use the raw pixels.
        fix_monochrome: When True, invert images whose
            ``PhotometricInterpretation`` is "MONOCHROME1", which would
            otherwise look inverted.

    Returns:
        ``np.uint8`` array, min-max scaled to 0..255. An image whose pixels
        are all equal is returned as all zeros.
    """
    # dcmread is the supported reader; read_file is a deprecated alias of it.
    dicom = pydicom.dcmread(path)
    pixels = dicom.pixel_array
    data = apply_voi_lut(pixels, dicom) if voi_lut else pixels

    if fix_monochrome and dicom.PhotometricInterpretation == "MONOCHROME1":
        data = np.amax(data) - data

    # Min-max normalisation to the 0..255 range.
    data = data - np.min(data)
    peak = np.max(data)
    if peak == 0:
        # Constant image: dividing by zero would produce NaNs and an
        # undefined uint8 cast, so return a well-defined black image instead.
        return np.zeros(data.shape, dtype=np.uint8)
    data = data / peak
    return (data * 255).astype(np.uint8)
        
    
def plot_img(img, size=(7, 7), is_rgb=True, title="", cmap='gray'):
    """Show a single image in its own figure.

    Args:
        img: Image array handed to ``imshow``.
        size: Figure size passed to ``plt.figure``.
        is_rgb: Accepted but not used by this function.
        title: Figure-level title.
        cmap: Colormap handed to ``imshow``.
    """
    figure = plt.figure(figsize=size)
    figure.gca().imshow(img, cmap=cmap)
    figure.suptitle(title)
    plt.show()


def plot_imgs(imgs, cols=4, size=7, is_rgb=True, title="", cmap='gray', img_size=(500,500)):
    """Show a list of images on a grid with ``cols`` columns.

    Args:
        imgs: Sequence of image arrays.
        cols: Number of grid columns.
        size: Side length given to each grid cell when sizing the figure.
        is_rgb: Accepted but not used by this function.
        title: Figure-level title.
        cmap: Colormap handed to ``imshow``.
        img_size: Target size passed to ``cv2.resize`` for every image, or
            None to show the images at their own size.
    """
    # Ceiling division, so a completely filled last row does not add an
    # empty extra row to the figure; keep at least one row for empty input.
    rows = max(1, -(-len(imgs) // cols))
    fig = plt.figure(figsize=(cols*size, rows*size))
    for i, img in enumerate(imgs):
        if img_size is not None:
            img = cv2.resize(img, img_size)
        ax = fig.add_subplot(rows, cols, i+1)
        ax.imshow(img, cmap=cmap)
    fig.suptitle(title)
    plt.show()


# simple plot

In [None]:
# glob returns paths in arbitrary order; sort them so the four sample images
# are the same on every run.
dicom_paths = sorted(glob(f'{dataset_dir}/train/*/*/*.dcm'))
imgs = [dicom2array(path) for path in dicom_paths[:4]]
plot_imgs(imgs)

You can also try some preprocessing, such as histogram equalization. Compare the images below with the ones above to see the difference before and after.

In [None]:
# Histogram-equalize every sample image loaded above, then plot the results.
imgs = list(map(exposure.equalize_hist, imgs))
plot_imgs(imgs)

# load & preprocess

In [None]:
from bokeh.plotting import figure as bokeh_figure
from bokeh.io import output_notebook, show, output_file
from bokeh.models import ColumnDataSource, HoverTool, Panel
from bokeh.models.widgets import Tabs
import pandas as pd
from PIL import Image
from sklearn import preprocessing
import random
from random import randint

In [None]:
# Image-level table: has the `id`, `boxes`, `label` and `StudyInstanceUID`
# columns used in later cells.
train = pd.read_csv(f'{dataset_dir}/train_image_level.csv')
# Study-level table: has an `id` column plus the per-class indicator columns
# that are merged into `train` further down.
train_study = pd.read_csv(f'{dataset_dir}/train_study_level.csv')
train.head()

In [None]:
# Preview the study-level table.
train_study.head()

In [None]:
# Merge the study-level columns onto the image-level rows.
# The study table's `id` carries a '_study' marker; removing it gives the
# StudyInstanceUID key that both tables are joined on.
train_study['StudyInstanceUID'] = [
    study_id.replace('_study', '') for study_id in train_study['id']
]
train_study = train_study.drop(columns=['id'])
train = pd.merge(train, train_study, on='StudyInstanceUID')
train.head()

In [None]:
# Add a StudyInstanceUID_count column (number of image ids per study), keep the
# multi-image studies aside for inspection, and continue with single-image
# studies only.
group_col = 'StudyInstanceUID'
count_col = f'{group_col}_count'
df = train.groupby(group_col)['id'].count().to_frame(name=count_col)
train = train.merge(df.reset_index(), on=group_col)
is_multi_image = train[count_col] > 1
one_study_multi_image_df = train[is_multi_image]
print(len(one_study_multi_image_df))
# delete 'StudyInstanceUID_count > 1' data
train = train[train[count_col] == 1]
one_study_multi_image_df.head()

We have 512 images that belong to a study with more than one image (`StudyInstanceUID_count > 1`).  
Since this issue has not been resolved yet (see the discussion below), I removed these rows.  
https://www.kaggle.com/c/siim-covid19-detection/discussion/239980

In [None]:
def pie_plot(train_df, variable):
    """
        input: train_df: DataFrame holding the column to plot
               variable: name of a categorical column, ex: "Sex"
        output: pie plot of the column's value counts, titled with the column name
    """
    # count number of categorical variable(value/sample)
    varValue = train_df[variable].value_counts()

    # visualize: one slice per category, labelled with its percentage share
    plt.figure(figsize = (20,10))
    plt.pie(varValue, labels=varValue.index, autopct="%1.1f%%")
    # Title follows the plotted column instead of a fixed string.
    plt.title(variable)
    plt.show()
    
# Collapse the per-class indicator columns into a single string column.
# Rows start as negative; each positive class then overwrites the rows where
# its indicator equals 1 (later classes win if more than one is set).
train['target'] = 'Negative for Pneumonia'
for positive_class in ('Typical Appearance', 'Indeterminate Appearance', 'Atypical Appearance'):
    train.loc[train[positive_class] == 1, 'target'] = positive_class
print(train['target'].value_counts())
pie_plot(train, 'target')

In [None]:
# Raw `boxes` entry of the first row; per the author's note each box is
# described by x_min, y_min, width, height.
train.boxes.values[0] # x_min, y_min, width, height

In [None]:
# Raw `label` string of the first row. The plotting cells read it in groups of
# six space-separated tokens: the first two are skipped and the last four are
# used as the box corners x_min, y_min, x_max, y_max.
train.label.values[0] # x_min, y_min, x_max, y_max

In [None]:
# we have 3 positive classes
class_names = ['Typical Appearance', 'Indeterminate Appearance', 'Atypical Appearance']
# Distinct indicator combinations that occur across all rows.
unique_classes = np.unique(train[class_names].to_numpy(), axis=0)
unique_classes # no multi label

# Plot img with bounding box

In [None]:
# Lookup tables keyed by the string form of the class-indicator list
# [typical, indeterminate, atypical].
label2color = {
    '[1, 0, 0]': [255,0,0], # Typical Appearance
    '[0, 1, 0]': [0,255,0], # Indeterminate Appearance
    '[0, 0, 1]': [0,0,255], # Atypical Appearance
    '[0, 0, 0]': None, # negative
}
label2target = {
    '[1, 0, 0]': 'typical',
    '[0, 1, 0]': 'indeterminate',
    '[0, 0, 1]': 'atypical'
}

# Drawing settings shared with later cells.
thickness = 3
scale = 5
FONT = cv2.FONT_HERSHEY_SIMPLEX
FONT_SCALE = 1
FONT_THICKNESS = 2
FONT_LINE_TYPE = cv2.LINE_AA

imgs = []
# First 8 rows that are not negative.
positive_rows = train[train['Negative for Pneumonia']==0].iloc[:8]
for _, row in positive_rows.iterrows():
    study_id = row['StudyInstanceUID']
    img_path = glob(f'{dataset_dir}/train/{study_id}/*/*')[0]
    # Shrink by `scale`, then stack three copies along a new last axis so
    # coloured shapes can be drawn on the image.
    img = cv2.resize(dicom2array(path=img_path), None, fx=1/scale, fy=1/scale)
    img = np.stack([img] * 3, axis=-1)

    class_key = str(row[class_names].values.tolist())
    color = label2color[class_key]
    target = label2target[class_key]

    # `label` is read in groups of six space-separated tokens; the last four
    # of each group are the box corners, rescaled to the shrunken image.
    tokens = row['label'].split(' ')
    bboxes = []
    for start in range(0, len(tokens), 6):
        corners = [float(tok)/scale for tok in tokens[start+2:start+6]]
        if len(corners) == 4:
            bboxes.append(corners)

    text_width, text_height = cv2.getTextSize(target, FONT, FONT_SCALE, FONT_THICKNESS)[0]
    for box in bboxes:
        x_min, y_min = int(box[0]), int(box[1])
        x_max, y_max = int(box[2]), int(box[3])
        img = cv2.rectangle(img, (x_min, y_min), (x_max, y_max), color, thickness)
        # Centre the class name horizontally over the box, 10 px above it.
        box_width = x_max - x_min
        text_origin = (x_min-(text_width-box_width)//2, y_min-10)
        img = cv2.putText(img, target, text_origin,
                          FONT, FONT_SCALE, color, FONT_THICKNESS, FONT_LINE_TYPE)
    img = cv2.resize(img, (500,500))
    imgs.append(img)

plot_imgs(imgs, cmap=None)


# Typical Appearance only

In [None]:
imgs = []
thickness = 3
scale = 5

# First 16 rows flagged 'Typical Appearance', each drawn with its boxes.
typical_rows = train[train['Typical Appearance'] == 1].iloc[:16]
for _, row in typical_rows.iterrows():
    study_id = row['StudyInstanceUID']
    img_path = glob(f'{dataset_dir}/train/{study_id}/*/*')[0]
    # Shrink by `scale`, then stack three copies along a new last axis so
    # coloured rectangles can be drawn.
    img = cv2.resize(dicom2array(path=img_path), None, fx=1/scale, fy=1/scale)
    img = np.stack([img] * 3, axis=-1)

    color = label2color[str(row[class_names].values.tolist())]

    # Six space-separated tokens per box; the last four are the corners.
    tokens = row['label'].split(' ')
    bboxes = []
    for start in range(0, len(tokens), 6):
        corners = [float(tok)/scale for tok in tokens[start+2:start+6]]
        if len(corners) == 4:
            bboxes.append(corners)

    for box in bboxes:
        top_left = (int(box[0]), int(box[1]))
        bottom_right = (int(box[2]), int(box[3]))
        img = cv2.rectangle(img, top_left, bottom_right, color, thickness)
    img = cv2.resize(img, (500,500))
    imgs.append(img)

plot_imgs(imgs, cmap=None)

# Indeterminate Appearance only

In [None]:
imgs = []
thickness = 3
scale = 5

# First 16 rows flagged 'Indeterminate Appearance', each drawn with its boxes.
indeterminate_rows = train[train['Indeterminate Appearance'] == 1].iloc[:16]
for _, row in indeterminate_rows.iterrows():
    study_id = row['StudyInstanceUID']
    img_path = glob(f'{dataset_dir}/train/{study_id}/*/*')[0]
    # Shrink by `scale`, then stack three copies along a new last axis so
    # coloured rectangles can be drawn.
    img = cv2.resize(dicom2array(path=img_path), None, fx=1/scale, fy=1/scale)
    img = np.stack([img] * 3, axis=-1)

    color = label2color[str(row[class_names].values.tolist())]

    # Six space-separated tokens per box; the last four are the corners.
    tokens = row['label'].split(' ')
    bboxes = []
    for start in range(0, len(tokens), 6):
        corners = [float(tok)/scale for tok in tokens[start+2:start+6]]
        if len(corners) == 4:
            bboxes.append(corners)

    for box in bboxes:
        top_left = (int(box[0]), int(box[1]))
        bottom_right = (int(box[2]), int(box[3]))
        img = cv2.rectangle(img, top_left, bottom_right, color, thickness)
    img = cv2.resize(img, (500,500))
    imgs.append(img)

plot_imgs(imgs, cmap=None)

# Atypical Appearance only

In [None]:
imgs = []
thickness = 3
scale = 5

# First 16 rows flagged 'Atypical Appearance', each drawn with its boxes.
atypical_rows = train[train['Atypical Appearance'] == 1].iloc[:16]
for _, row in atypical_rows.iterrows():
    study_id = row['StudyInstanceUID']
    img_path = glob(f'{dataset_dir}/train/{study_id}/*/*')[0]
    # Shrink by `scale`, then stack three copies along a new last axis so
    # coloured rectangles can be drawn.
    img = cv2.resize(dicom2array(path=img_path), None, fx=1/scale, fy=1/scale)
    img = np.stack([img] * 3, axis=-1)

    color = label2color[str(row[class_names].values.tolist())]

    # Six space-separated tokens per box; the last four are the corners.
    tokens = row['label'].split(' ')
    bboxes = []
    for start in range(0, len(tokens), 6):
        corners = [float(tok)/scale for tok in tokens[start+2:start+6]]
        if len(corners) == 4:
            bboxes.append(corners)

    for box in bboxes:
        top_left = (int(box[0]), int(box[1]))
        bottom_right = (int(box[2]), int(box[3]))
        img = cv2.rectangle(img, top_left, bottom_right, color, thickness)
    img = cv2.resize(img, (500,500))
    imgs.append(img)

plot_imgs(imgs, cmap=None)

# heatmap

In [None]:
train_with_size = pd.read_csv('../input/siim-external/train.csv') # present for you
# Keep only the non-negative rows. The explicit .copy() makes the result an
# independent frame, so the column assignments below do not write to a slice
# of the frame that was read from disk.
train_with_size = train_with_size[train_with_size['Negative for Pneumonia'] == 0].copy()
# Integer class id: 0 by default, 1 for indeterminate, 2 for atypical.
train_with_size['class_id'] = 0
train_with_size.loc[train_with_size['Indeterminate Appearance'] == 1, 'class_id'] = 1
train_with_size.loc[train_with_size['Atypical Appearance'] == 1, 'class_id'] = 2
train_with_size.head()

In [None]:
# Builds one box-coverage heatmap per class on a canvas of the mean image size
# and plots the per-class maps next to their average.

# class_id -> class name, used for the subplot titles.
num_2_label = {0: 'Typical Appearance', 1: 'Indeterminate Appearance', 2: 'Atypical Appearance'}
# Canvas shape: (mean height, mean width, 3); the last axis is indexed by class_id.
HEATMAP_SIZE = (int(train_with_size.height.mean()), int(train_with_size.width.mean()), 3)

# Initialize
heatmap = np.zeros((HEATMAP_SIZE), dtype=np.int16)
# Columns: class_id, x_min, x_max, y_min, y_max.
# NOTE(review): the scaled_* columns are presumably fractions of the image
# size, which is why they are multiplied by the mean width/height — confirm.
bbox_np = train_with_size[["class_id", "scaled_x_min", "scaled_x_max", "scaled_y_min", "scaled_y_max"]].to_numpy()
bbox_np[:, 1:3] *= int(train_with_size.width.mean())
bbox_np[:, 3:5] *= int(train_with_size.height.mean())
# Floor to integers so the values can be used as slice bounds and channel index.
bbox_np = np.floor(bbox_np).astype(np.int16)

# Color map stuff
# Four black -> palette colour -> near-white colormaps are built; the plotting
# loop below only indexes the first three (i-1 for i in 1..3).
custom_cmaps = [
    matplotlib.colors.LinearSegmentedColormap.from_list(
        colors=[(0.,0.,0.), c, (0.95,0.95,0.95)], 
        name=f"custom_{i}") for i,c in enumerate(sns.color_palette("Spectral", 4))
]

# Add 1 to the box's class channel over the rectangle covered by the box
# (the +1 makes the max edges inclusive).
for row in tqdm(bbox_np, total=bbox_np.shape[0]):
    heatmap[row[3]:row[4]+1, row[1]:row[2]+1, row[0]] += 1
    
fig = plt.figure(figsize=(20,25))
plt.suptitle("Heatmaps Showing Bounding Box Placement\n ", fontweight="bold", fontsize=16)
# Only the first four slots of the 4x4 subplot grid are filled:
# slot 0 is the mean over the class channels, slots 1-3 are the individual classes.
for i in range(4):
    plt.subplot(4, 4, i+1)
    if i==0:
        plt.imshow(heatmap.mean(axis=-1), cmap="bone")
        plt.title(f"Average of All Classes", fontweight="bold")
    else:
        plt.imshow(heatmap[:, :, i-1], cmap=custom_cmaps[i-1])
        plt.title(num_2_label[i-1], fontweight="bold")
        
    plt.axis(False)
fig.tight_layout(rect=[0, 0.03, 1, 0.97])
plt.show()

Is there a left-right difference in box placement for indeterminate & atypical?

# box area distribution

In [None]:
# We need to compare on the same scale, so the area is computed from the
# scaled_* coordinates rather than from pixel coordinates.
scaled_box_width = train_with_size['scaled_x_max'] - train_with_size['scaled_x_min']
scaled_box_height = train_with_size['scaled_y_max'] - train_with_size['scaled_y_min']
train_with_size['area'] = scaled_box_width * scaled_box_height

In [None]:
# Box-area distribution for rows flagged 'Typical Appearance'.
typical_area = train_with_size.loc[train_with_size['Typical Appearance'] == 1, 'area']
plt.figure(figsize=(12,5))
plt.title('typical only: distribution of box area (max 1)')
ax = sns.distplot(typical_area)
plt.show()

In [None]:
# Box-area distribution for rows flagged 'Indeterminate Appearance'.
indeterminate_area = train_with_size.loc[train_with_size['Indeterminate Appearance'] == 1, 'area']
plt.figure(figsize=(12,5))
plt.title('indeterminate only: distribution of box area (max 1)')
ax = sns.distplot(indeterminate_area)
plt.show()

In [None]:
# Box-area distribution for rows flagged 'Atypical Appearance'.
atypical_area = train_with_size.loc[train_with_size['Atypical Appearance'] == 1, 'area']
plt.figure(figsize=(12,5))
plt.title('atypical only: distribution of box area (max 1)')
ax = sns.distplot(atypical_area)
plt.show()

・As the heat map also suggests, 'typical' boxes seem to cover the largest area.  
・For atypical, small boxes are in the majority.  
Do the boxes get bigger because annotators label typical examples more confidently?  
Or is the marked region larger because typical examples are at a more advanced disease stage?  
I don't know the truth, but it's fun to imagine the annotation process 😌  

Update:
The following URL describes the annotations.  
https://www.kaggle.com/c/siim-covid19-detection/discussion/240250

# plot small boxes only

In [None]:
# Image ids of the 16 rows with the smallest box area, ignoring rows whose
# x_min is 0 (presumably placeholder boxes — TODO confirm).
boxes_by_area = train_with_size[train_with_size['x_min'] != 0].sort_values('area')
small_images = boxes_by_area['image_id'].values[:16]
# Strip the '_image' marker from `id` so it can be matched against `small_images`.
train['image_id'] = train['id'].map(lambda image_id: image_id.replace('_image', ''))

In [None]:
imgs = []

# Draw the images selected in `small_images`, with class-coloured boxes and
# the class name above each box. Uses `scale`, `thickness` and the FONT*
# settings defined in an earlier cell.
small_box_rows = train[train.image_id.isin(small_images)]
for _, row in small_box_rows.iterrows():
    study_id = row['StudyInstanceUID']
    img_path = glob(f'{dataset_dir}/train/{study_id}/*/*')[0]
    # Shrink by `scale`, then stack three copies along a new last axis so
    # coloured shapes can be drawn on the image.
    img = cv2.resize(dicom2array(path=img_path), None, fx=1/scale, fy=1/scale)
    img = np.stack([img] * 3, axis=-1)

    class_key = str(row[class_names].values.tolist())
    color = label2color[class_key]
    target = label2target[class_key]

    # Six space-separated tokens per box; the last four are the corners.
    tokens = row['label'].split(' ')
    bboxes = []
    for start in range(0, len(tokens), 6):
        corners = [float(tok)/scale for tok in tokens[start+2:start+6]]
        if len(corners) == 4:
            bboxes.append(corners)

    text_width, text_height = cv2.getTextSize(target, FONT, FONT_SCALE, FONT_THICKNESS)[0]
    for box in bboxes:
        x_min, y_min = int(box[0]), int(box[1])
        x_max, y_max = int(box[2]), int(box[3])
        img = cv2.rectangle(img, (x_min, y_min), (x_max, y_max), color, thickness)
        # Centre the class name horizontally over the box, 10 px above it.
        box_width = x_max - x_min
        text_origin = (x_min-(text_width-box_width)//2, y_min-10)
        img = cv2.putText(img, target, text_origin,
                          FONT, FONT_SCALE, color, FONT_THICKNESS, FONT_LINE_TYPE)
    img = cv2.resize(img, (500,500))
    imgs.append(img)

plot_imgs(imgs, cmap=None)

difficult... give me the doctor's eyes 😡

# compare negative / typical

In [None]:
imgs = []
# Filter once instead of on every iteration.
negative_rows = train[train['Negative for Pneumonia']==1]
typical_rows = train[train['Typical Appearance']==1]

for i in range(32):
    # Even slots show a negative row, odd slots a typical row, so the two
    # kinds alternate in the plotted grid.
    row = negative_rows.iloc[i] if i % 2 == 0 else typical_rows.iloc[i]

    study_id = row['StudyInstanceUID']
    img_path = glob(f'{dataset_dir}/train/{study_id}/*/*')[0]
    img = dicom2array(path=img_path)
    img = cv2.resize(img, None, fx=1/scale, fy=1/scale)
    img = np.stack([img, img, img], axis=-1)

    claz = row[class_names].values
    color = label2color[str(claz.tolist())]

    # Six space-separated tokens per box; the last four are the corners.
    # (Loop names are distinct from the outer `i` to avoid shadowing it.)
    tokens = row['label'].split(' ')
    bboxes = []
    for start in range(0, len(tokens), 6):
        bbox = [float(tok)/scale for tok in tokens[start+2:start+6]]
        if len(bbox) == 4:
            bboxes.append(bbox)

    # Negative rows map to colour None: draw nothing for them, so whatever
    # box values their label string carries are not rendered.
    if color is not None:
        for box in bboxes:
            img = cv2.rectangle(
                img,
                (int(box[0]), int(box[1])),
                (int(box[2]), int(box[3])),
                color, thickness
            )
    img = cv2.resize(img, (500,500))
    imgs.append(img)

plot_imgs(imgs, cmap=None)

Let's learn to recognize the positive samples 💪

# plot from submission.csv

In [None]:
CONF_THRESHOLD = 0.3

def pred_str_to_confidence_box_maps(pred_str, conf_threshold=None):
    """Parse a prediction string into a list of confident boxes.

    The string is read in groups of six whitespace-separated tokens:
    a name (ignored), a confidence, and four box coordinates.

    Args:
        pred_str: Prediction string to parse.
        conf_threshold: Boxes are kept only when their confidence is strictly
            greater than this value. Defaults to the module-level
            ``CONF_THRESHOLD`` when None.

    Returns:
        List of ``{'conf': float, 'box': [int, int, int, int]}`` dicts, in the
        order they appear in the string. Coordinates are truncated to int.
    """
    if conf_threshold is None:
        conf_threshold = CONF_THRESHOLD

    # split() without an argument tolerates repeated, leading and trailing
    # whitespace, which split(' ') would turn into empty tokens.
    tokens = pred_str.split()

    confidence_box_maps = []
    # Step through complete groups of six only; a trailing partial group is ignored.
    for start in range(0, len(tokens) - 5, 6):
        conf = float(tokens[start + 1])
        box = [int(float(tok)) for tok in tokens[start + 2:start + 6]]
        if conf > conf_threshold:
            confidence_box_maps.append({'conf': conf, 'box': box})
    return confidence_box_maps

def add_bbox(img, confidence_box_maps):
    """Draw every box and its confidence value onto ``img``.

    Uses the module-level ``thickness``, ``FONT`` and ``FONT_LINE_TYPE``
    settings defined in an earlier cell.

    Args:
        img: Image array to draw on.
        confidence_box_maps: List of dicts with a ``'conf'`` float and a
            ``'box'`` of four coordinates (the two corners of the rectangle).

    Returns:
        The image with rectangles and confidence labels drawn.
    """
    # Text size depends only on the image height, so work it out once.
    # Images taller than 2000 px are scaled against 4000, the rest against 2000.
    reference_height = 4000 if img.shape[0] > 2000 else 2000
    # Clamp to 1: integer truncation would otherwise give a zero scale or
    # thickness on small images, which cannot render any text.
    font_scale = max(1, int(7 * img.shape[0] / reference_height))
    font_thickness = max(1, int(20 * img.shape[0] / reference_height))

    for confidence_box_map in confidence_box_maps:
        conf = confidence_box_map['conf']
        box = confidence_box_map['box']
        text = f'{round(conf, 4)}'
        img = cv2.rectangle(
            img,
            (int(box[0]), int(box[1])),
            (int(box[2]), int(box[3])),
            [255,0,0], thickness
        )
        # Centre the confidence text horizontally over the box, 40 px above it.
        box_width = int(box[2]) - int(box[0])
        text_width, _ = cv2.getTextSize(text, FONT, font_scale, font_thickness)[0]
        img = cv2.putText(img, text, (int(box[0])-(text_width-box_width)//2, int(box[1])-40),
                        FONT, font_scale, [255,0,0], font_thickness, FONT_LINE_TYPE)
    return img

def print_study_conf(sub, image_id):
    """Print the study-level prediction names and confidences for an image.

    Args:
        sub: Submission DataFrame with ``id`` and ``PredictionString`` columns.
        image_id: Image id; ``<image_id>.dcm`` is looked up under the test
            folder of ``dataset_dir`` to find the study it belongs to.

    Raises:
        IndexError: If no matching DICOM file or no matching submission row exists.
    """
    # Built from dataset_dir, like the other data paths in this notebook.
    image_path = glob(f'{dataset_dir}/test/*/*/{image_id}.dcm')[0]
    # The study id is the third path component from the end.
    study_id = image_path.split('/')[-3]
    study_pred_str = sub[sub['id'].str.startswith(study_id)].PredictionString.values[0]

    # The string is read in groups of six tokens: token 0 of each group is the
    # name and token 1 the confidence.
    tokens = study_pred_str.split(' ')
    names, confs = tokens[0::6], tokens[1::6]

    print('<↓study conficence↓>')
    for name, conf in zip(names, confs):
        print(f'{name} confidence: {conf}')


def plot_with_pred(sub, image_id):
    """Plot a test image with its predicted boxes from a submission.

    Prints the study-level confidences first, then draws every image-level box
    whose confidence is greater than ``CONF_THRESHOLD``.

    Args:
        sub: Submission DataFrame with ``id`` and ``PredictionString`` columns.
        image_id: Image id; ``<image_id>.dcm`` is looked up under the test
            folder of ``dataset_dir``.

    Raises:
        IndexError: If no matching DICOM file or no matching submission row exists.
    """
    print_study_conf(sub, image_id)
    # Built from dataset_dir, like the other data paths in this notebook.
    path = glob(f'{dataset_dir}/test/*/*/{image_id}.dcm')[0]
    img = dicom2array(path)
    pred_str = sub[sub['id'].str.startswith(image_id)].PredictionString.values[0]
    confidence_box_maps = pred_str_to_confidence_box_maps(pred_str)
    if len(confidence_box_maps) == 0:
        print(f'{image_id}: There are no boxes with confidence greater than {CONF_THRESHOLD}.')
    # With no boxes this leaves the image unchanged; it is still plotted.
    img = add_bbox(img, confidence_box_maps)
    plot_img(img)


In [None]:
# Example submission; plot_with_pred reads its `id` and `PredictionString` columns.
part_of_my_sub = pd.read_csv('../input/test-subs/part_of_submission.csv')  # use your submission.csv
part_of_my_sub

In [None]:
# Test image ids to visualise; each one is plotted with the boxes predicted for it.
image_ids = ['d5911a060ee4', '25b281d5a9f3', '03a778f5a68b', '2aaab6a41f1a', '4f7f40e478b1', '88782677cbec']
for image_id in image_ids:
    plot_with_pred(part_of_my_sub, image_id) # input: submission.csv, image_id

## 😊 Please upvote if you found this helpful 😊