In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
#for dirname, _, filenames in os.walk('/kaggle/input/siim-covid19-detection/train'):
#    for filename in filenames:
#        print(os.path.join(dirname, filename))

import torch 
import cv2
import torch.nn as nn
import glob
import pydicom
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from pydicom import dcmread
import ast
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## In this notebook, I will apply the following method to solve the problem.
###    1- Join the two training CSV files
###    2- Have a look at the annotated boxes and what class is inside
By collecting the boxes, find the parts of the images that are relevant for the analysis.
Select and keep these parts of the image, then mirror the right-lung image so that both lungs share a similar layout.

###    3- Convert every train dcm images to np.array
###    4- Separate and mirror the right lung.
###    5- First try transfer learning with ResNet to classify the images.
###    6- If the performance is good enough with the training and validation dataset, I will create a test dataset by clipping the test images. 
    


# 0- Import the Data

Files

    train_study_level.csv - the train study-level metadata, with one row for each study, including correct labels.
    train_image_level.csv - the train image-level metadata, with one row for each image, including both correct labels and any bounding boxes in a dictionary format. Some images in both test and train have multiple bounding boxes.
    sample_submission.csv - a sample submission file containing all image- and study-level IDs.

train_study_level.csv

    id - unique study identifier
    Negative for Pneumonia - 1 if the study is negative for pneumonia, 0 otherwise
    Typical Appearance - 1 if the study has this appearance, 0 otherwise
    Indeterminate Appearance - 1 if the study has this appearance, 0 otherwise
    Atypical Appearance - 1 if the study has this appearance, 0 otherwise

train_image_level.csv

    id - unique image identifier
    boxes - bounding boxes in easily-readable dictionary format
    label - the correct prediction label for the provided bounding boxes

Path


In [None]:
# Root folder of the competition data, relative to the notebook.
DIR_PATH = "../input/siim-covid19-detection"

# Training DICOM folder and the two label tables that live under the root.
train_path = f"{DIR_PATH}/train"
train_img_path = f"{DIR_PATH}/train_image_level.csv"
train_stdy_path = f"{DIR_PATH}/train_study_level.csv"

# Read the image-level and the study-level label tables with pandas.
train_img_df, train_stdy_df = (
    pd.read_csv(csv_path) for csv_path in (train_img_path, train_stdy_path)
)



Let's have a look at the data.

In [None]:
# Preview the first five image-level rows.
train_img_df.head(n=5)

In [None]:
# Preview the first five study-level rows.
train_stdy_df.head(n=5)

# 1- Join the two training dataframes on the StudyInstanceUID.

In [None]:
# Strip the "_study" / "_image" suffixes so the identifiers become bare UIDs:
# `StudyInstanceUID` is the key used to merge the two tables, and the image id
# is later interpolated into the DICOM file paths.
# The vectorised .str accessor replaces the per-row Python lambda; regex=False
# keeps the plain-substring semantics of str.replace, and missing values stay
# NaN instead of raising AttributeError.
train_stdy_df['StudyInstanceUID'] = train_stdy_df['id'].str.replace('_study', '', regex=False)
train_img_df['id'] = train_img_df['id'].str.replace('_image', '', regex=False)


In [None]:
# Attach the study-level labels to every image row through the shared study UID.
# Both tables carry an `id` column, so the result exposes them as `id_x`
# (image table) and `id_y` (study table).
train_df = pd.merge(train_img_df, train_stdy_df, on='StudyInstanceUID')

In [None]:
# Preview the first five rows of the merged table.
train_df.head(n=5)

# 2- First focus on the boxes

A few functions to convert and visualize the images

In [None]:
import numpy as np
import pydicom
import re
from pydicom.pixel_data_handlers.util import apply_voi_lut
from pydicom.data import get_testdata_file

import matplotlib.pyplot as plt
%matplotlib inline

# Convert the dcm image to np.array
def dicom2array(path, voi_lut = True, fix_monochrome = True):
    """Read a DICOM file and return its pixels as an 8-bit array.

    Args:
        path: Path of the DICOM file to read.
        voi_lut: If True, pass the raw pixel array through ``apply_voi_lut``
            (uses the VOI LUT / windowing stored by the device, when present,
            to get a "human-friendly" view). If False, use the raw pixels.
        fix_monochrome: If True and the file's ``PhotometricInterpretation``
            is "MONOCHROME1", invert the intensities so the X-ray does not
            look inverted.

    Returns:
        ``np.uint8`` array, min-max rescaled to the 0-255 range. An image
        whose pixels are all equal is returned as all zeros.
    """
    # dcmread is the reader used everywhere else in this notebook; read_file
    # is only a deprecated alias of it.
    dicom = pydicom.dcmread(path)

    pixels = dicom.pixel_array
    if voi_lut:
        pixels = apply_voi_lut(pixels, dicom)

    # depending on this value, X-ray may look inverted - fix that:
    if fix_monochrome and dicom.PhotometricInterpretation == "MONOCHROME1":
        pixels = np.amax(pixels) - pixels

    # Shift the minimum to 0, then scale the maximum to 1.
    pixels = pixels - np.min(pixels)
    peak = np.max(pixels)
    if peak > 0:
        pixels = pixels / peak
    # else: constant image, already all zeros. Dividing by the zero peak would
    # produce NaNs and an undefined uint8 cast.

    return (pixels * 255).astype(np.uint8)
    
# plot some images
def plot_images(imgs, cols=4, size=7, is_rgb=True, title="", cmap='gray', img_size=(500,500)):
    """Show a sequence of images on a grid in a single figure.

    Args:
        imgs: Sequence of image arrays.
        cols: Number of grid columns.
        size: Side length (in inches) allotted to each grid cell.
        is_rgb: Unused; kept so that existing calls keep working.
        title: Figure-level title.
        cmap: Colormap handed to ``imshow``.
        img_size: Target size passed to ``cv2.resize`` before display, or
            None to display each image at its own size.
    """
    # Ceiling division gives exactly the rows needed. The former
    # `len(imgs)//cols + 1` added an empty row whenever the number of images
    # was a multiple of `cols`. Keep at least one row so an empty list still
    # produces a valid (blank) figure.
    rows = max(1, -(-len(imgs) // cols))
    fig = plt.figure(figsize=(cols*size, rows*size))
    for i, img in enumerate(imgs):
        if img_size is not None:
            img = cv2.resize(img, img_size)
        # Draw on the subplot just created rather than on pyplot's implicit
        # "current axes".
        subplot_ax = fig.add_subplot(rows, cols, i+1)
        subplot_ax.imshow(img, cmap=cmap)
    fig.suptitle(title)
    plt.show()
    
# Plot a single image
def plot_image(img, title="", figsize=(8,8), cmap=None):
    """Display one image in its own figure, with a bold title and no axes.

    Args:
        img: Image array handed to ``plt.imshow``.
        title: Text shown above the image.
        figsize: Figure size forwarded to ``plt.figure``.
        cmap: Colormap for ``plt.imshow``; a falsy value (None, "") means the
            argument is not passed at all.
    """
    plt.figure(figsize=figsize)

    # Only forward `cmap` when the caller actually provided one.
    imshow_kwargs = {"cmap": cmap} if cmap else {}
    plt.imshow(img, **imshow_kwargs)

    plt.title(title, fontweight="bold")
    plt.axis(False)
    plt.show()
    
def get_image_id(path):
    """Return the image id: the file name in `path` without its last extension.

    Args:
        path: "/"-separated path, or a bare file name.

    Returns:
        The text after the last "/" (the whole string when there is none),
        truncated at its last ".".
    """
    # [-1] instead of [1]: a bare file name contains no "/", so rsplit returns
    # a single element and indexing [1] used to raise IndexError.
    file_name = path.rsplit("/", 1)[-1]
    return file_name.rsplit(".", 1)[0]

def get_coordinates_boxes(box):
    """Parse a `boxes` annotation string into one row per bounding box.

    Args:
        box: String holding a Python-literal list of dicts (or a single
            dict), each with the keys 'x', 'y', 'width' and 'height', e.g.
            "[{'x': 1.5, 'y': 2.0, 'width': 3.25, 'height': 4.0}]".

    Returns:
        DataFrame with the float columns ['x', 'y', 'width', 'height'], one
        row per box, in the order the boxes appear in the string.

    Raises:
        ValueError, SyntaxError: If `box` is not a valid Python literal.
    """
    # Parse the literal instead of scraping digits: the former regex
    # r' [0-9]+\.*' stopped right after the decimal point, so 789.28836 was
    # read as 789.0, and it relied on the keys always appearing in the order
    # x, y, width, height.
    parsed = ast.literal_eval(box)

    # Accept a lone dict as a one-box annotation.
    if isinstance(parsed, dict):
        parsed = [parsed]

    # Columns are selected by key, so key order in the string is irrelevant.
    return pd.DataFrame(list(parsed), columns=['x', 'y', 'width', 'height'], dtype=float)

def get_rows_boxes(box, infos_img) :
    """Build the per-box table of one image, enriched with image metadata.

    Args:
        box: Annotation string, forwarded to ``get_coordinates_boxes``.
        infos_img: Indexable with, in order: image width, image height,
            image id, study id, image path.

    Returns:
        The DataFrame from ``get_coordinates_boxes`` with extra columns: the
        box origin, far corner and size expressed as a percentage of the
        image width/height (``*_rel`` columns), followed by the five
        `infos_img` values repeated on every row.
    """
    coords = get_coordinates_boxes(box)
    img_width, img_height = infos_img[0], infos_img[1]

    # Keyword order below fixes the column order of the result.
    return coords.assign(
        # Relative coordinates (%)
        x_rel=100.*coords['x']/img_width,
        y_rel=100.*coords['y']/img_height,
        x_rel_max=100.*(coords['x']+coords['width'])/img_width,
        y_rel_max=100.*(coords['y']+coords['height'])/img_height,
        width_rel=100.*coords['width']/img_width,
        height_rel=100.*coords['height']/img_height,
        # Image-level metadata, identical on every row
        img_width=img_width,
        img_height=img_height,
        img_id=infos_img[2],
        study_id=infos_img[3],
        img_path=infos_img[4],
    )
  

def draw_bboxes(img, tl, br, rgb, label="", label_location="tl", opacity=0.1, line_thickness=0):
    """Tint a rectangular region of `img` and optionally outline and label it.

    Args:
        img: Image array indexed as [row, column, channel]. The tint patch is
            built with 3 channels, so `img` must have 3 channels too. The
            region is written back into `img`, i.e. it is modified in place.
        tl: (x, y) of the top-left corner, in integer pixel coordinates.
        br: (x, y) of the bottom-right corner, in integer pixel coordinates.
        rgb: 3-component colour used for the tint, the outline and the text.
        label: Text or class index to print. "" or None prints nothing. A
            str is upper-cased with spaces turned into underscores; any other
            value is rendered as ``CLASS_<label:02>``.
        label_location: Corner the label is anchored to: "tl", "br", "tr"
            or "bl".
        opacity: Weight of the tint in the blend (the image gets 1-opacity).
        line_thickness: Outline thickness; no outline is drawn unless > 0.

    Returns:
        The annotated image.

    Raises:
        KeyError: If a label is requested and `label_location` is not one of
            the four supported corners.
    """
    # Solid colour patch with the size of the box.
    rect = np.uint8(np.ones((br[1]-tl[1], br[0]-tl[0], 3))*rgb)
    # Blend patch and image region; the trailing 1.0 is addWeighted's scalar
    # offset, added to every blended value.
    sub_combo = cv2.addWeighted(img[tl[1]:br[1],tl[0]:br[0],:], 1-opacity, rect, opacity, 1.0)
    img[tl[1]:br[1],tl[0]:br[0],:] = sub_combo

    if line_thickness>0:
        img = cv2.rectangle(img, tuple(tl), tuple(br), rgb, line_thickness)

    # Explicit emptiness test: a plain `if label:` also skipped the integer
    # class 0, which is falsy but is a legitimate class index.
    if label is not None and label != "":
        # DEFAULTS
        FONT = cv2.FONT_HERSHEY_SIMPLEX
        FONT_SCALE = 1.666
        FONT_THICKNESS = 3
        FONT_LINE_TYPE = cv2.LINE_AA

        if type(label)==str:
            LABEL = label.upper().replace(" ", "_")
        else:
            LABEL = f"CLASS_{label:02}"

        text_width, text_height = cv2.getTextSize(LABEL, FONT, FONT_SCALE, FONT_THICKNESS)[0]

        # Anchor corner, then an offset that keeps the text next to the box:
        # above it for the top corners, below it for the bottom corners, and
        # shifted left by the text width for the right-hand corners.
        label_origin = {"tl":tl, "br":br, "tr":(br[0],tl[1]), "bl":(tl[0],br[1])}[label_location]
        label_offset = {
            "tl":np.array([0, -10]), "br":np.array([-text_width, text_height+10]),
            "tr":np.array([-text_width, -10]), "bl":np.array([0, text_height+10])
        }[label_location]

        # Convert to plain Python ints: iterating a NumPy array yields NumPy
        # scalars. NOTE(review): some OpenCV builds are reported to reject
        # those as point coordinates - confirm against the installed version.
        text_origin = tuple(int(v) for v in np.asarray(label_origin) + label_offset)
        img = cv2.putText(img, LABEL, text_origin,
                          FONT, FONT_SCALE, rgb, FONT_THICKNESS, FONT_LINE_TYPE)

    return img

Information about the DICOM image.
For example, how to access the patient ID.

In [None]:
# Take the first merged row and build the glob pattern of its DICOM file; the
# folder between the study UID and the file name is matched with a wildcard.
row_0 = train_df.iloc[0]
first_study_uid, first_image_uid = row_0['StudyInstanceUID'], row_0['id_x']
dcm_path = "{}/{}/*/{}.dcm".format(train_path, first_study_uid, first_image_uid)

# For every match: show the path, decode the pixels, then dump the dataset.
first_matches = glob.glob(dcm_path)
for filename in first_matches:
    print(filename)
    img_array = dicom2array(filename)
    ds = pydicom.dcmread(filename)
    print(ds)

## Create new dataframe with every box

In [None]:
# Column order of the two tables written to disk.
box_columns = ['x', 'y', 'width', 'height', 'x_rel', 'x_rel_max',
               'y_rel', 'y_rel_max','width_rel', 'height_rel',
               'img_width', 'img_height','img_id', 'study_id',
               'img_path']
no_box_columns = ['img_width', 'img_height',
                  'img_id', 'study_id','img_path']

l_boxes = []            # one per-image DataFrame of boxes
l_images_no_box = []    # info lists of the images without any box
l_images_missing = []   # ids of the images whose DICOM file was not found

for _, row in train_df.iterrows():
    dcm_path = "{}/{}/*/{}.dcm".format(train_path,row['StudyInstanceUID'], row['id_x'])
    matches = glob.glob(dcm_path)
    if not matches:
        # Previously `ds`/`filename` silently kept the values of the previous
        # image (or were undefined on the first row). Skip and report instead.
        l_images_missing.append(row['id_x'])
        continue

    # Same pick as the former loop, which kept the last match.
    filename = matches[-1]
    # Only Rows/Columns are needed, so do not decode the pixel data.
    ds = pydicom.dcmread(filename, stop_before_pixels=True)

    infos_img = [ds.Columns, ds.Rows, row['id_x'], row['StudyInstanceUID'], filename]
    if pd.isna(row['boxes']):
        l_images_no_box.append(infos_img)
    else:
        l_boxes.append(get_rows_boxes(str(row['boxes']), infos_img))

# Concatenate once at the end: growing a DataFrame with pd.concat inside the
# loop copies all accumulated rows on every iteration.
if l_boxes:
    boxes_df = pd.concat(l_boxes, ignore_index=True).reindex(columns=box_columns)
else:
    boxes_df = pd.DataFrame(columns=box_columns)

no_boxes_df = pd.DataFrame(l_images_no_box, columns=no_box_columns)

if l_images_missing:
    print(f"{len(l_images_missing)} image(s) skipped: no DICOM file found")

# Save the df in a csv
boxes_df.to_csv('/kaggle/working/boxes_df.csv')
no_boxes_df.to_csv('/kaggle/working/no_boxes_df.csv')

Samples of chest images.

In [None]:
# Load the two box tables back from CSV files (read from the "boxes-infos"
# input folder).
boxes_csv_path = '../input/boxes-infos/boxes_df.csv'
no_boxes_csv_path = '../input/boxes-infos/no_boxes_df.csv'

boxes_df = pd.read_csv(boxes_csv_path)
no_boxes_df = pd.read_csv(no_boxes_csv_path)

## Show the values distribution
I worked with relative values: x and y are expressed as a percentage of the image width and height, respectively.


In [None]:
# Relative box corners as NumPy arrays: the (x_rel, y_rel) origin of every box
# and its opposite (x_rel_max, y_rel_max) corner.
x_min, y_min, x_max, y_max = (
    boxes_df[column].to_numpy()
    for column in ('x_rel', 'y_rel', 'x_rel_max', 'y_rel_max')
)

def scatter_hist_1d(x_1, y_1, ax, ax_histx, ax_histy, color = 'b'):
    """Scatter `y_1` against `x_1` and histogram each variable on its own axes.

    Args:
        x_1: Values plotted along x and histogrammed on `ax_histx`.
        y_1: Values plotted along y and histogrammed (horizontally) on
            `ax_histy`.
        ax: Axes receiving the scatter plot.
        ax_histx: Axes receiving the histogram of `x_1`.
        ax_histy: Axes receiving the histogram of `y_1`.
        color: Colour of the scatter points.
    """
    # Hide the tick labels on the side of each histogram.
    for hist_ax, tick_kwargs in (
        (ax_histx, {"axis": "x", "labelbottom": False}),
        (ax_histy, {"axis": "y", "labelleft": False}),
    ):
        hist_ax.tick_params(**tick_kwargs)

    ax.scatter(x_1, y_1, c=color)

    # Fixed bins: edges every 2 units from 0 to 100 inclusive.
    bin_edges = np.arange(0, 101, 2)
    ax_histx.hist(x_1, bins=bin_edges)
    ax_histy.hist(y_1, bins=bin_edges, orientation='horizontal')

In [None]:
# Axes rectangles, as [left, bottom, width, height] lists: one for the scatter
# panel and one per histogram, separated from the scatter panel by `spacing`.
left = bottom = 0.1
width = height = 0.65
spacing = 0.005

rect_scatter = [left, bottom, width, height]
rect_histx = [left, bottom + height + spacing, width, 0.2]
rect_histy = [left + width + spacing, bottom, 0.2, height]

# Square figure; each histogram shares the matching axis of the scatter panel.
fig = plt.figure(figsize=(15, 15))
ax = fig.add_axes(rect_scatter)
ax_histx, ax_histy = (
    fig.add_axes(rect_histx, sharex=ax),
    fig.add_axes(rect_histy, sharey=ax),
)

# First the (x_min, y_min) corners in blue, then the (x_max, y_max) corners in
# red, drawn on the same axes.
for corner_x, corner_y, corner_color in ((x_min, y_min, 'b'), (x_max, y_max, 'r')):
    scatter_hist_1d(corner_x, corner_y, ax, ax_histx, ax_histy, color=corner_color)

plt.show()

In [None]:
import matplotlib.patches as patches

# Create the figure and its axes in one call. The former separate
# plt.figure(figsize=(15, 15)) left an empty figure behind, while the
# rectangles were drawn on a second, default-sized figure from plt.subplots().
fig, ax = plt.subplots(figsize=(15, 15))

# One filled rectangle per annotation box, in relative (%) coordinates.
for box_x, box_y, box_x_max, box_y_max in zip(x_min, y_min, x_max, y_max):
    ax.add_patch(
        patches.Rectangle(
            (box_x, box_y),
            box_x_max - box_x,
            box_y_max - box_y,
            edgecolor = 'blue',
            facecolor = 'red',
            fill=True
        )
    )

# add_patch does not rescale the view, so without this call the axes keep
# their default 0-1 limits and the boxes fall outside the visible area.
ax.autoscale_view()

plt.show()

### At first, I thought it would be possible to focus on each lung and to crop the image to keep only its useful part.
### However, the annotation boxes appear to cover the whole image.

### Following that, the dimensions of the boxes are shown below.

In [None]:
# Box extents along each axis, in the same relative units as the corners.
delta_x, delta_y = x_max - x_min, y_max - y_min

# Same layout as the corner plot: square figure, scatter panel plus one
# histogram per axis sharing the scatter panel's axis.
fig = plt.figure(figsize=(15, 15))
ax = fig.add_axes(rect_scatter)
ax_histx, ax_histy = (
    fig.add_axes(rect_histx, sharex=ax),
    fig.add_axes(rect_histy, sharey=ax),
)

# Width against height of every box, with their distributions.
scatter_hist_1d(x_1=delta_x, y_1=delta_y, ax=ax, ax_histx=ax_histx, ax_histy=ax_histy)

plt.show()