In [2]:
!pip install imgaug

Defaulting to user installation because normal site-packages is not writeable
Collecting imgaug
  Using cached imgaug-0.4.0-py2.py3-none-any.whl (948 kB)
Installing collected packages: imgaug
Successfully installed imgaug-0.4.0


In [3]:
import os
import gc
import sys
import time
import json
import glob
import random
from pathlib import Path
import pandas as pd

from PIL import Image
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from imgaug import augmenters as iaa

import itertools
from tqdm import tqdm
import pydicom
from pydicom.pixel_data_handlers.util import apply_voi_lut

import warnings 
warnings.filterwarnings("ignore")

In [4]:
# Folder that holds the training DICOM files.
training_folder = "/scratch/kbylapud/Vindr_local/vinbd_dicom/train/"

# Read the annotation table, then keep only the rows whose class_id is below
# 14 and whose rad_id is 'R9'.
df = (
    pd.read_csv("/scratch/kbylapud/Vindr_local/vinbd_dicom/train.csv")
    .query("class_id<14")
    .query("rad_id=='R9'")
)

In [5]:
# Add two derived columns:
#   several_issues - True for every row whose image_id already appeared in an
#                    earlier row (the first occurrence stays False).
#   box_size       - area of the bounding box, (y_max - y_min) * (x_max - x_min).
# The area is computed with column arithmetic rather than a Python loop over
# iterrows(), and assign() returns a new frame instead of writing columns into
# the filtered frame produced by the loading cell.
df = df.assign(
    several_issues=df.duplicated(subset=["image_id"]),
    box_size=(df["y_max"] - df["y_min"]) * (df["x_max"] - df["x_min"]),
)

In [6]:
# Distinct class names present in the filtered annotations, in order of first appearance.
df["class_name"].unique()

array(['Pleural thickening', 'Nodule/Mass', 'Pulmonary fibrosis',
       'Atelectasis', 'Aortic enlargement', 'Other lesion',
       'Lung Opacity', 'ILD', 'Cardiomegaly', 'Infiltration',
       'Pleural effusion', 'Consolidation', 'Calcification',
       'Pneumothorax'], dtype=object)

In [7]:
# Mean bounding-box area for each class_id.
df["box_size"].groupby(df["class_id"]).mean()

class_id
0     113737.328217
1     402822.769231
2      88929.848993
3     405252.227218
4     301123.621891
5     601350.573883
6     378289.608200
7     240435.082534
8      62540.092182
9     221830.790937
10    237380.216268
11     52477.879562
12    846184.200000
13    151448.671599
Name: box_size, dtype: float64

In [9]:
# Class ids to keep. An earlier experiment used the five-class subset
# ["Aortic enlargement", "Cardiomegaly", "ILD", "Lung Opacity", "Pleural effusion"].
selected_classes = [0,1,2,3,4,5,6,7,8,9,10,11,12,13]

# Explicit copy so later cells can add columns without touching `df`.
filtered_df = df.query("class_id in @selected_classes").copy()

# Build the class names in the same order as selected_classes, so that
# category_list[i] is the name of selected_classes[i]. A list in the order
# returned by df.class_name.unique() does not line up with the ids (for
# example 'Pleural thickening' is class_id 11 but comes first in that order).
# A selected class with no rows in filtered_df raises KeyError here.
# NOTE(review): assumes downstream code indexes category_list by reclass_id - confirm.
_name_by_class_id = (
    filtered_df.drop_duplicates(subset="class_id")
    .set_index("class_id")["class_name"]
)
category_list = [_name_by_class_id[class_id] for class_id in selected_classes]

In [10]:
# Lookup from the original class_id (as a string) to the value stored in reclass_id.
selected_classes_dict = {"0":0,"1":1,"2":2,"3":3,"4":4,"5":5,"6":6,"7":7,"8":8,"9":9,"10":10,"11":11,"12":12,"13":13}

# Iterate over the class_id column only (no per-row Series from iterrows()),
# and attach the result with assign() so the cell yields a fresh frame rather
# than writing into one derived from `df`. An id missing from the lookup still
# raises KeyError.
filtered_df = filtered_df.assign(
    reclass_id=[selected_classes_dict[str(class_id)] for class_id in filtered_df["class_id"]]
)

In [11]:
# Display the filtered annotations, including the reclass_id column added above.
filtered_df

Unnamed: 0,image_id,class_name,class_id,rad_id,x_min,y_min,x_max,y_max,several_issues,box_size,reclass_id
5,1c32170b4af4ce1a3030eb8167753b06,Pleural thickening,11,R9,627.0,357.0,947.0,433.0,False,24320.0,11
7,47ed17dcb2cbeec15182ed335a8b5a9e,Nodule/Mass,8,R9,557.0,2352.0,675.0,2484.0,False,15576.0,8
9,afb6230703512afc370f236e8fe98806,Pulmonary fibrosis,13,R9,1857.0,1607.0,2126.0,2036.0,False,115401.0,13
11,18a61a07e6f5f13ebfee57fa36cd8b6f,Pulmonary fibrosis,13,R9,393.0,283.0,822.0,643.0,False,154440.0,13
17,80caa435b6ab5edaff4a0a758ffaec6e,Atelectasis,1,R9,331.0,462.0,1384.0,2365.0,False,2003859.0,1
...,...,...,...,...,...,...,...,...,...,...,...
67885,f4da0959856bcdd4b91e0c6949b109e3,Aortic enlargement,0,R9,1567.0,827.0,1830.0,1124.0,False,78111.0,0
67889,d59d5dcc1601a29509f91dab5f8550bc,Lung Opacity,7,R9,1678.0,1097.0,1824.0,1434.0,True,49202.0,7
67897,857b9d89572e03adb17d0630b33709ea,Pleural thickening,11,R9,1314.0,494.0,1815.0,825.0,True,165831.0,11
67903,b53d1dd80e99ca6bcef9d592f65d3321,Pleural effusion,10,R9,240.0,1550.0,562.0,2001.0,True,145222.0,10


In [12]:
def get_mask(img_dimensions, x_min, y_min, x_max, y_max):
    """Return a float32 mask that is 255 inside a box and 0 everywhere else.

    img_dimensions is unpacked as (height, width). The box covers rows
    y_min up to (not including) y_max and columns x_min up to (not
    including) x_max.
    """
    rows, cols = img_dimensions
    box_mask = np.zeros((rows, cols), dtype=np.float32)
    box_mask[y_min:y_max, x_min:x_max] = 255
    return box_mask


def rle_encoding(x):
    """Run-length encode the pixels of `x` that equal 255.

    The array is scanned in column-major order (transpose, then flatten).
    The result is a space-separated string of "start length" pairs where
    start is the 1-based flat position of the first pixel of a run. An
    array without any 255 yields the empty string.
    """
    hits = np.flatnonzero(x.T.flatten() == 255)
    if hits.size == 0:
        return ' '.join([])

    # Positions inside `hits` after which the next hit is not adjacent.
    gap_after = np.flatnonzero(np.diff(hits) > 1)
    run_first = np.concatenate(([0], gap_after + 1))
    run_last = np.concatenate((gap_after, [hits.size - 1]))

    starts = hits[run_first] + 1
    lengths = hits[run_last] - hits[run_first] + 1

    # Interleave as start, length, start, length, ...
    interleaved = np.column_stack((starts, lengths)).ravel()
    return ' '.join([str(value) for value in interleaved])

In [13]:
def read_xray(path, voi_lut = True, fix_monochrome = True):
    """Read a DICOM file and return its pixels as a uint8 array scaled to 0-255.

    Args:
        path: Location of the DICOM file.
        voi_lut: When true, the raw pixel array is passed through
            apply_voi_lut; otherwise the raw pixel array is used as is.
        fix_monochrome: When true, images whose PhotometricInterpretation is
            "MONOCHROME1" are inverted (max - value).

    Returns:
        A numpy uint8 array: the pixel data shifted so its minimum is 0 and
        scaled so its maximum is 255. A constant image comes back as all zeros.
    """
    # dcmread is the supported reader; read_file was a deprecated alias and is
    # no longer available in pydicom 3.x.
    dicom = pydicom.dcmread(path)

    # VOI LUT (if available by DICOM device) is used to transform raw DICOM data to "human-friendly" view
    if voi_lut:
        data = apply_voi_lut(dicom.pixel_array, dicom)
    else:
        data = dicom.pixel_array

    # depending on this value, X-ray may look inverted - fix that:
    if fix_monochrome and dicom.PhotometricInterpretation == "MONOCHROME1":
        data = np.amax(data) - data

    data = data - np.min(data)
    peak = np.max(data)
    # Skip the division for a constant image: 0 / 0 would produce NaN, which
    # turns into arbitrary values when cast to uint8.
    if peak > 0:
        data = data / peak
    data = (data * 255).astype(np.uint8)

    return data

In [17]:
# Output folder for the resized JPEGs.
resized_folder = "/scratch/kbylapud/Vindr_local/vinbd_dicom/resized_train/"
# makedirs with exist_ok=True lets this cell be re-run (os.mkdir raises
# FileExistsError the second time) and also creates missing parent folders.
os.makedirs(resized_folder, exist_ok=True)

In [14]:
# Number of annotation rows for each class_id.
filtered_df.groupby("class_id")["image_id"].count()

class_id
0     2541
1       78
2      298
3     1668
4      201
5      291
6      439
7     1042
8      857
9      971
10    1045
11    2466
12      75
13    1757
Name: image_id, dtype: int64

In [30]:
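# Disabled: per-class balancing that keeps the first `samples_per_class` rows of each class.
# Note before re-enabling: DataFrame.append was removed in pandas 2.0; collect the
# per-class slices in a list and combine them once with pd.concat instead.
# balanced_filtered_df = pd.DataFrame()
# samples_per_class = 500
# for class_name in filtered_df.class_name.unique():
#     balanced_filtered_df = balanced_filtered_df.append(filtered_df.query("class_name==@class_name")[:samples_per_class], 
#                                                        ignore_index=True)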

In [31]:
# Take the first 100 annotation rows by position; no per-class balancing is applied here.
balanced_filtered_df = filtered_df.iloc[:100]

In [32]:
# Display the 100-row subset selected in the previous cell.
balanced_filtered_df

Unnamed: 0,image_id,class_name,class_id,rad_id,x_min,y_min,x_max,y_max,several_issues,box_size,reclass_id
5,1c32170b4af4ce1a3030eb8167753b06,Pleural thickening,11,R9,627.0,357.0,947.0,433.0,False,24320.0,11
7,47ed17dcb2cbeec15182ed335a8b5a9e,Nodule/Mass,8,R9,557.0,2352.0,675.0,2484.0,False,15576.0,8
9,afb6230703512afc370f236e8fe98806,Pulmonary fibrosis,13,R9,1857.0,1607.0,2126.0,2036.0,False,115401.0,13
11,18a61a07e6f5f13ebfee57fa36cd8b6f,Pulmonary fibrosis,13,R9,393.0,283.0,822.0,643.0,False,154440.0,13
17,80caa435b6ab5edaff4a0a758ffaec6e,Atelectasis,1,R9,331.0,462.0,1384.0,2365.0,False,2003859.0,1
...,...,...,...,...,...,...,...,...,...,...,...
423,61dd944679b8bfd6d5f5a84c9e7738e3,Aortic enlargement,0,R9,1123.0,756.0,1416.0,1107.0,False,102843.0,0
426,b5822471fa3aef526081f6a64d7bec2f,Pulmonary fibrosis,13,R9,233.0,2310.0,388.0,2389.0,False,12245.0,13
429,317507bb22f6c10f4f08e8666c0f6ff8,Aortic enlargement,0,R9,1139.0,825.0,1419.0,1067.0,False,67760.0,0
440,90f4166c777d2ecf85440797c6ee774c,Aortic enlargement,0,R9,1376.0,729.0,1616.0,914.0,False,44400.0,0


In [33]:
# Build one record per image: its class ids and the run-length-encoded mask of
# every annotated box, at the resized resolution. The resized X-ray is written
# to resized_folder as a JPEG along the way.
diagnostic_per_image = []

image_size=512

# Group by image so each image is handled exactly once. Looping over every
# annotation row and re-querying by image_id would append an image with k
# annotations k times and decode its DICOM k*k times. sort=False keeps images
# in order of first appearance.
image_groups = balanced_filtered_df.groupby("image_id", sort=False)

for image_id, image_df in tqdm(image_groups, total=image_groups.ngroups):
    # Decode, resize and save the image once; none of this depends on the
    # individual annotation.
    dicom_image = read_xray(training_folder+image_id+".dicom")
    image_dimensions = dicom_image.shape
    resized_img = cv2.resize(dicom_image, (image_size,image_size), interpolation = cv2.INTER_AREA)
    # cv2.imwrite reports failure through its return value instead of raising.
    if not cv2.imwrite(resized_folder+image_id+".jpg", resized_img):
        raise OSError("could not write " + resized_folder+image_id+".jpg")

    class_list = []
    RLE_list = []
    for diagnostic in image_df.itertuples(index=False):
        class_list.append(diagnostic.reclass_id)

        mask = get_mask(image_dimensions, int(diagnostic.x_min), int(diagnostic.y_min), int(diagnostic.x_max), int(diagnostic.y_max))
        # Nearest-neighbour keeps the mask strictly 0/255. rle_encoding only
        # counts pixels equal to 255, so the blended edge values produced by
        # the default bilinear resize would be silently dropped.
        resized_mask = cv2.resize(mask, (image_size,image_size), interpolation = cv2.INTER_NEAREST)
        RLE_list.append(rle_encoding(resized_mask))

    diagnostic_per_image.append({"image_id":image_id,
                                 "CategoryId":class_list,
                                 "EncodedPixels":RLE_list})

100%|██████████| 100/100 [01:16<00:00,  1.31it/s]


In [None]:
# Tabulate the collected records and attach the resized image dimensions
# as constant Height / Width columns.
samples_df = pd.DataFrame(diagnostic_per_image).assign(
    Height=image_size,
    Width=image_size,
)