In [1]:
!pip install -q tensorflow-io
!pip install ensemble-boxes

Collecting ensemble-boxes
  Downloading ensemble_boxes-1.0.4-py3-none-any.whl (14 kB)
Installing collected packages: ensemble-boxes
Successfully installed ensemble-boxes-1.0.4


In [2]:
import pandas as pd
import numpy as np
import os
import cv2
from tqdm import tqdm

import tensorflow as tf
import tensorflow_io as tfio
from ensemble_boxes import *

import matplotlib.pyplot as plt
import matplotlib.patches as patches
import seaborn as sns


In [3]:
# Load the raw annotations and keep only the rows whose class_id is not 14.
# .copy() detaches the filtered frame from the one read from disk, so that
# later column assignments on it do not raise SettingWithCopyWarning.
df = pd.read_csv(r'../input/vinbigdata-chest-xray-abnormalities-detection/train.csv')
df = df[df['class_id'] != 14].copy()

# Ids of the images that still have at least one annotation after filtering.
images = df.image_id.unique()

print(f'Total records: {len(df)}')
print(f'Number of images: {len(images)}')

Total records: 36096
Number of images: 4394


In [4]:
# Lookup table between class names and class ids: one row per distinct
# (class_name, class_id) pair, skipping any row labelled "No finding".
LabelMap = (
    df.loc[df["class_name"].ne("No finding"), ["class_name", "class_id"]]
    .drop_duplicates()
    .reset_index(drop=True)
)

LabelMap

Unnamed: 0,class_name,class_id
0,Cardiomegaly,3
1,Aortic enlargement,0
2,Pleural thickening,11
3,ILD,5
4,Nodule/Mass,8
5,Pulmonary fibrosis,13
6,Lung Opacity,7
7,Atelectasis,1
8,Other lesion,9
9,Infiltration,6


In [7]:
path=r'../input/vinbigdata-chest-xray-abnormalities-detection/train/'

# For every annotated image, merge overlapping boxes of the same class with
# Weighted Boxes Fusion (WBF) and collect the fused boxes in `df_reduce`.
#
# The per-image results are gathered in a list and concatenated once after the
# loop. This keeps the cell idempotent (re-running it rebuilds `df_reduce`
# instead of appending to the previous run's result) and avoids re-copying the
# growing frame on every iteration.
box_columns = ["x_min", "y_min", "x_max", "y_max"]
fused_frames = []

for image in tqdm(images):

    # The image is decoded only to obtain its pixel dimensions.
    image_bytes = tf.io.read_file(path+image+'.dicom')
    img = tfio.image.decode_dicom_image(image_bytes, dtype = tf.uint16)

    # Drop the leading axis so the shape unpacks as (height, width, channels).
    img = tf.squeeze(img, axis = 0)

    h,w,_ = img.shape

    # Work on a copy so the column assignments below neither touch `df` nor
    # raise SettingWithCopyWarning.
    image_annotations = df[df['image_id']==image].copy()

    # ensemble_boxes works on coordinates normalised to [0, 1].
    image_annotations[["x_min", "x_max"]] = image_annotations[["x_min", "x_max"]]/w
    image_annotations[["y_min", "y_max"]] = image_annotations[["y_min", "y_max"]]/h

    boxes_list = image_annotations[box_columns].values.tolist()
    # Every annotation gets the same score of 1.
    scores_list = [1]*len(boxes_list)
    labels_list = list(image_annotations["class_id"])

    # Applying WBF; all annotations of the image are passed as a single "model".
    boxes, _, labels = weighted_boxes_fusion(boxes_list = [boxes_list],
                                             scores_list = [scores_list],
                                             labels_list = [labels_list],
                                             weights = None,
                                             iou_thr = 0.5,
                                             skip_box_thr = 0.0001)

    # Scale the fused boxes back to pixel coordinates and attach the metadata.
    # The column order (boxes, image_id, height, width, class_id, class_name)
    # is kept exactly as before.
    fused = pd.DataFrame(boxes, columns = box_columns)
    fused[["x_min", "x_max"]] = fused[["x_min", "x_max"]]*w
    fused[["y_min", "y_max"]] = fused[["y_min", "y_max"]]*h
    fused['image_id']=image
    fused['height']=h
    fused['width']=w
    fused_labels = pd.DataFrame(labels, columns = ["class_id"])

    fused = pd.concat([fused,fused_labels],axis=1)
    fused = fused.merge(LabelMap,on = "class_id", how="left")

    fused_frames.append(fused)

if fused_frames:
    df_reduce = pd.concat(fused_frames, axis=0)
else:
    # No images to process: still define `df_reduce` with the expected columns.
    df_reduce = pd.DataFrame(
        columns = box_columns + ["image_id", "height", "width", "class_id", "class_name"]
    )


100%|██████████| 4394/4394 [1:54:51<00:00,  1.57s/it]  


In [None]:
# Renumber the rows from 0 and fix the column dtypes in one chain: the box
# coordinates and image dimensions become int, class_id becomes str.
df_reduce = (
    df_reduce
    .reset_index(drop=True)
    .astype({
        **dict.fromkeys(["x_min", "y_min", "x_max", "y_max", "height", "width"], int),
        "class_id": str,
    })
)

In [None]:
import random
def list_color(class_list):
    """Assign a randomly drawn RGB colour to every entry of ``class_list``.

    Each colour is a list of three floats, obtained by sampling three distinct
    integers from 0..255 and dividing each of them by 256.

    Returns a dict that maps every class id to its colour.
    """
    return {
        classid: [channel / 256 for channel in random.sample(range(256), 3)]
        for classid in class_list
    }

In [None]:
def _draw_boxes(ax, boxes_df, dict_color):
    """Draw one labelled rectangle on ``ax`` for every row of ``boxes_df``.

    Columns are read by name (x_min, y_min, x_max, y_max, class_id,
    class_name), so the result does not depend on the column order of the frame.
    ``dict_color`` maps an integer class id to the edge colour of its boxes.
    """
    for x_min, y_min, x_max, y_max, class_id, class_name in zip(
        boxes_df["x_min"], boxes_df["y_min"],
        boxes_df["x_max"], boxes_df["y_max"],
        boxes_df["class_id"], boxes_df["class_name"],
    ):
        # class_id may be an int, a float, or a string such as "3" or "3.0";
        # normalise it to the int key used by dict_color.
        color = dict_color[int(float(class_id))]

        rect = patches.Rectangle((x_min, y_min), x_max-x_min, y_max-y_min,
                                 linewidth=1, edgecolor=color, facecolor='none')
        ax.add_patch(rect)
        ax.text(x_min, y_min, class_name, fontsize=15, color='red')


def display_image(image):
    """Plot the boxes of ``image`` before and after fusion, side by side.

    The left panel shows the rows of ``df`` for this image, the right panel the
    rows of ``df_reduce``. The number of boxes in each is printed first.
    Box colours are drawn at random per class on every call.

    Parameters
    ----------
    image : str
        Image id; the file ``path + image + '.dicom'`` is read and decoded.
    """
    dict_color = list_color(range(15))

    df_sample = df[df["image_id"]==image].reset_index(drop=True)
    df_sample_wbf = df_reduce[df_reduce["image_id"]==image].reset_index(drop=True)

    print(f'Raw data: {df_sample.class_name.count()}')
    print(f'After WBF: {df_sample_wbf.class_name.count()}')

    image_bytes = tf.io.read_file(path+image+'.dicom')
    img = tfio.image.decode_dicom_image(image_bytes, dtype = tf.uint16)

    # Drop the leading axis and convert once; both panels show the same pixels.
    pixels = tf.squeeze(img, axis = 0).numpy()

    fig, ax = plt.subplots(1,2, figsize=(15, 15))

    for panel, boxes_df in zip(ax, (df_sample, df_sample_wbf)):
        panel.imshow(pixels, plt.cm.bone)
        _draw_boxes(panel, boxes_df, dict_color)
        panel.title.set_text(image)

    plt.show()

In [None]:
# Plot the raw and the fused boxes for the image at index 9 of `images`.
display_image(images[9])

In [None]:
# Write the fused annotations to train.csv in the current working directory,
# leaving out the row index.
df_reduce.to_csv("train.csv", index=False)