In [1]:
import numpy as np
import cv2

import pandas as pd

from tqdm.notebook import tqdm
from ensemble_boxes import weighted_boxes_fusion

In [2]:
# Annotation table read by this notebook (path is relative to the working directory).
INPUT_PATH = 'dfs/train_with_meta.csv'

# Destination of the fused annotations.
# NOTE(review): absolute, machine-specific path; the `iou-0.20` suffix presumably
# mirrors the IoU threshold used in get_fused_boxes -- keep the two in sync.
OUTPUT_PATH = (
    '/home/semyon/data/VinBigData/custom_dfs/'
    'weighted_boxes_fusion_iou-0.20.csv'
)

In [3]:
# Load the annotation rows; every later cell reads from this frame.
df = pd.read_csv(INPUT_PATH)

In [4]:
# Preview the loaded annotations (pandas truncates the display of long frames).
df

Unnamed: 0,image_id,class_name,class_id,rad_id,x_min,y_min,x_max,y_max,done,new_class,removed,height,width,sex,age,WindowCenter,WindowWidth,RescaleIntercept,RescaleSlope
0,000434271f63a053c4128a0ba6352c7f,No finding,14,R6,,,,,0,No finding,0,2836,2336,O,-1,2047.0,4095.0,0.0,1.0
1,000434271f63a053c4128a0ba6352c7f,No finding,14,R2,,,,,0,No finding,0,2836,2336,O,-1,2047.0,4095.0,0.0,1.0
2,000434271f63a053c4128a0ba6352c7f,No finding,14,R3,,,,,0,No finding,0,2836,2336,O,-1,2047.0,4095.0,0.0,1.0
3,00053190460d56c53cc3e57321387478,No finding,14,R11,,,,,0,No finding,0,2430,1994,O,-1,2047.0,4096.0,0.0,1.0
4,00053190460d56c53cc3e57321387478,No finding,14,R2,,,,,0,No finding,0,2430,1994,O,-1,2047.0,4096.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67909,fff0f82159f9083f3dd1f8967fc54f6a,No finding,14,R9,,,,,0,No finding,0,2500,2048,O,-1,2048.0,4096.0,0.0,1.0
67910,fff0f82159f9083f3dd1f8967fc54f6a,No finding,14,R14,,,,,0,No finding,0,2500,2048,O,-1,2048.0,4096.0,0.0,1.0
67911,fff2025e3c1d6970a8a6ee0404ac6940,No finding,14,R1,,,,,0,No finding,0,2150,1994,O,-1,2047.0,4096.0,0.0,1.0
67912,fff2025e3c1d6970a8a6ee0404ac6940,No finding,14,R5,,,,,0,No finding,0,2150,1994,O,-1,2047.0,4096.0,0.0,1.0


In [5]:
# Build the class-name <-> class-id lookups from the distinct (name, id) pairs
# present in the annotations. Taking the pairs directly (rather than averaging
# class_id per name) keeps the ids as plain ints and lets us fail loudly if a
# name is ever associated with more than one id.
_class_pairs = (
    df[['class_name', 'class_id']]
    .drop_duplicates()
    .sort_values('class_name')
)
assert _class_pairs['class_name'].is_unique, 'a class_name maps to more than one class_id'

class2id = {
    name: int(cid)
    for name, cid in zip(_class_pairs['class_name'], _class_pairs['class_id'])
}
id2class = {cid: name for name, cid in class2id.items()}

In [6]:
def get_fused_boxes(image_id, records, only_one=False, iou_thr=0.2, skip_box_thr=0):
    """Fuse the annotations of one image with Weighted Boxes Fusion.

    Parameters
    ----------
    image_id : hashable
        Identifier written to the ``image_id`` column of the fused result.
    records : pandas.DataFrame
        The annotation rows of a single image. Must provide the columns
        ``image_id``, ``class_name``, ``class_id``, ``rad_id``, ``x_min``,
        ``y_min``, ``x_max``, ``y_max``, ``width`` and ``height``.
    only_one : bool, default False
        If True the fusion is run with an IoU threshold of 0, ignoring
        ``iou_thr``.
    iou_thr : float, default 0.2
        IoU threshold handed to ``weighted_boxes_fusion`` when ``only_one``
        is False.
    skip_box_thr : float, default 0
        ``skip_box_thr`` handed to ``weighted_boxes_fusion``.

    Returns
    -------
    pandas.DataFrame
        * If every row has ``class_id == 14``: the first row of ``records``
          restricted to the annotation columns (its box columns are returned
          untouched), with ``rad_id`` replaced by the joined reader ids.
        * Otherwise: one row per fused box with the columns ``x_min``,
          ``y_min``, ``x_max``, ``y_max`` (pixels), ``class_id``,
          ``image_id`` and ``rad_id``.
    """
    box_cols = ['x_min', 'y_min', 'x_max', 'y_max']
    no_finding_id = 14  # class id the function treats as "nothing to fuse"

    # Sorted, de-duplicated reader ids joined by spaces, e.g. 'R10 R8 R9'.
    all_rad_ids = ' '.join(np.unique(records['rad_id']))

    # Checked on the class_id column only: averaging the whole frame would also
    # hit the string columns, which pandas >= 2.0 rejects with a TypeError.
    # (An empty `records` also ends up here and yields an empty frame.)
    if (records['class_id'] == no_finding_id).all():
        keep = ['image_id', 'class_name', 'class_id', 'rad_id'] + box_cols
        # .assign returns a new frame, so no write into a slice of `records`.
        return records[keep].iloc[:1].assign(rad_id=all_rad_ids)

    # Per-row (width, height, width, height) so pixel boxes can be divided into
    # image-relative coordinates before being passed to the fusion routine.
    dims = records[['width', 'height', 'width', 'height']].to_numpy(dtype=float)
    normalized = records[box_cols].to_numpy(dtype=float) / dims

    # A single "model" (the pooled readers) with unit score for every box.
    fused_boxes, _, fused_labels = weighted_boxes_fusion(
        [normalized.tolist()],
        [[1] * len(records)],
        [records['class_id'].tolist()],
        weights=[1],
        # only_one forces an IoU threshold of 0 regardless of `iou_thr`.
        iou_thr=0 if only_one else iou_thr,
        skip_box_thr=skip_box_thr,
    )

    # Back to pixels. Assumes every row of `records` carries the same
    # width/height (they describe one image), so the first row's size is used.
    fused_boxes = np.asarray(fused_boxes, dtype=float).reshape(-1, 4)
    fused = pd.DataFrame(
        fused_boxes * dims[0],
        columns=box_cols,
        # Keeps the row labels the previous implementation produced.
        index=records.index[:len(fused_boxes)],
    )
    fused['class_id'] = np.asarray(fused_labels).astype(int)
    fused['image_id'] = image_id
    fused['rad_id'] = all_rad_ids
    return fused

In [7]:
# Distinct image ids, in order of first appearance in the annotation table.
image_ids = df['image_id'].unique()

In [8]:
%%time
l = []
for image_id in tqdm(image_ids):
    tmp = df[df.image_id == image_id].copy()
    l.append(get_fused_boxes(image_id, tmp))
    

  0%|          | 0/15000 [00:00<?, ?it/s]

CPU times: user 1min 45s, sys: 1.21 s, total: 1min 46s
Wall time: 1min 45s


In [9]:
# Stack the per-image results into one frame with a fresh 0..n-1 index.
new = pd.concat(l, ignore_index=True)

In [10]:
# Fill in the class name of every row from its class id
# (raises KeyError for an id that is missing from id2class).
new['class_name'] = [id2class[class_id] for class_id in new['class_id']]

In [11]:
# Keep only the annotation columns, in the order used for the exported table.
ordered_columns = [
    'image_id', 'class_name', 'class_id', 'rad_id',
    'x_min', 'y_min', 'x_max', 'y_max',
]
new = new.loc[:, ordered_columns]

In [12]:
# Write the fused annotations to OUTPUT_PATH; index=False leaves the row index out of the file.
new.to_csv(OUTPUT_PATH, index=False)

In [13]:
# Preview the fused annotations that were just written.
new

Unnamed: 0,image_id,class_name,class_id,rad_id,x_min,y_min,x_max,y_max
0,000434271f63a053c4128a0ba6352c7f,No finding,14,R2 R3 R6,,,,
1,00053190460d56c53cc3e57321387478,No finding,14,R11 R2 R7,,,,
2,0005e8e3701dfb1dd93d53e2ff537b6e,Consolidation,4,R10 R8 R9,932.000000,567.000000,1197.000000,896.000000
3,0005e8e3701dfb1dd93d53e2ff537b6e,Infiltration,6,R10 R8 R9,900.000000,587.000000,1205.000000,888.000000
4,0005e8e3701dfb1dd93d53e2ff537b6e,Nodule/Mass,8,R10 R8 R9,932.000000,567.000000,1197.000000,896.000000
...,...,...,...,...,...,...,...,...
35433,ffe6f9fe648a7ec29a50feb92d6c15a4,Cardiomegaly,3,R10 R8 R9,1144.666603,1832.000033,2302.999852,2359.666682
35434,ffea246f04196af602c7dc123e5e48fc,No finding,14,R13 R4 R6,,,,
35435,ffeffc54594debf3716d6fcd2402a99f,Aortic enlargement,0,R10 R8 R9,1254.999926,755.999973,1566.999964,1155.333332
35436,fff0f82159f9083f3dd1f8967fc54f6a,No finding,14,R14 R8 R9,,,,
