In [3]:
import numpy as np
import cv2

import pandas as pd

from tqdm.notebook import tqdm
from ensemble_boxes import weighted_boxes_fusion

def get_fused_boxes(image_id, records, conf_col_name=None, iou_thr=0.2, skip_box_thr=0, only_one=False, no_finding_class_id=14):
    """Fuse the bounding boxes of one image with weighted boxes fusion (WBF).

    Parameters
    ----------
    image_id
        Value written to the ``image_id`` column of the fused result.
    records : pandas.DataFrame
        Rows belonging to a single image. Reads ``image_id``, ``class_id``,
        ``rad_id``, ``x_min``, ``y_min``, ``x_max``, ``y_max``, ``width`` and
        ``height``; the no-finding shortcut additionally reads ``class_name``.
    conf_col_name : str or None
        Column holding a per-box confidence. ``None`` scores every box as 1.
    iou_thr : float
        IoU threshold handed to ``weighted_boxes_fusion``.
    skip_box_thr : float
        ``skip_box_thr`` handed to ``weighted_boxes_fusion``.
    only_one : bool
        When true, an IoU threshold of 0 is used instead of ``iou_thr``.
    no_finding_class_id : int
        Class id treated as "no finding" (no boxes to fuse).

    Returns
    -------
    pandas.DataFrame
        If the mean ``class_id`` equals ``no_finding_class_id``: the first
        input row with columns ``image_id, class_name, class_id, rad_id,
        x_min, y_min, x_max, y_max`` and ``rad_id`` replaced by the joined
        distinct rad ids. Otherwise one row per fused box with columns
        ``x_min, y_min, x_max, y_max, class_id, image_id, rad_id, conf``.
    """
    coord_cols = ['x_min', 'y_min', 'x_max', 'y_max']

    # Space-separated list of the distinct rad_id values of the (first) image group.
    all_rad_ids = records.groupby('image_id')['rad_id'].agg(
        lambda ids: ' '.join(str(i) for i in np.unique(ids))
    ).iloc[0]

    # Select class_id *before* averaging: averaging the whole frame would also
    # try to average the string columns, which newer pandas rejects.
    mean_class_id = records.groupby('image_id')['class_id'].mean().iloc[0]
    if mean_class_id == no_finding_class_id:
        no_finding = records[['image_id', 'class_name', 'class_id', 'rad_id'] + coord_cols].iloc[:1].copy()
        no_finding['rad_id'] = all_rad_ids
        return no_finding

    # Per-row (width, height, width, height) used to normalise pixel
    # coordinates before fusion and to scale the fused boxes back afterwards.
    pix_multiplier = records[['width', 'height', 'width', 'height']].to_numpy(dtype=float)
    normalized = records[coord_cols].to_numpy(dtype=float) / pix_multiplier

    # weighted_boxes_fusion takes one list per model; there is a single "model" here.
    boxes_list = [normalized.tolist()]
    labels_list = [records['class_id'].tolist()]
    if conf_col_name is None:
        scores_list = [[1] * len(records)]
    else:
        scores_list = [records[conf_col_name].tolist()]

    # If we demand only one of the label per image, we set iou threshold to 0
    effective_iou_thr = 0 if only_one else iou_thr
    fused_boxes, fused_scores, fused_labels = weighted_boxes_fusion(
        boxes_list, scores_list, labels_list,
        weights=[1], iou_thr=effective_iou_thr, skip_box_thr=skip_box_thr,
    )

    # Scale back with the first len(fused_boxes) multiplier rows; this assumes
    # every row of `records` shares the same width/height (one image per call).
    n_fused = len(fused_boxes)
    boxes = pd.DataFrame(
        np.asarray(fused_boxes, dtype=float).reshape(-1, 4) * pix_multiplier[:n_fused],
        columns=coord_cols,
        index=records.index[:n_fused],
    )
    boxes['class_id'] = np.asarray(fused_labels).astype(int)
    boxes['image_id'] = image_id
    boxes['rad_id'] = all_rad_ids
    boxes['conf'] = fused_scores
    # When rad_id is the confidence column, it carries the fused score too.
    if conf_col_name == 'rad_id':
        boxes['rad_id'] = fused_scores
    return boxes

def ensemble_bboxes(input_path, output_path=None, iou_threshold=0.4, meta_path='/home/semyon/data/VinBigData/train_meta.csv', verbose=False):
    """Fuse the boxes of every image in a table and optionally write the result.

    Parameters
    ----------
    input_path : str or pandas.DataFrame
        CSV path, or an already loaded frame (which is copied, not modified).
        Must provide ``image_id``, ``class_name``, ``class_id``, ``rad_id`` and
        the four box coordinate columns. ``rad_id`` is passed to
        ``get_fused_boxes`` as the confidence column.
    output_path : str or None
        Destination CSV. When ``None`` and ``input_path`` is a path, the result
        is written next to the input as
        ``<input_path>_bboxes_fusion_iou-<iou_threshold>.csv``. When ``None``
        and ``input_path`` is a DataFrame, nothing is written.
    iou_threshold : float
        IoU threshold used for the fusion.
    meta_path : str
        CSV with ``image_id``, ``rows`` and ``columns``; supplies the image
        height and width.
    verbose : bool
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    pandas.DataFrame
        Columns ``image_id, class_name, class_id, rad_id, x_min, y_min, x_max,
        y_max, conf``.

    Raises
    ------
    KeyError
        If an ``image_id`` of the input is missing from the meta table.
    """
    import warnings

    output_columns = ['image_id', 'class_name', 'class_id', 'rad_id',
                      'x_min', 'y_min', 'x_max', 'y_max', 'conf']

    from_frame = isinstance(input_path, pd.DataFrame)
    df = input_path.copy() if from_frame else pd.read_csv(input_path)

    # A default file name can only be derived from a path, not from a DataFrame.
    if output_path is None and not from_frame:
        output_path = '{}_bboxes_fusion_iou-{}.csv'.format(input_path, iou_threshold)

    meta_df = pd.read_csv(meta_path).set_index('image_id')
    unknown = ~df['image_id'].isin(meta_df.index)
    if unknown.any():
        raise KeyError(
            'image_id values missing from {}: {}'.format(
                meta_path, df.loc[unknown, 'image_id'].unique()[:5].tolist()))
    df['height'] = df['image_id'].map(meta_df['rows'])
    df['width'] = df['image_id'].map(meta_df['columns'])

    class2id = df.groupby('class_name')['class_id'].mean().to_dict()
    id2class = {class_id: class_name for class_name, class_id in class2id.items()}
    if len(id2class) < len(class2id):
        # Fusion is driven by class_id, and class_name is recovered from it
        # afterwards, so names that share an id cannot be told apart.
        warnings.warn(
            '{} class names share {} class_id value(s); fused boxes cannot be '
            'separated by class and class_name will be unreliable'.format(
                len(class2id), len(id2class)))

    # One pass over the groups instead of re-filtering the whole frame per image;
    # sort=False keeps images in order of first appearance.
    fused_frames = [
        get_fused_boxes(image_id, image_records, conf_col_name='rad_id', iou_thr=iou_threshold)
        for image_id, image_records in tqdm(df.groupby('image_id', sort=False))
    ]

    if fused_frames:
        new = pd.concat(fused_frames).reset_index(drop=True)
        new['class_name'] = new['class_id'].apply(lambda class_id: id2class[class_id])
        # reindex (rather than plain selection) tolerates a missing `conf`
        # column, which happens when no image produced fused boxes.
        new = new.reindex(columns=output_columns)
    else:
        new = pd.DataFrame(columns=output_columns)

    if output_path is not None:
        new.to_csv(output_path, index=False)

    return new

In [11]:
# Detections to fuse.
INPUT_PATH = '/home/semyon/projects/vinbigdata/eda/dfs/result.csv'
# Intended destination of the fused table; the underscore keeps the suffix
# separate from the input file name instead of producing "result.csvweighted_...".
OUTPUT_PATH = INPUT_PATH + '_weighted_boxes_fusion_iou-0.20.csv'

In [12]:
# Load the detections to be fused from INPUT_PATH.
df = pd.read_csv(INPUT_PATH)

In [13]:
# Per-image metadata, keyed by image_id; `rows`/`columns` hold the image size.
meta_path = '/home/semyon/data/VinBigData/train_meta.csv'
meta_df = pd.read_csv(meta_path).set_index('image_id')

# Attach each detection's image height and width (KeyError if an id is not in the meta table).
df['height'] = [meta_df.loc[image_id, 'rows'] for image_id in df.image_id]
df['width'] = [meta_df.loc[image_id, 'columns'] for image_id in df.image_id]

In [15]:
# Show the detections together with the height/width columns attached above.
df

Unnamed: 0,image_id,class_name,class_id,rad_id,x_min,y_min,x_max,y_max,height,width
0,1c32170b4af4ce1a3030eb8167753b06,Aortic enlargement,-1,0.726738,1252,765,1474,976,3072,2540
1,1c32170b4af4ce1a3030eb8167753b06,Aortic enlargement,-1,0.325032,1207,753,1480,1099,3072,2540
2,1c32170b4af4ce1a3030eb8167753b06,Calcification,-1,0.078940,2042,1805,2146,1904,3072,2540
3,1c32170b4af4ce1a3030eb8167753b06,Cardiomegaly,-1,0.521538,912,1553,1931,2165,3072,2540
4,1c32170b4af4ce1a3030eb8167753b06,Cardiomegaly,-1,0.119591,224,2222,1502,2639,3072,2540
...,...,...,...,...,...,...,...,...,...,...
31614,1224f07d895107573588225f692e94f9,Pleural thickening,-1,0.062105,549,396,810,493,2264,2040
31615,1224f07d895107573588225f692e94f9,Pleural thickening,-1,0.061138,1749,1765,1824,1892,2264,2040
31616,1224f07d895107573588225f692e94f9,Pleural thickening,-1,0.059556,1373,435,1560,581,2264,2040
31617,1224f07d895107573588225f692e94f9,Pleural thickening,-1,0.058245,1191,392,1380,431,2264,2040


In [16]:
# Name -> id lookup (mean class_id per class_name) and its inverse.
# NOTE(review): every row shown in the saved preview of `df` has class_id -1.
# If that holds for the whole frame, all names map to -1 and the inverse keeps
# a single name — confirm the class_id column before relying on id2class.
class2id = df.groupby('class_name')['class_id'].mean().to_dict()
id2class = {class_id: class_name for class_name, class_id in class2id.items()}

In [4]:
# Build the class-name <-> class-id lookup from the training annotations.
# The annotations get their own name so that `df` is not rebound by this cell.
train_df = pd.read_csv('/home/semyon/data/VinBigData/train.csv')
class2id = train_df.groupby('class_name')['class_id'].mean().to_dict()
id2class = {class_id: class_name for class_name, class_id in class2id.items()}

In [5]:
# Show the class-name -> class-id lookup.
class2id

{'Aortic enlargement': 0,
 'Atelectasis': 1,
 'Calcification': 2,
 'Cardiomegaly': 3,
 'Consolidation': 4,
 'ILD': 5,
 'Infiltration': 6,
 'Lung Opacity': 7,
 'No finding': 14,
 'Nodule/Mass': 8,
 'Other lesion': 9,
 'Pleural effusion': 10,
 'Pleural thickening': 11,
 'Pneumothorax': 12,
 'Pulmonary fibrosis': 13}

In [40]:
def get_fused_boxes(image_id, records, conf_col_name=None, iou_thr=0.2, skip_box_thr=0, only_one=False, no_finding_class_id=14):
    """Fuse the bounding boxes of one image with weighted boxes fusion (WBF).

    Parameters
    ----------
    image_id
        Value written to the ``image_id`` column of the fused result.
    records : pandas.DataFrame
        Rows belonging to a single image. Reads ``image_id``, ``class_id``,
        ``rad_id``, ``x_min``, ``y_min``, ``x_max``, ``y_max``, ``width`` and
        ``height``; the no-finding shortcut additionally reads ``class_name``.
    conf_col_name : str or None
        Column holding a per-box confidence. ``None`` scores every box as 1.
    iou_thr : float
        IoU threshold handed to ``weighted_boxes_fusion``.
    skip_box_thr : float
        ``skip_box_thr`` handed to ``weighted_boxes_fusion``.
    only_one : bool
        When true, an IoU threshold of 0 is used instead of ``iou_thr``.
    no_finding_class_id : int
        Class id treated as "no finding" (no boxes to fuse).

    Returns
    -------
    pandas.DataFrame
        If the mean ``class_id`` equals ``no_finding_class_id``: the first
        input row with columns ``image_id, class_name, class_id, rad_id,
        x_min, y_min, x_max, y_max`` and ``rad_id`` replaced by the joined
        distinct rad ids. Otherwise one row per fused box with columns
        ``x_min, y_min, x_max, y_max, class_id, image_id, rad_id, conf``.
    """
    coord_cols = ['x_min', 'y_min', 'x_max', 'y_max']

    # Space-separated list of the distinct rad_id values of the (first) image group.
    all_rad_ids = records.groupby('image_id')['rad_id'].agg(
        lambda ids: ' '.join(str(i) for i in np.unique(ids))
    ).iloc[0]

    # Select class_id *before* averaging: averaging the whole frame would also
    # try to average the string columns, which newer pandas rejects.
    mean_class_id = records.groupby('image_id')['class_id'].mean().iloc[0]
    if mean_class_id == no_finding_class_id:
        no_finding = records[['image_id', 'class_name', 'class_id', 'rad_id'] + coord_cols].iloc[:1].copy()
        no_finding['rad_id'] = all_rad_ids
        return no_finding

    # Per-row (width, height, width, height) used to normalise pixel
    # coordinates before fusion and to scale the fused boxes back afterwards.
    pix_multiplier = records[['width', 'height', 'width', 'height']].to_numpy(dtype=float)
    normalized = records[coord_cols].to_numpy(dtype=float) / pix_multiplier

    # weighted_boxes_fusion takes one list per model; there is a single "model" here.
    boxes_list = [normalized.tolist()]
    labels_list = [records['class_id'].tolist()]
    if conf_col_name is None:
        scores_list = [[1] * len(records)]
    else:
        scores_list = [records[conf_col_name].tolist()]

    # If we demand only one of the label per image, we set iou threshold to 0
    effective_iou_thr = 0 if only_one else iou_thr
    fused_boxes, fused_scores, fused_labels = weighted_boxes_fusion(
        boxes_list, scores_list, labels_list,
        weights=[1], iou_thr=effective_iou_thr, skip_box_thr=skip_box_thr,
    )

    # Scale back with the first len(fused_boxes) multiplier rows; this assumes
    # every row of `records` shares the same width/height (one image per call).
    n_fused = len(fused_boxes)
    boxes = pd.DataFrame(
        np.asarray(fused_boxes, dtype=float).reshape(-1, 4) * pix_multiplier[:n_fused],
        columns=coord_cols,
        index=records.index[:n_fused],
    )
    boxes['class_id'] = np.asarray(fused_labels).astype(int)
    boxes['image_id'] = image_id
    boxes['rad_id'] = all_rad_ids
    boxes['conf'] = fused_scores
    # When rad_id is the confidence column, it carries the fused score too.
    if conf_col_name == 'rad_id':
        boxes['rad_id'] = fused_scores
    return boxes

In [42]:
# Distinct image ids of `df`; each one is fused separately below.
image_ids = df.image_id.unique()

%%time
# Fuse the rows of every image on their own. The `rad_id` column is passed as
# the confidence column and get_fused_boxes runs with its default iou_thr.
l = []
for image_id in tqdm(image_ids):
    tmp = df[df.image_id == image_id].copy()
    l.append(get_fused_boxes(image_id, tmp, conf_col_name='rad_id'))
    

# Stack the per-image results into one frame with a fresh 0..n-1 index.
new = pd.concat(l).reset_index(drop=True)

# Recover class names from the fused ids via id2class
# (raises KeyError for an id that is not a key of the mapping).
new['class_name'] = new.class_id.apply(lambda x: id2class[x])

# Keep only the output columns, in a fixed order.
new = new[['image_id', 'class_name', 'class_id', 'rad_id', 'x_min', 'y_min',
       'x_max', 'y_max', 'conf']]

In [46]:
# Inspect the fused detections.
# NOTE(review): every row shown in the saved output has class_id -1 and the
# same class_name, which suggests id2class held a single entry when this ran —
# confirm the class_id column of the input before trusting these labels.
new

Unnamed: 0,image_id,class_name,class_id,rad_id,x_min,y_min,x_max,y_max,conf
0,1c32170b4af4ce1a3030eb8167753b06,Pulmonary fibrosis,-1,0.525885,1238.093563,761.291702,1475.854380,1014.011078,0.525885
1,1c32170b4af4ce1a3030eb8167753b06,Pulmonary fibrosis,-1,0.521538,912.000000,1553.000000,1931.000000,2165.000000,0.521538
2,1c32170b4af4ce1a3030eb8167753b06,Pulmonary fibrosis,-1,0.119591,224.000000,2222.000000,1502.000000,2639.000000,0.119591
3,1c32170b4af4ce1a3030eb8167753b06,Pulmonary fibrosis,-1,0.111626,1823.000000,906.000000,1927.000000,1003.000000,0.111626
4,1c32170b4af4ce1a3030eb8167753b06,Pulmonary fibrosis,-1,0.110670,948.000000,749.000000,1053.000000,848.000000,0.110670
...,...,...,...,...,...,...,...,...,...
10518,1224f07d895107573588225f692e94f9,Pulmonary fibrosis,-1,0.072000,476.019153,403.186092,734.733925,554.128322,0.072000
10519,1224f07d895107573588225f692e94f9,Pulmonary fibrosis,-1,0.063798,1178.592546,375.242171,1433.725326,462.578149,0.063798
10520,1224f07d895107573588225f692e94f9,Pulmonary fibrosis,-1,0.062043,1329.316900,413.158420,1588.602097,596.601189,0.062043
10521,1224f07d895107573588225f692e94f9,Pulmonary fibrosis,-1,0.061771,1519.000000,701.000000,1578.000000,757.000000,0.061771


In [12]:
# new.to_csv(OUTPUT_PATH, index=False)

In [13]:
# NOTE(review): the saved output of this cell has no `conf` column and holds
# space-joined string rad_id values (e.g. "R2 R3 R6"), so it appears to come
# from a run on a different input than the fusion loop — re-run to refresh.
new

Unnamed: 0,image_id,class_name,class_id,rad_id,x_min,y_min,x_max,y_max
0,000434271f63a053c4128a0ba6352c7f,No finding,14,R2 R3 R6,,,,
1,00053190460d56c53cc3e57321387478,No finding,14,R11 R2 R7,,,,
2,0005e8e3701dfb1dd93d53e2ff537b6e,Consolidation,4,R10 R8 R9,932.000000,567.000000,1197.000000,896.000000
3,0005e8e3701dfb1dd93d53e2ff537b6e,Infiltration,6,R10 R8 R9,900.000000,587.000000,1205.000000,888.000000
4,0005e8e3701dfb1dd93d53e2ff537b6e,Nodule/Mass,8,R10 R8 R9,932.000000,567.000000,1197.000000,896.000000
...,...,...,...,...,...,...,...,...
35433,ffe6f9fe648a7ec29a50feb92d6c15a4,Cardiomegaly,3,R10 R8 R9,1144.666603,1832.000033,2302.999852,2359.666682
35434,ffea246f04196af602c7dc123e5e48fc,No finding,14,R13 R4 R6,,,,
35435,ffeffc54594debf3716d6fcd2402a99f,Aortic enlargement,0,R10 R8 R9,1254.999926,755.999973,1566.999964,1155.333332
35436,fff0f82159f9083f3dd1f8967fc54f6a,No finding,14,R14 R8 R9,,,,
