# Postprocess predictions and create a submission

In [1]:
try:
    from google.colab import drive
    drive.mount("/content/drive")
    %cd /content/drive/MyDrive/Colab\ Notebooks/kaggle
    from setup_colab import setup_colab_for_kaggle, INPUT_FOLDER, WORK_FOLDER
    setup_colab_for_kaggle(check_env=False, local_working=True)
except:
    print("Not in Colab")

Mounted at /content/drive
/content/drive/MyDrive/Colab Notebooks/kaggle
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Content of Drive Kaggle data dir (/content/drive/MyDrive/kaggle): ['/content/drive/MyDrive/kaggle/input', '/content/drive/MyDrive/kaggle/working', '/content/drive/MyDrive/kaggle/.ipynb_checkpoints', '/content/drive/MyDrive/kaggle/output']
Content of Kaggle data dir (/kaggle): ['/kaggle/output', '/kaggle/input', '/kaggle/working']
Content of Kaggle data subdir (/kaggle/input): ['/kaggle/input/cassava-model', '/kaggle/input/cassava-leaf-disease-classification', '/kaggle/input/googlebitemperedloss', '/kaggle/input/vbdyolo', '/kaggle/input/.ipynb_checkpoints', '/kaggle/input/vinbigdata', '/kaggle/input/vinbigdata-chest-xray-abnormalities-detection', '/kaggle/input/vinbigdata-chest-xray-original-png']
Content of Kaggle data subdir (/kaggle/output): ['/kaggle/output/vbdyolo_out_1_300epochs', '/

In [2]:
from pathlib import Path

import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

# Dataset locations used by the cells below. INPUT_FOLDER and WORK_FOLDER are imported from
# setup_colab in the Colab setup cell (presumably pathlib.Path objects — TODO confirm).
# Folder holding test_meta.csv (per-image dimensions).
INPUT_FOLDER_ORIGINAL_PNG = INPUT_FOLDER / "vinbigdata-chest-xray-original-png"
# Download target for the YOLO predictions (labels_pred/<image_id>.txt is read from here).
INPUT_FOLDER_YOLO_OUT = WORK_FOLDER / "vbdyolo-out"
# Download target for the binary classifier output (prediction.csv is read from here).
INPUT_FOLDER_BINARY = WORK_FOLDER / "vbdbinary"

## Get data from Kaggle

In [3]:
# Version notes: YOLOv5x, 50 epochs, random rad, 20% valid split, 1024 size, IOU 0.35, conf 0.15
# Downloads version 5 of the "witalia/vbdyolo-out-newest" Kaggle dataset into INPUT_FOLDER_YOLO_OUT
# and unzips it there; the per-image label files are read from its labels_pred subfolder later on.
!kaggle datasets download "witalia/vbdyolo-out-newest/version/5" -p {INPUT_FOLDER_YOLO_OUT} --unzip --force

Downloading vbdyolo-out-newest.zip to /kaggle/working/vbdyolo-out
 94% 145M/155M [00:02<00:00, 84.4MB/s]
100% 155M/155M [00:02<00:00, 76.2MB/s]


In [4]:
# Version notes: initial release (EffNet)
# Downloads version 1 of the "witalia/vbdbinary" Kaggle dataset into INPUT_FOLDER_BINARY
# and unzips it there; prediction.csv from this folder is merged with the YOLO results later on.
!kaggle datasets download "witalia/vbdbinary/version/1" -p {INPUT_FOLDER_BINARY} --unzip --force

Downloading vbdbinary.zip to /kaggle/working/vbdbinary
 90% 81.0M/90.0M [00:02<00:00, 28.9MB/s]
100% 90.0M/90.0M [00:02<00:00, 42.4MB/s]


## Process YOLO output

In [5]:
# Prediction string this notebook emits when an image has no detections:
# class 14, confidence 1, box (0, 0, 1, 1).
NO_FINDING_PREDICTION = "14 1 0 0 1 1"


def _to_pixels(fractions: pd.Series, size: int) -> pd.Series:
    """Scale normalised coordinates to pixels, round to int32 and clip to [0, size - 1]."""
    return (fractions * size).round().astype(np.int32).clip(0, size - 1)


def read_prediction_labels(filename: Path, image_w: int, image_h: int) -> str:
    """Convert one YOLO label file into a competition prediction string.

    The file is expected to hold one detection per line, space separated:
    ``class_id x_centre y_centre bw bh conf`` with coordinates normalised to [0, 1].

    Args:
        filename: Path to the label file of a single image.
        image_w: Image width in pixels, used to scale the x coordinates.
        image_h: Image height in pixels, used to scale the y coordinates.

    Returns:
        ``"class_id conf x_min y_min x_max y_max"`` for every detection, joined by single
        spaces. If the file is missing or contains no detections, the placeholder
        ``"14 1 0 0 1 1"`` is returned instead.
    """
    if not filename.exists():
        return NO_FINDING_PREDICTION

    try:
        labels: pd.DataFrame = pd.read_csv(filename, delimiter=" ", header=None)
    except pd.errors.EmptyDataError:
        # The file exists but is empty: treat it the same as a missing file.
        return NO_FINDING_PREDICTION
    if labels.empty:
        return NO_FINDING_PREDICTION
    labels.columns = ["class_id", "x_centre", "y_centre", "bw", "bh", "conf"]

    # Convert YOLO format (x_centre, y_centre, bw, bh) to competition format
    # (x_min, y_min, x_max, y_max), scaled to the image size and clipped to its bounds.
    half_w = labels["bw"] / 2
    half_h = labels["bh"] / 2

    # Build the output frame with an explicit column order, because the prediction string
    # below is just the rows flattened left to right.
    boxes = pd.DataFrame(
        {
            "class_id": labels["class_id"],
            "conf": labels["conf"],
            "x_min": _to_pixels(labels["x_centre"] - half_w, image_w),
            "y_min": _to_pixels(labels["y_centre"] - half_h, image_h),
            "x_max": _to_pixels(labels["x_centre"] + half_w, image_w),
            "y_max": _to_pixels(labels["y_centre"] + half_h, image_h),
        }
    )

    # Convert all rows to one prediction string (split/join collapses the column padding).
    return " ".join(boxes.to_string(header=False, index=False).split())

In [6]:
# Map image_id -> {column: value} for every test image (dim0/dim1 come from test_meta.csv).
test_metadata = pd.read_csv(INPUT_FOLDER_ORIGINAL_PNG / "test_meta.csv")
test_metadata = test_metadata.set_index("image_id").to_dict("index")

# Collect one record per image and build the DataFrame once afterwards: appending to a
# DataFrame inside the loop copies it on every iteration, and DataFrame.append no longer
# exists in pandas >= 2.0.
# NOTE(review): dim0/dim1 are passed as image_w/image_h — confirm that dim0 really is the
# width and dim1 the height in test_meta.csv (an array shape would be height first).
prediction_records = []
for image_id, image_dims in tqdm(test_metadata.items(), total=len(test_metadata)):
    prediction_str = read_prediction_labels(
        INPUT_FOLDER_YOLO_OUT / "labels_pred" / f"{image_id}.txt", image_dims["dim0"], image_dims["dim1"]
    )
    prediction_records.append({"image_id": image_id, "PredictionString": prediction_str})

results_yolo_df = pd.DataFrame(prediction_records, columns=["image_id", "PredictionString"])

results_yolo_df.sample(10)

HBox(children=(FloatProgress(value=0.0, max=3000.0), HTML(value='')))




Unnamed: 0,image_id,PredictionString
1214,6ced1a5ca6625e9c00070e14a98cf816,14 1 0 0 1 1
2938,fbbbec8a08c3776eb788dfa24e96c116,11 0.170166 1002 492 1245 576 3 0.666504 1125 ...
2729,e9205f3a0c23ba8b256cb5e52d14985b,13 0.159546 2069 1241 2243 1300 9 0.239258 111...
881,509b0a6e32afb4094897a916b3c72486,14 1 0 0 1 1
2135,b7a6886c351cbdd8af911e2f813d9c65,14 1 0 0 1 1
409,26fa2f95e008965bec79eddf40220618,14 1 0 0 1 1
181,0f4fd9b975fc4cc80c28b9539769f8e6,3 0.270264 1152 1620 2280 1968
935,5559843d1f057bcaa0d9b5d56a9a3d46,0 0.630371 1184 708 1455 996
1927,a7c42a5d0723d41c389b35c2c3251f88,3 0.302002 1254 1081 2262 1471 0 0.310303 1477...
2679,e4e32ce0e061d700c0afda13faa45b1d,11 0.223755 507 2286 600 2400 13 0.397217 1122...


## Merge with Binary classifier output

In [7]:
# Per-image output of the binary classifier: columns image_id and class_name.
results_binary_df = pd.read_csv(INPUT_FOLDER_BINARY / "prediction.csv")
display(results_binary_df.head())

# Left join so that every image with a YOLO result stays in the submission even if the
# classifier output lacks it (such rows keep their YOLO prediction). validate raises a
# MergeError if image_id is duplicated on either side, which would otherwise multiply rows.
results_df = results_yolo_df.merge(results_binary_df, on="image_id", how="left", validate="one_to_one")
# Images classified as "normal" get the placeholder prediction instead of the YOLO boxes.
results_df.loc[results_df["class_name"] == "normal", "PredictionString"] = "14 1 0 0 1 1"
results_df = results_df.drop(columns=["class_name"])

results_df.to_csv(WORK_FOLDER / "submission.csv", index=False)
display(results_df.head())

Unnamed: 0,image_id,class_name
0,8bfad13f75648d94cdc8fc5f988f701f,normal
1,235d5a43e07233fa0117ed8a185c06bf,abnormal
2,bc247fbc4a986f2974c1d8b158de77bd,abnormal
3,93cdc1245a32efee2b5ec92abe175e8b,normal
4,f734d8823f31d805567d6480513003d2,normal


Unnamed: 0,image_id,PredictionString
0,002a34c58c5b758217ed1f584ccbcfe9,14 1 0 0 1 1
1,004f33259ee4aef671c2b95d54e4be68,14 1 0 0 1 1
2,008bdde2af2462e86fd373a445d0f4cd,3 0.438232 1142 1444 2028 1804 0 0.552246 1507...
3,009bc039326338823ca3aa84381f17f1,14 1 0 0 1 1
4,00a2145de1886cb9eb88869c85d74080,3 0.510254 724 1381 1750 1756


## Submit to Kaggle

In [9]:
# Description attached to the submission; the trailing #@param marker exposes it as a Colab form field.
submission_message = "YOLOv5x, 50 epochs, random rad, 20% valid split, 1024 size, IOU 0.35, conf 0.15 + binary classifier" #@param {type:"string"}
# Upload WORK_FOLDER/submission.csv to the competition with the message above.
!kaggle competitions submit \
    vinbigdata-chest-xray-abnormalities-detection \
    -f {WORK_FOLDER}/submission.csv \
    -m "{submission_message}"
# Pause before listing the submissions (presumably to give Kaggle time to process the upload).
!sleep 10
# List the submissions for the competition, including status and public score.
!kaggle competitions submissions vinbigdata-chest-xray-abnormalities-detection

100% 165k/165k [00:05<00:00, 29.6kB/s]
fileName               date                 description                                                                                          status    publicScore  privateScore  
---------------------  -------------------  ---------------------------------------------------------------------------------------------------  --------  -----------  ------------  
submission.csv         2021-03-17 04:13:50  YOLOv5x, 50 epochs, random rad, 20% valid split, 1024 size, IOU 0.35, conf 0.15 + binary classifier  complete  0.105        None          
submission.csv         2021-03-17 04:13:18  YOLOv5s, 50 epochs, random rad, 20% valid split, 1024 size, IOU 0.35, conf 0.15 + binary classifier  complete  0.105        None          
submission.csv         2021-03-16 22:39:27  YOLOv5s, 50 epochs, random rad, 20% valid split, 1024 size, IOU 0.35, conf 0.15 + binary classifier  complete  0.092        None          
submission.csv         2021-03-16 22:21:54  YO