# Postprocess predictions and create a submission

In [1]:
#@title Submission notes
# Free-text description of this run. It is passed verbatim as the `-m` message
# of `kaggle competitions submit` in the final cell of this notebook.
submission_message = "WBF preproc, YOLOv5x, 50 epochs, random rad, 20% valid split, 1024 size, IOU 0.35, conf 0.15 + binary classifier on 1024 res and 0.8 thresh " #@param {type:"string"}

In [2]:
try:
    from google.colab import drive
    drive.mount("/content/drive")
    %cd /content/drive/MyDrive/Colab\ Notebooks/kaggle
    from setup_colab import setup_colab_for_kaggle, INPUT_FOLDER, WORK_FOLDER
    setup_colab_for_kaggle(check_env=False, local_working=True)
except:
    print("Not in Colab")

Mounted at /content/drive
/content/drive/MyDrive/Colab Notebooks/kaggle
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Content of Drive Kaggle data dir (/content/drive/MyDrive/kaggle): ['/content/drive/MyDrive/kaggle/input', '/content/drive/MyDrive/kaggle/working', '/content/drive/MyDrive/kaggle/.ipynb_checkpoints', '/content/drive/MyDrive/kaggle/output']
Content of Kaggle data dir (/kaggle): ['/kaggle/output', '/kaggle/working', '/kaggle/input']
Content of Kaggle data subdir (/kaggle/input): ['/kaggle/input/cassava-model', '/kaggle/input/cassava-leaf-disease-classification', '/kaggle/input/googlebitemperedloss', '/kaggle/input/vbdyolo', '/kaggle/input/.ipynb_checkpoints', '/kaggle/input/vinbigdata', '/kaggle/input/vinbigdata-chest-xray-abnormalities-detection', '/kaggle/input/vinbigdata-chest-xray-original-png']
Content of Kaggle data subdir (/kaggle/output): ['/kaggle/output/vbdyolo_out_1_300epochs', '/

In [3]:
from pathlib import Path

import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

# Dataset folder that holds `test_meta.csv` (read below to get each test image's dimensions).
INPUT_FOLDER_ORIGINAL_PNG = INPUT_FOLDER / "vinbigdata-chest-xray-original-png"
# Download target for the YOLO predictions dataset; per-image label files are read from its `labels_pred` subfolder.
INPUT_FOLDER_YOLO_OUT = WORK_FOLDER / "vbdyolo-out"
# Download target for the binary classifier dataset; `prediction.csv` is read from it.
INPUT_FOLDER_BINARY = WORK_FOLDER / "vbdbinary"

## Get data from Kaggle

In [4]:
# Version notes: WBF preproc, YOLOv5x, 50 epochs, random rad, 20% valid split, 1024 size, IOU 0.35, conf 0.15
# Fetch version 6 of the YOLO predictions dataset and unzip it into INPUT_FOLDER_YOLO_OUT.
# The version is pinned in the dataset path; keep it in sync with the version notes above
# and with `submission_message`.
!kaggle datasets download "witalia/vbdyolo-out-newest/version/6" -p {INPUT_FOLDER_YOLO_OUT} --unzip --force

Downloading vbdyolo-out-newest.zip to /kaggle/working/vbdyolo-out
 93% 144M/155M [00:01<00:00, 133MB/s]
100% 155M/155M [00:01<00:00, 145MB/s]


In [5]:
# Version notes: EffNet, 1024 resolution, threshold 0.8
# Fetch version 4 of the binary (normal/abnormal) classifier predictions and unzip it
# into INPUT_FOLDER_BINARY. The version is pinned in the dataset path; keep it in sync
# with the version notes above and with `submission_message`.
!kaggle datasets download "witalia/vbdbinary/version/4" -p {INPUT_FOLDER_BINARY} --unzip --force

Downloading vbdbinary.zip to /kaggle/working/vbdbinary
 87% 90.0M/104M [00:00<00:00, 92.6MB/s]
100% 104M/104M [00:00<00:00, 142MB/s]  


## Process YOLO output

In [6]:
# Prediction string emitted for an image that has no detections:
# class 14, confidence 1 and a dummy 1x1 box at the origin.
NO_FINDING_PREDICTION = "14 1 0 0 1 1"


def read_prediction_labels(filename: Path, image_w: int, image_h: int) -> str:
    """Convert one YOLO label file into a competition prediction string.

    Each line of the file is expected to hold six space-separated values:
    ``class_id x_centre y_centre bw bh conf``, with coordinates relative to
    the image size (0..1).

    Args:
        filename: Path to the per-image YOLO prediction file.
        image_w: Width of the original image in pixels.
        image_h: Height of the original image in pixels.

    Returns:
        All detections joined into a single space-separated string of
        ``class_id conf x_min y_min x_max y_max`` groups, with pixel
        coordinates clipped to the image. If the file does not exist or
        contains no rows, ``NO_FINDING_PREDICTION`` is returned.
    """
    if not filename.exists():
        return NO_FINDING_PREDICTION

    try:
        labels: pd.DataFrame = pd.read_csv(filename, sep=" ", header=None)
    except pd.errors.EmptyDataError:
        # An empty (or blank-only) file carries no detections; treat it the
        # same way as a missing file instead of aborting the whole run.
        return NO_FINDING_PREDICTION
    labels.columns = ["class_id", "x_centre", "y_centre", "bw", "bh", "conf"]

    def to_pixels(relative: pd.Series, size: int) -> pd.Series:
        # Scale to the image size, then clip so the box stays inside the image.
        return (relative * size).round().astype(np.int32).clip(0, size - 1)

    # Convert YOLO format (x_centre, y_centre, bw, bh) to competition format
    # (x_min, y_min, x_max, y_max). The frame is built directly in the column
    # order required by the submission, so confidence is the second column.
    boxes = pd.DataFrame(
        {
            "class_id": labels["class_id"],
            "conf": labels["conf"],
            "x_min": to_pixels(labels["x_centre"] - labels["bw"] / 2, image_w),
            "y_min": to_pixels(labels["y_centre"] - labels["bh"] / 2, image_h),
            "x_max": to_pixels(labels["x_centre"] + labels["bw"] / 2, image_w),
            "y_max": to_pixels(labels["y_centre"] + labels["bh"] / 2, image_h),
        }
    )

    # Convert all rows to one prediction string: render the table as text and
    # collapse every run of whitespace (including newlines) to a single space.
    return " ".join(boxes.to_string(header=False, index=False).split())

In [7]:
# Per-image metadata keyed by image_id: {image_id: {"dim0": ..., "dim1": ...}}.
test_metadata = (
    pd.read_csv(INPUT_FOLDER_ORIGINAL_PNG / "test_meta.csv")
    .set_index("image_id")
    .to_dict("index")
)

# Collect rows in a plain list and build the frame once afterwards:
# `DataFrame.append` in a loop copies the whole frame on every iteration and
# the method no longer exists in pandas >= 2.0.
yolo_records = []
for image_id, image_dims in tqdm(test_metadata.items(), total=len(test_metadata)):
    # NOTE: dim0 and dim1 are reversed: y-axis, x-axis!
    # So dim1 is passed as the image width and dim0 as the image height.
    prediction_str = read_prediction_labels(
        INPUT_FOLDER_YOLO_OUT / "labels_pred" / f"{image_id}.txt", image_dims["dim1"], image_dims["dim0"]
    )
    yolo_records.append({"image_id": image_id, "PredictionString": prediction_str})

# Explicit columns keep the expected schema even if there are no records.
results_yolo_df = pd.DataFrame(yolo_records, columns=["image_id", "PredictionString"])

results_yolo_df.sample(10)

HBox(children=(FloatProgress(value=0.0, max=3000.0), HTML(value='')))




Unnamed: 0,image_id,PredictionString
2100,b4fe965a759d442827c1b2a9dd1cf07c,0 0.826172 1436 744 1690 1031
2118,b681e8429d0e437034518de0ae6f6468,11 0.409180 518 282 888 399 0 0.881348 1055 53...
1505,8408eea460f0a9a2f9aec40da98917cb,0 0.864746 999 750 1266 1009 3 0.899902 685 14...
1856,a1adf6de567308fd21b771f24d1ad721,3 0.841797 943 1383 1989 1704 0 0.862793 1392 ...
1993,ad64176ee6a7c2be0275dca89c1087b8,0 0.888672 1332 579 1620 942 3 0.898438 1062 1...
1035,5d63f31cc8e21664c69bd7613b1b76e4,5 0.186401 362 1082 736 1602 6 0.272461 366 10...
1953,a9c0ae6876251773ec5aeacde33d8259,14 1 0 0 1 1
402,262f42441f1d7479e741f624becd54cb,14 1 0 0 1 1
46,047815a469503dfe21da4a12e2514a15,5 0.22876 7 310 1042 1804
2795,eeca313945bafd566ba7fe6a22083c8f,13 0.283203 392 471 510 562 11 0.336426 1140 3...


## Merge with Binary classifier output

In [8]:
# Prediction string for an image without findings: class 14, confidence 1, dummy 1x1 box.
NO_FINDING_PREDICTION = "14 1 0 0 1 1"

results_binary_df = pd.read_csv(INPUT_FOLDER_BINARY / "prediction.csv")
display(results_binary_df.head())

# Left merge keeps every test image in the submission even when the binary
# classifier has no row for it (the default inner merge would silently drop
# such images). `validate` raises if either side has duplicated image_ids,
# which would otherwise duplicate rows in the submission.
results_df = results_yolo_df.merge(results_binary_df, on="image_id", how="left", validate="one_to_one")

unmatched_count = int(results_df["class_name"].isna().sum())
if unmatched_count:
    print(f"WARNING: {unmatched_count} image(s) have no binary classifier prediction; keeping their YOLO output")

# Images the binary classifier labels "normal" override whatever YOLO predicted.
results_df.loc[results_df["class_name"] == "normal", "PredictionString"] = NO_FINDING_PREDICTION
results_df = results_df.drop(columns=["class_name"])

results_df.to_csv(WORK_FOLDER / "submission.csv", index=False)
display(results_df.head())

Unnamed: 0,image_id,class_name
0,d0615f853a7deeec90f8b1bf30269fcc,normal
1,6637c08da9b3bce162e3aa689da14574,abnormal
2,b65467fc097115261fe11d2a99ccb5cd,abnormal
3,7742abe5bd817ba643d38fb517e35b24,normal
4,2790d36fa3cd4d3fa05229d1c693d499,abnormal


Unnamed: 0,image_id,PredictionString
0,002a34c58c5b758217ed1f584ccbcfe9,14 1 0 0 1 1
1,004f33259ee4aef671c2b95d54e4be68,0 0.883301 1256 588 1529 914
2,008bdde2af2462e86fd373a445d0f4cd,0 0.877930 1436 827 1741 1195 3 0.898926 1100 ...
3,009bc039326338823ca3aa84381f17f1,14 1 0 0 1 1
4,00a2145de1886cb9eb88869c85d74080,0 0.821777 1120 713 1354 955 3 0.927246 776 12...


## Submit to Kaggle

In [10]:
# Upload the submission.csv written to WORK_FOLDER by the merge cell, using
# `submission_message` (set in the first cell) as the submission description.
!kaggle competitions submit \
    vinbigdata-chest-xray-abnormalities-detection \
    -f {WORK_FOLDER}/submission.csv \
    -m "{submission_message}"
# Wait 10 seconds before listing submissions (presumably to give Kaggle time to score the new one).
!sleep 10
# List the submissions for this competition, including status and scores.
!kaggle competitions submissions vinbigdata-chest-xray-abnormalities-detection

100% 222k/222k [00:00<00:00, 863kB/s]
fileName               date                 description                                                                                                                                   status    publicScore  privateScore  
---------------------  -------------------  --------------------------------------------------------------------------------------------------------------------------------------------  --------  -----------  ------------  
submission.csv         2021-03-18 00:00:04  WBF preproc, YOLOv5x, 50 epochs, random rad, 20% valid split, 1024 size, IOU 0.35, conf 0.15 + binary classifier on 1024 res and 0.8 thresh   complete  0.147        None          
submission.csv         2021-03-17 16:17:50  YOLOv5x, 50 epochs, random rad, 20% valid split, 1024 size, IOU 0.35, conf 0.15 + binary classifier on 1024 res and 0.8 thresh                complete  0.107        None          
submission.csv         2021-03-17 16:03:45  YOLOv5x, 50 epochs, ra