# Postprocess predictions and create a submission

In [1]:
#@title Submission notes
# Free-text description sent with the Kaggle submission (passed to `-m` in the submit cell).
# NOTE(review): this text says the binary classifier used 512 res, while the version note on the
# vbdbinary download cell says 1024 resolution — confirm which one matches the data actually used.
submission_message = "WBF preproc, YOLOv5x, 50 epochs, random rad, 20% valid split, 1024 size, IOU 0.35, conf 0.15 + binary classifier on 512 res and 0.8 thresh " #@param {type:"string"}

In [2]:
try:
    from google.colab import drive
    drive.mount("/content/drive")
    %cd /content/drive/MyDrive/Colab\ Notebooks/kaggle
    from setup_colab import setup_colab_for_kaggle, INPUT_FOLDER, WORK_FOLDER
    setup_colab_for_kaggle(check_env=False, local_working=True)
except:
    print("Not in Colab")

Mounted at /content/drive
/content/drive/MyDrive/Colab Notebooks/kaggle
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Content of Drive Kaggle data dir (/content/drive/MyDrive/kaggle): ['/content/drive/MyDrive/kaggle/input', '/content/drive/MyDrive/kaggle/working', '/content/drive/MyDrive/kaggle/.ipynb_checkpoints', '/content/drive/MyDrive/kaggle/output']
Content of Kaggle data dir (/kaggle): ['/kaggle/output', '/kaggle/working', '/kaggle/input']
Content of Kaggle data subdir (/kaggle/input): ['/kaggle/input/cassava-model', '/kaggle/input/cassava-leaf-disease-classification', '/kaggle/input/googlebitemperedloss', '/kaggle/input/vbdyolo', '/kaggle/input/.ipynb_checkpoints', '/kaggle/input/vinbigdata', '/kaggle/input/vinbigdata-chest-xray-abnormalities-detection', '/kaggle/input/vinbigdata-chest-xray-original-png']
Content of Kaggle data subdir (/kaggle/output): ['/kaggle/output/vbdyolo_out_1_300epochs', '/

In [3]:
from pathlib import Path

import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

# Data locations used by the cells below, derived from the folders imported from setup_colab.
# Dataset folder from which test_meta.csv (per-image dimensions) is read.
INPUT_FOLDER_ORIGINAL_PNG = INPUT_FOLDER / "vinbigdata-chest-xray-original-png"
# Download target for the YOLO prediction dataset; labels_pred/<image_id>.txt files are read from it.
INPUT_FOLDER_YOLO_OUT = WORK_FOLDER / "vbdyolo-out"
# Download target for the binary classifier dataset; prediction.csv is read from it.
INPUT_FOLDER_BINARY = WORK_FOLDER / "vbdbinary"

## Get data from Kaggle

In [4]:
# Version notes: WBF preproc, YOLOv5x, 50 epochs, 20% valid split, 1024 size, IOU 0.35, conf 0.15
# Fetches the pinned version 6 of the YOLO prediction dataset into INPUT_FOLDER_YOLO_OUT and unzips it.
# --force presumably refreshes an existing local copy on re-run (confirm against the Kaggle CLI help).
!kaggle datasets download "witalia/vbdyolo-out-newest/version/6" -p {INPUT_FOLDER_YOLO_OUT} --unzip --force

Downloading vbdyolo-out-newest.zip to /kaggle/working/vbdyolo-out
 91% 142M/155M [00:01<00:00, 99.2MB/s]
100% 155M/155M [00:01<00:00, 107MB/s] 


In [5]:
# Version notes: EffNet, 1024 resolution, threshold 0.8
# NOTE(review): submission_message describes the binary classifier as "512 res" — confirm that
# the pinned dataset version below and this note agree with that message.
# Fetches the pinned version 4 of the binary classifier output into INPUT_FOLDER_BINARY and unzips it.
!kaggle datasets download "witalia/vbdbinary/version/4" -p {INPUT_FOLDER_BINARY} --unzip --force

Downloading vbdbinary.zip to /kaggle/working/vbdbinary
 76% 69.0M/90.6M [00:00<00:00, 72.3MB/s]
100% 90.6M/90.6M [00:00<00:00, 124MB/s] 


## Process YOLO output

In [6]:
def read_prediction_labels(filename: Path, image_w: int, image_h: int):
    """Turn one YOLO prediction file into a single prediction string.

    Each line of the file is expected to hold six space-separated values:
    ``class_id x_centre y_centre bw bh conf`` with coordinates as fractions of
    the image size.

    Args:
        filename: Path to the per-image label file written by the detector.
        image_w: Image width in pixels, used to scale the x coordinates.
        image_h: Image height in pixels, used to scale the y coordinates.

    Returns:
        A string of ``class_id conf x_min y_min x_max y_max`` groups joined by
        single spaces, one group per detection. If the file is missing or
        empty, the placeholder ``"14 1 0 0 1 1"`` is returned.
    """
    # Placeholder row used when there are no detections
    # (presumably class 14 is the "no finding" class — confirm against the competition spec).
    no_detection = "14 1 0 0 1 1"

    if not filename.exists():
        return no_detection

    try:
        labels: pd.DataFrame = pd.read_csv(filename, delimiter=" ", header=None)
    except pd.errors.EmptyDataError:
        # A file that exists but has no rows carries no detections either.
        return no_detection
    labels.columns = ["class_id", "x_centre", "y_centre", "bw", "bh", "conf"]

    # Convert YOLO format (x_centre, y_centre, bw, bh) to competition format (x_min, y_min, x_max, y_max)
    half_w = labels["bw"] / 2
    half_h = labels["bh"] / 2
    labels["x_min"] = labels["x_centre"] - half_w
    labels["y_min"] = labels["y_centre"] - half_h
    labels["x_max"] = labels["x_centre"] + half_w
    labels["y_max"] = labels["y_centre"] + half_h
    labels = labels.drop(columns=["x_centre", "y_centre", "bw", "bh"])
    # After dropping, conf column should become the second one.
    assert labels.columns.to_list() == ["class_id", "conf", "x_min", "y_min", "x_max", "y_max"]

    # Scale coordinates to image's size. Clip to make sure it's not out of bounds of the image.
    # Columns are assigned one at a time so each is replaced outright and ends up with an
    # integer dtype, independent of how pandas handles multi-column assignment.
    for column, size in (("x_min", image_w), ("x_max", image_w), ("y_min", image_h), ("y_max", image_h)):
        labels[column] = (labels[column] * size).round().astype(np.int32).clip(0, size - 1)

    # Convert all rows to one prediction string
    return " ".join(labels.to_string(header=False, index=False).split())

In [7]:
# Per-image dimensions keyed by image_id, e.g. {image_id: {"dim0": ..., "dim1": ...}}.
test_metadata = pd.read_csv(INPUT_FOLDER_ORIGINAL_PNG / "test_meta.csv")
test_metadata = test_metadata.set_index("image_id").to_dict("index")

# Collect one record per test image and build the frame once at the end:
# DataFrame.append copied the whole frame on every iteration and no longer exists in pandas 2.x.
prediction_rows = []
for image_id, image_dims in tqdm(test_metadata.items(), total=len(test_metadata)):
    # NOTE: dim0 and dim1 are reversed: y-axis, x-axis!
    prediction_str = read_prediction_labels(
        INPUT_FOLDER_YOLO_OUT / "labels_pred" / f"{image_id}.txt", image_dims["dim1"], image_dims["dim0"]
    )
    prediction_rows.append({"image_id": image_id, "PredictionString": prediction_str})

# Explicit columns keep the expected schema even if there are no test images.
results_yolo_df = pd.DataFrame(prediction_rows, columns=["image_id", "PredictionString"])

results_yolo_df.sample(10)

HBox(children=(FloatProgress(value=0.0, max=3000.0), HTML(value='')))




Unnamed: 0,image_id,PredictionString
598,37e7deb1b1895f2d2648781e426a3460,5 0.220825 24 166 917 1855 13 0.527832 1599 15...
359,21a3312b6e4ab747c09c526b77cb0716,13 0.239990 1527 647 1850 999 13 0.606445 1669...
2314,c70a517fbbd3b5dee67d304ba3e81cd8,0 0.238037 1275 831 1584 1214
2456,d242d48dd1067b27d7d13b0f7ce02640,0 0.190308 974 712 1180 947 11 0.398193 502 26...
400,25f92d35866a758f4175812fe3f7f47c,0 0.259766 1422 645 1727 880 13 0.789062 1944 ...
165,0e48bfb79ac12bbcce9b66cc9f14544c,5 0.242432 378 1372 623 1794 11 0.257324 1139 ...
2045,b0b42b663139625afc6e29526ae1ecfe,13 0.163452 888 642 1298 1236 2 0.233154 1649 ...
980,594fb89413087c080ddd2c97a3f6acf0,3 0.654297 1181 1659 2351 1974 0 0.896484 1577...
1925,a7b8a9e921ea304362918523cc3529ef,10 0.612793 1186 1280 1839 2049 7 0.671387 128...
1387,79054bd52e5f6ebcda94fdc54e2fada5,14 1 0 0 1 1


## Merge with Binary classifier output

In [8]:
# Binary classifier verdict per image: columns image_id and class_name ("normal" / "abnormal").
results_binary_df = pd.read_csv(INPUT_FOLDER_BINARY / "prediction.csv")
display(results_binary_df.head())

# Left join so every test image stays in the submission even if the classifier output lacks it
# (an inner join would silently drop such rows). validate= fails loudly on duplicated image ids,
# which would otherwise duplicate submission rows.
results_df = results_yolo_df.merge(results_binary_df, on="image_id", how="left", validate="one_to_one")

unmatched_count = int(results_df["class_name"].isna().sum())
if unmatched_count:
    print(f"Warning: {unmatched_count} images have no binary prediction; keeping their YOLO output")

# Images the classifier calls normal get the placeholder prediction instead of the YOLO boxes.
results_df.loc[results_df["class_name"] == "normal", "PredictionString"] = "14 1 0 0 1 1"
results_df = results_df.drop(columns=["class_name"])
assert len(results_df) == len(results_yolo_df), "submission must contain one row per test image"

results_df.to_csv(WORK_FOLDER / "submission.csv", index=False)
display(results_df.head())

Unnamed: 0,image_id,class_name
0,8408eea460f0a9a2f9aec40da98917cb,normal
1,436088e347319cbffc494a8d24005299,normal
2,ec9bb4e880f135642dbb5f38d4a6d16b,abnormal
3,f31c9526ebdf691e426925e8da3af65c,abnormal
4,ea10cb0d473159762db63e0493a079b3,normal


Unnamed: 0,image_id,PredictionString
0,002a34c58c5b758217ed1f584ccbcfe9,14 1 0 0 1 1
1,004f33259ee4aef671c2b95d54e4be68,0 0.883301 1256 588 1529 914
2,008bdde2af2462e86fd373a445d0f4cd,0 0.877930 1436 827 1741 1195 3 0.898926 1100 ...
3,009bc039326338823ca3aa84381f17f1,5 0.242554 204 596 709 1305 7 0.263916 1468 10...
4,00a2145de1886cb9eb88869c85d74080,0 0.821777 1120 713 1354 955 3 0.927246 776 12...


## Submit to Kaggle

In [9]:
# Upload WORK_FOLDER/submission.csv to the competition, using submission_message as its description.
!kaggle competitions submit \
    vinbigdata-chest-xray-abnormalities-detection \
    -f {WORK_FOLDER}/submission.csv \
    -m "{submission_message}"
# Pause before listing submissions (presumably so the new entry has a score by then).
!sleep 10
# Show the recent submissions and their scores for this competition.
!kaggle competitions submissions vinbigdata-chest-xray-abnormalities-detection

100% 237k/237k [00:01<00:00, 129kB/s]
fileName               date                 description                                                                                                                                   status    publicScore  privateScore  
---------------------  -------------------  --------------------------------------------------------------------------------------------------------------------------------------------  --------  -----------  ------------  
submission.csv         2021-03-18 03:53:10  WBF preproc, YOLOv5x, 50 epochs, random rad, 20% valid split, 1024 size, IOU 0.35, conf 0.15 + binary classifier on 512 res and 0.8 thresh    complete  0.141        None          
submission.csv         2021-03-18 00:00:04  WBF preproc, YOLOv5x, 50 epochs, random rad, 20% valid split, 1024 size, IOU 0.35, conf 0.15 + binary classifier on 1024 res and 0.8 thresh   complete  0.147        None          
submission.csv         2021-03-17 16:17:50  YOLOv5x, 50 epochs, ra