# Postprocess predictions and create a submission

In [2]:
try:
    from google.colab import drive
    drive.mount("/content/drive")
    %cd /content/drive/MyDrive/Colab\ Notebooks/kaggle
    from setup_colab import setup_colab_for_kaggle, INPUT_FOLDER, WORK_FOLDER
    setup_colab_for_kaggle(check_env=False, local_working=True)
except:
    print("Not in Colab")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Colab Notebooks/kaggle
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Content of Drive Kaggle data dir (/content/drive/MyDrive/kaggle): ['/content/drive/MyDrive/kaggle/input', '/content/drive/MyDrive/kaggle/working', '/content/drive/MyDrive/kaggle/.ipynb_checkpoints', '/content/drive/MyDrive/kaggle/output']
Content of Kaggle data dir (/kaggle): ['/kaggle/output', '/kaggle/input', '/kaggle/working']
Content of Kaggle data subdir (/kaggle/input): ['/kaggle/input/cassava-model', '/kaggle/input/cassava-leaf-disease-classification', '/kaggle/input/googlebitemperedloss', '/kaggle/input/vbdyolo', '/kaggle/input/.ipynb_checkpoints', '/kaggle/input/vinbigdata', '/kaggle/input/vinbigdata-chest-xray-abnormalities-detection', '/kaggle/input/vinbigdata-chest-xray-origi

In [3]:
from pathlib import Path

import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

# Folder of the "vinbigdata-chest-xray-original-png" input dataset; test_meta.csv
# (per-image dimensions) is read from here when the submission is built.
INPUT_FOLDER_ORIGINAL_PNG = INPUT_FOLDER / "vinbigdata-chest-xray-original-png"
# Folder the YOLO output dataset is downloaded and unzipped into; the per-image
# prediction files are then read from its "labels_pred" subfolder.
INPUT_FOLDER_YOLO_OUT = WORK_FOLDER / "vbdyolo-out"

## Get data from Kaggle

In [6]:
# Download version 1 of the "witalia/vbdyolo-out" Kaggle dataset into INPUT_FOLDER_YOLO_OUT
# and unzip it there. --force makes the CLI download even if a local copy already exists.
# Version notes: initial release (YOLOv5s, random rad)
!kaggle datasets download "witalia/vbdyolo-out/version/1" -p {INPUT_FOLDER_YOLO_OUT} --unzip --force

Downloading vbdyolo-out.zip to /kaggle/working/vbdyolo-out
  0% 0.00/13.0M [00:00<?, ?B/s]
100% 13.0M/13.0M [00:00<00:00, 119MB/s]


## Process YOLO output

In [7]:
def read_prediction_labels(filename: Path, image_w: int, image_h: int) -> str:
    """Convert one YOLO prediction label file into a competition prediction string.

    Each line of the file is expected to hold six space-separated values:
    ``class_id x_centre y_centre bw bh conf``. The box values are multiplied by the
    image size below, i.e. they are treated as fractions of the image width/height.

    Args:
        filename: Path of the label file for one image.
        image_w: Image width in pixels, used to scale and clip x coordinates.
        image_h: Image height in pixels, used to scale and clip y coordinates.

    Returns:
        One space-separated string with ``class_id conf x_min y_min x_max y_max``
        repeated for every row of the file, or the placeholder ``"14 1 0 0 1 1"``
        when the file does not exist or contains no rows.
    """
    # Placeholder emitted when there are no detections for the image.
    # NOTE(review): class 14 is presumably the competition's "no finding" class — confirm.
    no_prediction = "14 1 0 0 1 1"

    if not filename.exists():
        return no_prediction

    try:
        labels: pd.DataFrame = pd.read_csv(filename, delimiter=" ", header=None)
    except pd.errors.EmptyDataError:
        # An existing but empty file carries no detections either; treat it like a
        # missing file instead of letting read_csv abort the whole submission run.
        return no_prediction
    labels.columns = ["class_id", "x_centre", "y_centre", "bw", "bh", "conf"]

    # Convert YOLO format (x_centre, y_centre, bw, bh) to competition format (x_min, y_min, x_max, y_max)
    labels["x_min"] = labels["x_centre"] - labels["bw"] / 2
    labels["y_min"] = labels["y_centre"] - labels["bh"] / 2
    labels["x_max"] = labels["x_centre"] + labels["bw"] / 2
    labels["y_max"] = labels["y_centre"] + labels["bh"] / 2
    labels = labels.drop(columns=["x_centre", "y_centre", "bw", "bh"])
    # After dropping, conf must sit right after class_id: the column order is the output order.
    assert labels.columns.to_list() == ["class_id", "conf", "x_min", "y_min", "x_max", "y_max"]

    # Scale coordinates to the image's size. Clip to make sure boxes stay inside the image.
    labels[["x_min", "x_max"]] = (labels[["x_min", "x_max"]] * image_w).round().astype(np.int32).clip(0, image_w - 1)
    labels[["y_min", "y_max"]] = (labels[["y_min", "y_max"]] * image_h).round().astype(np.int32).clip(0, image_h - 1)

    # Convert all rows to one prediction string: to_string pads columns and separates rows
    # with newlines, so split on any whitespace and re-join with single spaces.
    return " ".join(labels.to_string(header=False, index=False).split())

In [8]:
# Build the submission file: one row per test image with its prediction string.

# Per-image metadata keyed by image_id; each value is a dict of the remaining CSV columns.
test_metadata = pd.read_csv(INPUT_FOLDER_ORIGINAL_PNG / "test_meta.csv")
test_metadata = test_metadata.set_index("image_id").to_dict("index")

# Collect plain records and build the frame once at the end: DataFrame.append was removed
# in pandas 2.0, and appending inside the loop copied the whole frame on every iteration.
submission_rows = []
for image_id, image_dims in tqdm(test_metadata.items(), total=len(test_metadata)):
    # NOTE(review): dim0 is passed as image_w and dim1 as image_h. If test_meta.csv stores
    # array-shape order (rows, columns), these two arguments are swapped — confirm against
    # the dataset before changing.
    prediction_str = read_prediction_labels(
        INPUT_FOLDER_YOLO_OUT / "labels_pred" / f"{image_id}.txt", image_dims["dim0"], image_dims["dim1"]
    )
    submission_rows.append({"image_id": image_id, "PredictionString": prediction_str})

# Passing columns explicitly keeps the header even if there are no rows.
results_df = pd.DataFrame(submission_rows, columns=["image_id", "PredictionString"])

results_df.to_csv(WORK_FOLDER / "submission.csv", index=False)
# Show a random preview; cap the sample size so a small test set does not raise.
display(results_df.sample(min(10, len(results_df))))

HBox(children=(FloatProgress(value=0.0, max=3000.0), HTML(value='')))




Unnamed: 0,image_id,PredictionString
2498,d5d13d7d803507aed653e01a183e0089,0 0.796387 1106 845 1310 1126
1587,89f9bf838081221588361f1da1e9a81a,3 0.613770 803 1487 1861 1866 0 0.812012 1119 ...
2855,f40d9ec43c1455414421a190344498a6,14 1 0 0 1 1
2322,c76cf9daa7ac9cb5e2b839223a25e3e8,0 0.558105 1040 668 1299 960
949,56e3aa531cd2b4c2a597657ce69bea4f,0 0.804199 1142 792 1524 1179 3 0.851562 748 1...
476,2bc8d5ac6ce8651f95b07a2753dcb04b,14 1 0 0 1 1
724,421efe81b01f6724524502d4337d9435,3 0.681152 900 1364 1892 1930
1714,94699ec9440ff2d98c16238825994312,14 1 0 0 1 1
1873,a2d27708156f3e692475bf6088380dc9,10 0.746582 1833 1524 2177 2122
1447,7e198e152ead58293763b80301b53ea7,14 1 0 0 1 1


## Submit to Kaggle

In [25]:
# Upload WORK_FOLDER/submission.csv to the competition with the message below, wait a
# few seconds, then list the competition's submissions (the listing includes status
# and public score columns).
submission_message = "YOLOv5s postprocessed" #@param
!kaggle competitions submit \
    vinbigdata-chest-xray-abnormalities-detection \
    -f {WORK_FOLDER}/submission.csv \
    -m "{submission_message}"
!sleep 10
!kaggle competitions submissions vinbigdata-chest-xray-abnormalities-detection

  0% 0.00/179k [00:00<?, ?B/s]100% 179k/179k [00:00<00:00, 854kB/s]
fileName               date                 description                                          status    publicScore  privateScore  
---------------------  -------------------  ---------------------------------------------------  --------  -----------  ------------  
submission.csv         2021-03-13 18:26:54  YOLOv5s postprocessed                                complete  0.075        None          
submission.csv         2021-03-13 18:26:27  YOLOv5s postprocessed                                complete  0.075        None          
submission.csv         2021-03-13 01:48:25  YOLOv5 XL 300 epochs, all labels included, conf 0.3  complete  0.052        None          
submission.csv         2021-03-13 00:13:24  YOLOv5 XL 300 epochs, all labels included, conf 0.3  complete  0.052        None          
submission.csv         2021-03-13 00:07:10  YOLOv5 XL 300 epochs, all labels included            complete  0.052        