In [None]:
try:
    from google.colab import drive
    drive.mount("/content/drive")
    %cd /content/drive/MyDrive/Colab\ Notebooks/kaggle
    from setup_colab import setup_colab_for_kaggle
    setup_colab_for_kaggle(check_env=False, local_working=True)
except:
    print("Not in Colab")

Mounted at /content/drive
/content/drive/MyDrive/Colab Notebooks/kaggle
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Content of Drive Kaggle data dir (/content/drive/MyDrive/kaggle): ['/content/drive/MyDrive/kaggle/input', '/content/drive/MyDrive/kaggle/working', '/content/drive/MyDrive/kaggle/.ipynb_checkpoints', '/content/drive/MyDrive/kaggle/output']
Content of Kaggle data dir (/kaggle): ['/kaggle/input', '/kaggle/output', '/kaggle/working']
Content of Kaggle data subdir (/kaggle/input): ['/kaggle/input/cassava-model', '/kaggle/input/cassava-leaf-disease-classification', '/kaggle/input/googlebitemperedloss', '/kaggle/input/vbdyolo', '/kaggle/input/.ipynb_checkpoints', '/kaggle/input/vinbigdata', '/kaggle/input/vinbigdata-chest-xray-abnormalities-detection', '/kaggle/input/vinbigdata-chest-xray-original-png']
Content of Kaggle data subdir (/kaggle/output): ['/kaggle/output/vbdyolo_out_1_300epochs', '/

In [34]:
from pathlib import Path
import shutil

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

# Competition data; `train.csv` is read from here.
INPUT_FOLDER = Path("/kaggle/input/vinbigdata-chest-xray-abnormalities-detection")
# PNG version of the dataset; `train_meta.csv`, `train/` and `test/` are read from here.
INPUT_FOLDER_PNG = Path("/kaggle/input/vinbigdata")
# Destination of the YOLO-formatted dataset built by this notebook.
OUTPUT_FOLDER = Path("/kaggle/output/vbdyolo")

# Seed NumPy's global RNG so any `np.random` draw below is repeatable.
np.random.seed(0)

Add metadata to the train dataframe (dimensions of the original image), drop class 14 ("No finding"), and compute the box coordinates in YOLO format: normalized centre (x_centre, y_centre) and normalized size (bw, bh).

In [None]:
def convert_to_yolo(bbox_df):
    """Return a copy of ``bbox_df`` with normalized corners and YOLO box columns.

    Args:
        bbox_df: DataFrame with the columns ``x_min``, ``x_max``, ``y_min``,
            ``y_max`` (box corners) and ``dim0``, ``dim1`` (image dimensions).

    Returns:
        A new DataFrame in which ``x_min``/``x_max`` are divided by ``dim0``
        and ``y_min``/``y_max`` by ``dim1``, with four added columns:
        ``x_centre``, ``y_centre`` (box centre) and ``bw``, ``bh`` (box
        width and height), all computed from the normalized corners.
        The input frame is left unmodified.
    """
    # Work on a copy: mutating the caller's frame in place would make a second
    # call on the same frame normalize already-normalized coordinates.
    yolo_df = bbox_df.copy()

    # Normalize the coordinates.
    # NOTE(review): x is scaled by dim0 and y by dim1. If train_meta.csv stores
    # the dimensions in array-shape order (rows/height first), these divisors
    # are swapped - confirm which of dim0/dim1 is the image width.
    yolo_df[["x_min", "x_max"]] = yolo_df[["x_min", "x_max"]].div(yolo_df["dim0"], axis=0)
    yolo_df[["y_min", "y_max"]] = yolo_df[["y_min", "y_max"]].div(yolo_df["dim1"], axis=0)

    # Get YOLO coordinates: box centre plus box width/height.
    yolo_df["x_centre"] = (yolo_df["x_min"] + yolo_df["x_max"]) / 2
    yolo_df["y_centre"] = (yolo_df["y_min"] + yolo_df["y_max"]) / 2
    yolo_df["bw"] = yolo_df["x_max"] - yolo_df["x_min"]
    yolo_df["bh"] = yolo_df["y_max"] - yolo_df["y_min"]
    return yolo_df

# Per-image metadata (joined on `image_id`); supplies the `dim0`/`dim1`
# columns that `convert_to_yolo` needs.
train_meta_df = pd.read_csv(INPUT_FOLDER_PNG / "train_meta.csv")

# Annotations -> join metadata -> drop "No finding" rows -> add YOLO columns.
train_df = (
    pd.read_csv(INPUT_FOLDER / "train.csv")
    .merge(train_meta_df, on="image_id")
    .loc[lambda merged: merged["class_name"] != "No finding"]
    .reset_index(drop=True)
    .pipe(convert_to_yolo)
)
train_df.head()

Unnamed: 0,image_id,class_name,class_id,rad_id,x_min,y_min,x_max,y_max,dim0,dim1,x_centre,y_centre,bw,bh
0,9a5094b2563a1ef3ff50dc5c7ff71345,Cardiomegaly,3,R10,0.295805,0.661058,0.70762,0.880288,2336,2080,0.501712,0.770673,0.411815,0.219231
1,9a5094b2563a1ef3ff50dc5c7ff71345,Pleural effusion,10,R9,0.765839,0.83125,0.802654,0.957692,2336,2080,0.784247,0.894471,0.036815,0.126442
2,9a5094b2563a1ef3ff50dc5c7ff71345,Pleural thickening,11,R9,0.765839,0.83125,0.802654,0.957692,2336,2080,0.784247,0.894471,0.036815,0.126442
3,9a5094b2563a1ef3ff50dc5c7ff71345,Cardiomegaly,3,R9,0.296233,0.661058,0.709332,0.864904,2336,2080,0.502783,0.762981,0.413099,0.203846
4,9a5094b2563a1ef3ff50dc5c7ff71345,Cardiomegaly,3,R8,0.294949,0.63125,0.713185,0.847596,2336,2080,0.504067,0.739423,0.418236,0.216346


Split into Train/Validation datasets. Use only 1 fold for now.

In [None]:
from sklearn.model_selection import GroupShuffleSplit

# Single 90/10 split, grouped by image so that every box of a given image
# lands on the same side of the split.
gss: GroupShuffleSplit = GroupShuffleSplit(n_splits=1, test_size=0.1, random_state=32)
image_groups = train_df["image_id"].to_list()
train_idx, valid_idx = next(gss.split(train_df, groups=image_groups))

train_data = train_df.iloc[train_idx]
valid_data = train_df.iloc[valid_idx]
len(train_data), len(valid_data)

(32520, 3576)

Start by using all of the available labels.

In [None]:
def reduce_bboxes_random_rad(image_labels):
    """Keep only the rows annotated by one randomly chosen ``rad_id``.

    The radiologist is drawn with ``np.random.choice`` (NumPy's global RNG)
    from the distinct ``rad_id`` values present in ``image_labels``.
    """
    radiologists = image_labels["rad_id"].unique()
    chosen_rad = np.random.choice(radiologists)
    is_chosen = image_labels["rad_id"] == chosen_rad
    return image_labels.loc[is_chosen]


def get_yolo_labels_txt(image_labels):
    """Serialize the boxes of one image as the text of a YOLO label file.

    Every row of ``image_labels`` becomes one line of the form
    ``class_id x_centre y_centre bw bh``: the class id as an integer and the
    four coordinates with six decimals, separated by single spaces. Lines are
    joined with ``"\\n"`` and there is no trailing newline. An empty frame
    yields an empty string.

    All rows are used. To keep only one radiologist's boxes, filter the frame
    with ``reduce_bboxes_random_rad`` before calling this function.

    Args:
        image_labels: DataFrame with the columns ``class_id``, ``x_centre``,
            ``y_centre``, ``bw`` and ``bh``.

    Returns:
        The label file content as a ``str``.
    """
    label_columns = ["class_id", "x_centre", "y_centre", "bw", "bh"]
    # Format explicitly rather than with DataFrame.to_string(): that is a
    # display routine whose output depends on pandas display options and
    # column padding, and which renders an empty frame as "Empty DataFrame ...".
    rows = image_labels[label_columns].itertuples(index=False, name=None)
    return "\n".join(
        f"{int(class_id)} {x_centre:.6f} {y_centre:.6f} {bw:.6f} {bh:.6f}"
        for class_id, x_centre, y_centre, bw, bh in rows
    )

In [36]:
# Rebuild the dataset from scratch so files from an earlier run cannot linger.
if OUTPUT_FOLDER.exists():
    shutil.rmtree(OUTPUT_FOLDER)

# parents=True: also create missing ancestor directories instead of failing
# when the parent of OUTPUT_FOLDER does not exist yet.
OUTPUT_FOLDER.mkdir(parents=True)

# Layout written below:
#   OUTPUT_FOLDER/images/<split>/<image_id>.png
#   OUTPUT_FOLDER/labels/<split>/<image_id>.txt
for set_name, data in (("train", train_data), ("valid", valid_data)):
    images_dir = OUTPUT_FOLDER / "images" / set_name
    labels_dir = OUTPUT_FOLDER / "labels" / set_name
    images_dir.mkdir(parents=True)
    labels_dir.mkdir(parents=True)

    # One iteration per image: copy its PNG and write all of its boxes to the
    # label file with the same stem.
    for image_id, image_grouped_labels in tqdm(data.groupby("image_id"), total=data["image_id"].nunique()):
        image_file_name = f"{image_id}.png"
        shutil.copyfile(
            INPUT_FOLDER_PNG / "train" / image_file_name,
            images_dir / image_file_name,
        )
        with open(labels_dir / f"{image_id}.txt", "w") as f:
            f.write(get_yolo_labels_txt(image_grouped_labels))

HBox(children=(FloatProgress(value=0.0, max=3954.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=440.0), HTML(value='')))




In [38]:
# Just copy all the test files as well.
test_images_dir = OUTPUT_FOLDER / "images" / "test"
# shutil.copytree raises if the destination already exists; clear a previous
# copy first so this cell can be re-run on its own.
if test_images_dir.exists():
    shutil.rmtree(test_images_dir)
shutil.copytree(INPUT_FOLDER_PNG / "test", test_images_dir)

PosixPath('/kaggle/output/vbdyolo/images/test')

Store configuration of the dataset.

In [52]:
# One row per distinct (class_name, class_id) pair, ordered by class_id and
# re-indexed from 0.
classes = (
    train_df[["class_name", "class_id"]]
    .drop_duplicates()
    .sort_values(by="class_id")
    .reset_index(drop=True)
)
classes

Unnamed: 0,class_name,class_id
0,Aortic enlargement,0
1,Atelectasis,1
2,Calcification,2
3,Cardiomegaly,3
4,Consolidation,4
5,ILD,5
6,Infiltration,6
7,Lung Opacity,7
8,Nodule/Mass,8
9,Other lesion,9


In [58]:
import yaml

# Class names in the row order of `classes`.
# NOTE(review): assumes that row order matches class_id (0..nc-1) so a name's
# list position equals its class id - confirm against `classes`.
class_names = classes["class_name"].values

dataset_config = {
    "nc": len(class_names),
    "names": list(class_names),
    # Derived from OUTPUT_FOLDER (instead of repeating the path as a literal)
    # so the config keeps pointing at the directories the images were written to.
    "train": str(OUTPUT_FOLDER / "images" / "train"),
    "val": str(OUTPUT_FOLDER / "images" / "valid"),
}

# Write the config next to the generated images/ and labels/ folders.
with open(OUTPUT_FOLDER / "vbd-dataset.yaml", "w") as f:
    yaml.dump(dataset_config, f, default_flow_style=False)