# Manage Kaggle datasets for VinBigData Chest X-Ray competition

In [1]:
try:
    from google.colab import drive
    drive.mount("/content/drive")
    %cd /content/drive/MyDrive/Colab\ Notebooks/kaggle
    from setup_colab import setup_colab_for_kaggle
    setup_colab_for_kaggle(check_env=False, local_working=True)
except:
    print("Not in Colab")

Mounted at /content/drive
/content/drive/MyDrive/Colab Notebooks/kaggle
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Content of Drive Kaggle data dir (/content/drive/MyDrive/kaggle): ['/content/drive/MyDrive/kaggle/input', '/content/drive/MyDrive/kaggle/working', '/content/drive/MyDrive/kaggle/.ipynb_checkpoints', '/content/drive/MyDrive/kaggle/output']
Content of Kaggle data dir (/kaggle): ['/kaggle/input', '/kaggle/output', '/kaggle/working']
Content of Kaggle data subdir (/kaggle/input): ['/kaggle/input/cassava-model', '/kaggle/input/cassava-leaf-disease-classification', '/kaggle/input/googlebitemperedloss', '/kaggle/input/vbdyolo', '/kaggle/input/.ipynb_checkpoints', '/kaggle/input/vinbigdata', '/kaggle/input/vinbigdata-chest-xray-abnormalities-detection', '/kaggle/input/vinbigdata-chest-xray-original-png']
Content of Kaggle data subdir (/kaggle/output): ['/kaggle/output/vbdyolo_out_1_300epochs', '/

In [2]:
import json
from pathlib import Path
import shutil

from kaggle.api.kaggle_api_extended import KaggleApi
# Shared Kaggle API client used by every download/upload/submit cell below.
kaggle = KaggleApi()
# NOTE(review): authenticate() presumably picks up the credentials from the
# environment or a kaggle.json file -- confirm against the Kaggle API docs.
kaggle.authenticate()

In [3]:
def try_unzip(file_path):
    """Extract ``<file_path>.zip`` into its own directory and delete the archive.

    Does nothing when no ``<file_path>.zip`` exists, so it is safe to call for
    files that were downloaded uncompressed.

    Args:
        file_path: Path (``str`` or ``Path``) of the expected extracted file,
            i.e. the archive path without the trailing ``.zip``.

    Raises:
        shutil.ReadError: If the archive exists but cannot be unpacked. The
            archive is left in place in that case.
    """
    zip_path = Path(f"{Path(file_path)}.zip")
    if not zip_path.exists():
        return

    # Extract with the standard library instead of shelling out to `unzip`, so
    # paths containing spaces or shell metacharacters are handled safely.
    # Existing files are overwritten, matching `unzip -o`.
    shutil.unpack_archive(str(zip_path), str(zip_path.parent), format="zip")
    # Only reached when extraction succeeded.
    zip_path.unlink()

In [4]:
def ensure_metadata_present(dir_path, user_name, dataset_name):
    """Write a minimal ``dataset-metadata.json`` into ``dir_path`` if it is missing.

    An existing metadata file is never touched.

    Args:
        dir_path: Directory that should contain the metadata file.
        user_name: Owner name, used as the first part of the dataset ``id``.
        dataset_name: Dataset slug, used both as ``title`` and in the ``id``.
    """
    metadata_file = Path(dir_path, "dataset-metadata.json")
    if metadata_file.exists():
        return

    metadata = {
        "title": dataset_name,
        "id": f"{user_name}/{dataset_name}",
        "licenses": [{"name": "CC0-1.0"}],
    }
    with metadata_file.open("w") as handle:
        handle.write(json.dumps(metadata, indent=4))

## Download

### Get the competition dataset
With only a sample of the DICOM data, since the full set totals ~200 GB

In [None]:
%cd /kaggle/input

competition_files_include = ["train.csv", "sample_submission.csv"]
competition_name = "vinbigdata-chest-xray-abnormalities-detection"

all_files = kaggle.competitions_data_list_files(competition_name)
for set_name in ["train", "test"]:
    competition_files_include += [file["name"] for file in all_files if file["name"].startswith(set_name)][:10]

if Path(competition_name).exists():
    shutil.rmtree(competition_name)

for file_name in competition_files_include:
    out_file = Path(competition_name, file_name)
    kaggle.competition_download_file(competition_name, file_name, path=out_file.parent, quiet=True)
    try_unzip(out_file)

100%|██████████| 1.79M/1.79M [00:00<00:00, 55.3MB/s]

Downloading train.csv.zip to vinbigdata-chest-xray-abnormalities-detection

Archive:  vinbigdata-chest-xray-abnormalities-detection/train.csv.zip
  inflating: vinbigdata-chest-xray-abnormalities-detection/train.csv  







100%|██████████| 135k/135k [00:00<00:00, 19.6MB/s]

Downloading sample_submission.csv to vinbigdata-chest-xray-abnormalities-detection




  0%|          | 0.00/4.74M [00:00<?, ?B/s]

Downloading 004d2bc2111d639f5e8441ced52d55cb.dicom.zip to vinbigdata-chest-xray-abnormalities-detection/train


100%|██████████| 4.74M/4.74M [00:00<00:00, 20.9MB/s]


Archive:  vinbigdata-chest-xray-abnormalities-detection/train/004d2bc2111d639f5e8441ced52d55cb.dicom.zip
  inflating: vinbigdata-chest-xray-abnormalities-detection/train/004d2bc2111d639f5e8441ced52d55cb.dicom  



100%|██████████| 9.31M/9.31M [00:00<00:00, 64.0MB/s]

Downloading 001d127bad87592efe45a5c7678f8b8d.dicom.zip to vinbigdata-chest-xray-abnormalities-detection/train






Archive:  vinbigdata-chest-xray-abnormalities-detection/train/001d127bad87592efe45a5c7678f8b8d.dicom.zip
  inflating: vinbigdata-chest-xray-abnormalities-detection/train/001d127bad87592efe45a5c7678f8b8d.dicom  


 36%|███▋      | 5.00M/13.7M [00:00<00:00, 39.7MB/s]

Downloading 000ae00eb3942d27e0b97903dd563a6e.dicom.zip to vinbigdata-chest-xray-abnormalities-detection/train


100%|██████████| 13.7M/13.7M [00:00<00:00, 59.5MB/s]



Archive:  vinbigdata-chest-xray-abnormalities-detection/train/000ae00eb3942d27e0b97903dd563a6e.dicom.zip
  inflating: vinbigdata-chest-xray-abnormalities-detection/train/000ae00eb3942d27e0b97903dd563a6e.dicom  


100%|██████████| 6.33M/6.33M [00:00<00:00, 51.6MB/s]

Downloading 0032c6091dc8f1b1245fc2f5f45458fa.dicom.zip to vinbigdata-chest-xray-abnormalities-detection/train






Archive:  vinbigdata-chest-xray-abnormalities-detection/train/0032c6091dc8f1b1245fc2f5f45458fa.dicom.zip
  inflating: vinbigdata-chest-xray-abnormalities-detection/train/0032c6091dc8f1b1245fc2f5f45458fa.dicom  


100%|██████████| 6.30M/6.30M [00:00<00:00, 82.5MB/s]

Downloading 000d68e42b71d3eac10ccc077aba07c1.dicom.zip to vinbigdata-chest-xray-abnormalities-detection/train

Archive:  vinbigdata-chest-xray-abnormalities-detection/train/000d68e42b71d3eac10ccc077aba07c1.dicom.zip
  inflating: vinbigdata-chest-xray-abnormalities-detection/train/000d68e42b71d3eac10ccc077aba07c1.dicom  







100%|██████████| 9.08M/9.08M [00:00<00:00, 63.0MB/s]

Downloading 000434271f63a053c4128a0ba6352c7f.dicom.zip to vinbigdata-chest-xray-abnormalities-detection/train






Archive:  vinbigdata-chest-xray-abnormalities-detection/train/000434271f63a053c4128a0ba6352c7f.dicom.zip
  inflating: vinbigdata-chest-xray-abnormalities-detection/train/000434271f63a053c4128a0ba6352c7f.dicom  


100%|██████████| 4.63M/4.63M [00:00<00:00, 65.0MB/s]

Downloading 00053190460d56c53cc3e57321387478.dicom.zip to vinbigdata-chest-xray-abnormalities-detection/train

Archive:  vinbigdata-chest-xray-abnormalities-detection/train/00053190460d56c53cc3e57321387478.dicom.zip
  inflating: vinbigdata-chest-xray-abnormalities-detection/train/00053190460d56c53cc3e57321387478.dicom  







100%|██████████| 8.71M/8.71M [00:00<00:00, 55.8MB/s]

Downloading 0007d316f756b3fa0baea2ff514ce945.dicom.zip to vinbigdata-chest-xray-abnormalities-detection/train






Archive:  vinbigdata-chest-xray-abnormalities-detection/train/0007d316f756b3fa0baea2ff514ce945.dicom.zip
  inflating: vinbigdata-chest-xray-abnormalities-detection/train/0007d316f756b3fa0baea2ff514ce945.dicom  


100%|██████████| 9.58M/9.58M [00:00<00:00, 51.3MB/s]

Downloading 00291f7aff0123ea76a59998effef229.dicom.zip to vinbigdata-chest-xray-abnormalities-detection/train






Archive:  vinbigdata-chest-xray-abnormalities-detection/train/00291f7aff0123ea76a59998effef229.dicom.zip
  inflating: vinbigdata-chest-xray-abnormalities-detection/train/00291f7aff0123ea76a59998effef229.dicom  


100%|██████████| 4.19M/4.19M [00:00<00:00, 62.4MB/s]

Downloading 003cfe5ce5c0ec5163138eb3b740e328.dicom.zip to vinbigdata-chest-xray-abnormalities-detection/train

Archive:  vinbigdata-chest-xray-abnormalities-detection/train/003cfe5ce5c0ec5163138eb3b740e328.dicom.zip
  inflating: vinbigdata-chest-xray-abnormalities-detection/train/003cfe5ce5c0ec5163138eb3b740e328.dicom  







100%|██████████| 9.49M/9.49M [00:00<00:00, 59.2MB/s]

Downloading 0168eb925aa6f28a78b16134792f5d0e.dicom.zip to vinbigdata-chest-xray-abnormalities-detection/test






Archive:  vinbigdata-chest-xray-abnormalities-detection/test/0168eb925aa6f28a78b16134792f5d0e.dicom.zip
  inflating: vinbigdata-chest-xray-abnormalities-detection/test/0168eb925aa6f28a78b16134792f5d0e.dicom  


100%|██████████| 8.09M/8.09M [00:00<00:00, 92.6MB/s]

Downloading 008bdde2af2462e86fd373a445d0f4cd.dicom.zip to vinbigdata-chest-xray-abnormalities-detection/test






Archive:  vinbigdata-chest-xray-abnormalities-detection/test/008bdde2af2462e86fd373a445d0f4cd.dicom.zip
  inflating: vinbigdata-chest-xray-abnormalities-detection/test/008bdde2af2462e86fd373a445d0f4cd.dicom  


 46%|████▌     | 6.00M/13.1M [00:00<00:00, 59.8MB/s]

Downloading 0171021638f9272a34a41feb84ed47a0.dicom.zip to vinbigdata-chest-xray-abnormalities-detection/test


100%|██████████| 13.1M/13.1M [00:00<00:00, 42.9MB/s]



Archive:  vinbigdata-chest-xray-abnormalities-detection/test/0171021638f9272a34a41feb84ed47a0.dicom.zip
  inflating: vinbigdata-chest-xray-abnormalities-detection/test/0171021638f9272a34a41feb84ed47a0.dicom  


100%|██████████| 5.88M/5.88M [00:00<00:00, 94.2MB/s]

Downloading 01ded16689539deb30d0981fafd18465.dicom.zip to vinbigdata-chest-xray-abnormalities-detection/test

Archive:  vinbigdata-chest-xray-abnormalities-detection/test/01ded16689539deb30d0981fafd18465.dicom.zip
  inflating: vinbigdata-chest-xray-abnormalities-detection/test/01ded16689539deb30d0981fafd18465.dicom  



100%|██████████| 6.84M/6.84M [00:00<00:00, 79.2MB/s]

Downloading 002a34c58c5b758217ed1f584ccbcfe9.dicom.zip to vinbigdata-chest-xray-abnormalities-detection/test






Archive:  vinbigdata-chest-xray-abnormalities-detection/test/002a34c58c5b758217ed1f584ccbcfe9.dicom.zip
  inflating: vinbigdata-chest-xray-abnormalities-detection/test/002a34c58c5b758217ed1f584ccbcfe9.dicom  


100%|██████████| 5.65M/5.65M [00:00<00:00, 71.6MB/s]

Downloading 01431a2618c0ace741e4e270a37e20b9.dicom.zip to vinbigdata-chest-xray-abnormalities-detection/test






Archive:  vinbigdata-chest-xray-abnormalities-detection/test/01431a2618c0ace741e4e270a37e20b9.dicom.zip
  inflating: vinbigdata-chest-xray-abnormalities-detection/test/01431a2618c0ace741e4e270a37e20b9.dicom  


100%|██████████| 5.60M/5.60M [00:00<00:00, 115MB/s]

Downloading 004f33259ee4aef671c2b95d54e4be68.dicom.zip to vinbigdata-chest-xray-abnormalities-detection/test

Archive:  vinbigdata-chest-xray-abnormalities-detection/test/004f33259ee4aef671c2b95d54e4be68.dicom.zip
  inflating: vinbigdata-chest-xray-abnormalities-detection/test/004f33259ee4aef671c2b95d54e4be68.dicom  



100%|██████████| 9.53M/9.53M [00:00<00:00, 66.9MB/s]

Downloading 02425334e92510da663eb913ad0632ea.dicom.zip to vinbigdata-chest-xray-abnormalities-detection/test






Archive:  vinbigdata-chest-xray-abnormalities-detection/test/02425334e92510da663eb913ad0632ea.dicom.zip
  inflating: vinbigdata-chest-xray-abnormalities-detection/test/02425334e92510da663eb913ad0632ea.dicom  


100%|██████████| 6.51M/6.51M [00:00<00:00, 92.9MB/s]

Downloading 013c169f9dad6f1f6485da961b9f7bf2.dicom.zip to vinbigdata-chest-xray-abnormalities-detection/test

Archive:  vinbigdata-chest-xray-abnormalities-detection/test/013c169f9dad6f1f6485da961b9f7bf2.dicom.zip
  inflating: vinbigdata-chest-xray-abnormalities-detection/test/013c169f9dad6f1f6485da961b9f7bf2.dicom  







100%|██████████| 9.42M/9.42M [00:00<00:00, 57.7MB/s]

Downloading 022a62478444fedf2dea1ba91aafdc48.dicom.zip to vinbigdata-chest-xray-abnormalities-detection/test






Archive:  vinbigdata-chest-xray-abnormalities-detection/test/022a62478444fedf2dea1ba91aafdc48.dicom.zip
  inflating: vinbigdata-chest-xray-abnormalities-detection/test/022a62478444fedf2dea1ba91aafdc48.dicom  


### Get PNG images resized to 512x512

In [None]:
%cd /kaggle/input

dataset_name = "vinbigdata"
user_name = "xhlulu"

if Path(dataset_name).exists():
    shutil.rmtree(dataset_name)

kaggle.dataset_download_files(f"{user_name}/{dataset_name}", path=dataset_name, force=True, unzip=True, quiet=False)

  0%|          | 0.00/1.94G [00:00<?, ?B/s]

Downloading vinbigdata.zip to vinbigdata


100%|██████████| 1.94G/1.94G [00:20<00:00, 102MB/s]





### Get PNG in original size
Only download the meta files for now.

In [None]:
%cd /kaggle/input

dataset_name = "vinbigdata-chest-xray-original-png"
user_name = "corochann"

dataset_files_include = ["train_meta.csv", "test_meta.csv"]

if Path(dataset_name).exists():
    shutil.rmtree(dataset_name)

for file_name in dataset_files_include:
    out_file = Path(dataset_name, file_name)
    kaggle.dataset_download_file(f"{user_name}/{dataset_name}", file_name, path=out_file.parent)
    try_unzip(out_file)

/content/drive/MyDrive/kaggle/input


## Upload

In [6]:
#@title User configuration
# Kaggle user name passed to ensure_metadata_present() and datasets_status()
# by the upload cells below.
my_user_name = "witalia" #@param {type:"string"}

### Upload YOLO dataset

In [7]:
#@title YOLO dataset
%cd /kaggle/output/

dataset_name = "vbdyolo"
ensure_metadata_present(dataset_name, my_user_name, dataset_name)
version_notes = "Use all the labels from all rediologists without filtering" #@param {type:"string"}

if kaggle.datasets_status(my_user_name, dataset_name) is None:
    kaggle.dataset_create_new(dataset_name, dir_mode="zip")
else:
    kaggle.dataset_create_version(dataset_name, version_notes, dir_mode="zip")

print("Status:", kaggle.datasets_status(my_user_name, dataset_name))

/content/drive/MyDrive/kaggle/output


  0%|          | 0.00/811M [00:00<?, ?B/s]

Starting upload for file images.zip


100%|██████████| 811M/811M [00:14<00:00, 59.5MB/s]


Upload successful: images.zip (811MB)


  0%|          | 0.00/2.09M [00:00<?, ?B/s]

Starting upload for file labels.zip


100%|██████████| 2.09M/2.09M [00:02<00:00, 805kB/s]
  0%|          | 0.00/321 [00:00<?, ?B/s]

Upload successful: labels.zip (2MB)
Starting upload for file vbd-dataset.yaml


100%|██████████| 321/321 [00:02<00:00, 158B/s]  


Upload successful: vbd-dataset.yaml (321B)
Status: ready


### Upload YOLO model and predictions

In [None]:
#@title YOLO out dataset
%cd /kaggle/output/

dataset_name = "vbdyolo-out"
ensure_metadata_present(dataset_name, my_user_name, dataset_name)
version_notes = "YOLOv5 model and prediction labels (600 epochs)" #@param {type:"string"}

if kaggle.datasets_status(my_user_name, dataset_name) is None:
    kaggle.dataset_create_new(dataset_name, dir_mode="zip")
kaggle.dataset_create_version(dataset_name, version_notes, dir_mode="zip")

print("Status:", kaggle.datasets_status(my_user_name, dataset_name))

  0%|          | 0.00/13.8M [00:00<?, ?B/s]

/content/drive/MyDrive/kaggle/output
Starting upload for file yolov5_xray_best.pt


100%|██████████| 13.8M/13.8M [00:00<00:00, 45.7MB/s]


Upload successful: yolov5_xray_best.pt (14MB)


  0%|          | 0.00/293k [00:00<?, ?B/s]

Starting upload for file labels_pred.zip


100%|██████████| 293k/293k [00:00<00:00, 1.53MB/s]
  0%|          | 0.00/22.0 [00:00<?, ?B/s]

Upload successful: labels_pred.zip (293KB)
Starting upload for file .ipynb_checkpoints.zip


100%|██████████| 22.0/22.0 [00:00<00:00, 85.3B/s]


Upload successful: .ipynb_checkpoints.zip (22B)
Status: ready


### Upload source as dependencies

In [None]:
#@title Source dependencies
# Work from the notebook project directory on Drive; the folder name passed
# to dataset_create_version() below is relative to it.
%cd /content/drive/MyDrive/Colab\ Notebooks/kaggle

version_notes = "YOLOv5 code added" #@param {type:"string"}

# Upload the contents of this folder as a new dataset version.
# NOTE(review): the target dataset is presumably identified by the folder's
# dataset-metadata.json -- confirm; no existence check is done here.
kaggle.dataset_create_version("vinbigdata-chest-xray-abnormalities-detection", version_notes, dir_mode="zip")

/content/drive/MyDrive/Colab Notebooks/kaggle


  0%|          | 0.00/521 [00:00<?, ?B/s]

Starting upload for file vbd-dataset.yaml


100%|██████████| 521/521 [00:00<00:00, 1.57kB/s]
  0%|          | 0.00/2.59k [00:00<?, ?B/s]

Upload successful: vbd-dataset.yaml (521B)
Starting upload for file dicomMetadata.csv


100%|██████████| 2.59k/2.59k [00:00<00:00, 7.10kB/s]
  0%|          | 0.00/1.63k [00:00<?, ?B/s]

Upload successful: dicomMetadata.csv (3KB)
Starting upload for file parse_dicom_metadata.py


100%|██████████| 1.63k/1.63k [00:00<00:00, 4.03kB/s]
  0%|          | 0.00/3.52k [00:00<?, ?B/s]

Upload successful: parse_dicom_metadata.py (2KB)
Starting upload for file pathologies.csv


100%|██████████| 3.52k/3.52k [00:00<00:00, 8.47kB/s]
  0%|          | 0.00/1.69M [00:00<?, ?B/s]

Upload successful: pathologies.csv (4KB)
Starting upload for file vbd-chest-xray-eda.ipynb


100%|██████████| 1.69M/1.69M [00:00<00:00, 1.98MB/s]
  0%|          | 0.00/2.60k [00:00<?, ?B/s]

Upload successful: vbd-chest-xray-eda.ipynb (2MB)
Starting upload for file vbd_prepare_yolo_dataset.py


100%|██████████| 2.60k/2.60k [00:00<00:00, 7.55kB/s]
  0%|          | 0.00/27.0M [00:00<?, ?B/s]

Upload successful: vbd_prepare_yolo_dataset.py (3KB)
Starting upload for file vbd-yolov5.ipynb


100%|██████████| 27.0M/27.0M [00:01<00:00, 26.2MB/s]
  0%|          | 0.00/20.7k [00:00<?, ?B/s]

Upload successful: vbd-yolov5.ipynb (27MB)
Starting upload for file vbd-kaggle-data.ipynb


100%|██████████| 20.7k/20.7k [00:00<00:00, 47.7kB/s]


Upload successful: vbd-kaggle-data.ipynb (21KB)


https://www.kaggle.com/witalia/vinbigdatachestxraydeps

## Submit

### Submit file to competition

In [None]:
#@title Submit
%cd /kaggle/output/

submission_notes = "YOLOv5 600 epochs correct post-processing" #@param {type:"string"}
submission_file = Path("vbdsubmit", "submission.csv")

if submission_file.exists():
    kaggle.competition_submit(submission_file, submission_notes, "vinbigdata-chest-xray-abnormalities-detection")
else:
    print(f"No submission file {submission_file}")

  0%|          | 0.00/179k [00:00<?, ?B/s]

/content/drive/MyDrive/kaggle/output


100%|██████████| 179k/179k [00:00<00:00, 568kB/s]  


### See latest submission

In [None]:
# Display the first entry of the competition's submission list.
# NOTE(review): presumably the list is ordered newest-first, so [0] is the
# latest submission -- confirm. Raises IndexError if there are no submissions.
kaggle.competitions_submissions_list("vinbigdata-chest-xray-abnormalities-detection")[0]

{'date': '2021-03-11T18:49:12.543Z',
 'description': 'YOLOv5 600 epochs correct post-processing',
 'errorDescription': None,
 'fileName': 'submission.csv',
 'privateScore': None,
 'publicScore': '0.075',
 'ref': 19930087,
 'status': 'complete',
 'submittedBy': 'Vitalii',
 'submittedByRef': 'witalia',
 'teamName': 'Witalia',
 'totalBytes': 183208,
 'type': 'standard',
 'url': 'https://www.kaggle.com/submissions/19930087/19930087.raw'}