<a href="https://colab.research.google.com/github/PacktPublishing/Hands-On-Computer-Vision-with-Detectron2/blob/main/Chapter10/Detectron2_Chapter10_DataProcessing.ipynb" target="_blank"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Chapter 10 - Data Processing

## Download data
Dataset Source: Cheng, Jun (2017): brain tumor dataset. figshare. Dataset. https://doi.org/10.6084/m9.figshare.1512427.v5 

In [None]:
# Use this if any of the following cells raises a locale error (Python 3.9).
import locale
# Accept and ignore any arguments: the stdlib signature is
# `getpreferredencoding(do_setlocale=True)`, so a zero-argument replacement
# raises TypeError for callers such as `locale.getpreferredencoding(False)`.
locale.getpreferredencoding = lambda *args, **kwargs: "UTF-8"

In [1]:
!wget --quiet https://figshare.com/ndownloader/articles/1512427/versions/5 -O segbraintumors.zip
data_folder = "segbraintumors"
!unzip -q segbraintumors.zip -d {data_folder}

sets = ["1-766", "767-1532", "1533-2298", "2299-3064"]
sets = ["brainTumorDataPublic_"+_ for _ in sets]

!unzip -q {data_folder}/{sets[0]}.zip -d {data_folder}
!unzip -q {data_folder}/{sets[1]}.zip -d {data_folder}
!unzip -q {data_folder}/{sets[2]}.zip -d {data_folder}
!unzip -q {data_folder}/{sets[3]}.zip -d {data_folder}

## Create COCO dataset

In [2]:
import json
def create_headers():
  """Build a fresh COCO-style dictionary with no images or annotations yet.

  Returns:
    dict with the keys "info", "licenses", "categories", "images" and
    "annotations" (in that order). "images" and "annotations" are new empty
    lists on every call, ready to be appended to.
  """
  info = {"description": "Desc"}
  licenses = [{
    "id": 1,
    "url": "https://creativecommons.org/licenses/by/4.0/",
    "name": "CC BY 4.0",
  }]

  # Id 0 is the parent entry; ids 1-3 are the concrete classes under it.
  categories = [{"id": 0, "name": "tumors", "supercategory": "none"}]
  for category_id, name in enumerate(("meningioma", "glioma", "pituitary"), start=1):
    categories.append({"id": category_id, "name": name, "supercategory": "tumors"})

  return {
    "info": info,
    "licenses": licenses,
    "categories": categories,
    "images": [],
    "annotations": [],
  }

In [3]:
import numpy as np
def segmentation2bbox(segmentation):
  """Compute the axis-aligned bounding box of a flat polygon.

  Args:
    segmentation: flat sequence or array of coordinates laid out as
      [x1, y1, x2, y2, ...]; its length must be even and non-zero.

  Returns:
    [x_min, y_min, width, height] as native Python numbers, so the result can
    be passed straight to `json.dump` (NumPy integer scalars cannot).

  Raises:
    ValueError: if the input is empty or cannot be reshaped into (x, y) pairs.
  """
  points = np.asarray(segmentation).reshape((-1, 2))
  # `.tolist()` converts NumPy scalars into plain int/float values.
  x_min, y_min = points.min(axis=0).tolist()
  x_max, y_max = points.max(axis=0).tolist()
  return [x_min, y_min, x_max - x_min, y_max - y_min]

def bbox2area(bbox):
  """Return the area of a box given as [x, y, width, height]."""
  width, height = bbox[2], bbox[3]
  return width * height

In [4]:
def create_image_obj(id, file_name, height, width):
  """Build the COCO "images" entry for one image.

  Args:
    id: image identifier.
    file_name: name of the image file.
    height: image height.
    width: image width.

  Returns:
    dict with keys "id", "license", "file_name", "height", "width";
    "license" is always 1.
  """
  image_entry = dict(id=id, license=1)
  image_entry.update(file_name=file_name, height=height, width=width)
  return image_entry

def create_annotation_obj(id,
                          image_id,
                          category_id,
                          segmentation,
                          bbox):
  """Build the COCO "annotations" entry for one object.

  Args:
    id: annotation identifier.
    image_id: identifier of the image the annotation belongs to.
    category_id: class id; converted with `int(...)`.
    segmentation: object exposing `.tolist()` (e.g. a NumPy array) holding the
      flat polygon; it is stored wrapped in a one-element list.
    bbox: [x, y, width, height]; stored as given.

  Returns:
    dict with keys "id", "image_id", "category_id", "segmentation", "bbox",
    "iscrowd" (always 0) and "area" (computed from the bbox via `bbox2area`).
  """
  box_area = bbox2area(bbox)
  annotation = dict(
      id=id,
      image_id=image_id,
      category_id=int(category_id),
      segmentation=[segmentation.tolist()],
      bbox=bbox,
      iscrowd=0,
  )
  annotation["area"] = box_area
  return annotation

In [5]:
import os
import h5py
from tqdm import tqdm
import cv2

# Convert every .mat sample into a JPEG plus COCO image/annotation entries.
output_folder = data_folder+"_coco"
os.makedirs(output_folder, exist_ok = True)
data_json = create_headers()

# Samples are read from files named 1.mat ... 3064.mat inside `data_folder`.
for i in tqdm(range(1, 3064+1)):
  with h5py.File(f'{data_folder}/{i}.mat', 'r') as f:
    obj = f['cjdata']
    # Step 1: extract image, rescale it so its maximum is 255, write it to file
    image = obj['image'][:, :].astype('float64')
    peak = image.max()
    # An all-zero image would divide 0/0 and turn every pixel into NaN;
    # leave it untouched instead.
    if peak != 0:
      image = (image/peak)*255.0
    file_name = f"{i}.jpg"
    cv2.imwrite(os.path.join(output_folder, file_name), image)
    # Step 2: create JSON object for image and append it
    height, width = image.shape[:2]
    data_json["images"].append(
        create_image_obj(
          id        = i,
          file_name = file_name,
          height    = height,
          width     = width
    ))

    # Step 3: extract boundaries + labels then append them
    label = obj['label'][:, :]
    tumorBorder = obj['tumorBorder'][:, :]
    for j, lbl in enumerate(label):
      # Swap the two values of every consecutive pair, then flatten again.
      # NOTE(review): presumably converts the stored point order into the
      # (x, y) order that segmentation2bbox expects - confirm against the
      # dataset description.
      segmentation = tumorBorder[j].reshape((-1, 2))[:, [1, 0]].reshape((-1))
      bbox = segmentation2bbox(segmentation)
      # Number annotations with their own running counter so ids stay unique
      # even if a file has more than one label row (with exactly one label
      # per file this equals the image index, as before).
      annotation_id = len(data_json["annotations"]) + 1
      data_json["annotations"].append(
          create_annotation_obj(
            id            = annotation_id,
            image_id      = i,
            category_id   = lbl[0],
            bbox          = bbox,
            segmentation  = segmentation
      ))


100%|██████████| 3064/3064 [00:33<00:00, 90.55it/s]


In [6]:
# Save the combined (not yet split) COCO annotations into the image folder.
af = "_annotations.coco.json"
combined_path = os.path.join(output_folder, af)
with open(combined_path, "w") as json_file:
  json.dump(data_json, json_file)

## Perform Train/Test Split

In [7]:
import json
import os

def create_json(info,
                licenses,
                categories,
                images,
                annotations,
                file_name):
  """Write one COCO annotation file and report where it was saved.

  Args:
    info, licenses, categories, images, annotations: values stored under the
      JSON keys of the same names, in that order.
    file_name: path of the file to write; it is opened in "w" mode, so any
      existing content is replaced.

  Prints "Saved <file_name>" once the file has been written.
  """
  keys = ("info", "licenses", "categories", "images", "annotations")
  values = (info, licenses, categories, images, annotations)
  coco = dict(zip(keys, values))
  with open(file_name, "w") as out_file:
    json.dump(coco, out_file)
  print(f"Saved {file_name}")

In [8]:
# Reload the combined annotation file and pull out its five top-level sections.
name_ds = output_folder
af = "_annotations.coco.json"
with open(os.path.join(name_ds, af), "r") as f:
  annotations_json = json.load(f)

info, licenses, categories, images, annotations = (
    annotations_json[section]
    for section in ("info", "licenses", "categories", "images", "annotations")
)

In [9]:
from sklearn.model_selection import train_test_split
# `images` and `annotations` are split with the same shuffled indices, so
# entry k of one list must describe entry k of the other. Check that up front:
# a misaligned pair would otherwise silently attach annotations to the wrong
# image.
assert len(images) == len(annotations), \
    "expected exactly one annotation per image"
assert all(img["id"] == ann["image_id"] for img, ann in zip(images, annotations)), \
    "images and annotations are not in the same order"

# Stratify on the class id so train and test keep similar class proportions.
stratify = [ann['category_id'] for ann in annotations]
test_size = 0.1
images_train, images_test, annotations_train, annotations_test = train_test_split(
     images,
     annotations,
     test_size    = test_size,
     stratify     = stratify,
     random_state = 42
)

In [10]:
import shutil
from tqdm import tqdm
# Lay the dataset out as <name_ds>/train and <name_ds>/test, each holding its
# images and its own annotation file.
train_path  = os.path.join(name_ds, "train")
test_path   = os.path.join(name_ds, "test")
train_af    = os.path.join(train_path, af)
test_af     = os.path.join(test_path, af)

# One row per split: (target folder, annotation file, images, annotations).
splits = [
    (train_path, train_af, images_train, annotations_train),
    (test_path,  test_af,  images_test,  annotations_test),
]

for split_path, split_af, split_images, split_annotations in splits:
  os.makedirs(split_path, exist_ok=True)

# move images
for split_path, split_af, split_images, split_annotations in splits:
  for img in tqdm(split_images):
    shutil.move(os.path.join(name_ds, img["file_name"]), split_path)

# write annotations
for split_path, split_af, split_images, split_annotations in splits:
  create_json(info,
              licenses,
              categories,
              split_images,
              split_annotations,
              file_name = split_af)


100%|██████████| 2757/2757 [00:00<00:00, 18698.62it/s]
100%|██████████| 307/307 [00:00<00:00, 21451.56it/s]


Saved segbraintumors_coco/train/_annotations.coco.json
Saved segbraintumors_coco/test/_annotations.coco.json


In [None]:
# Workaround if the following cells raise a locale error.
import locale
# Accept and ignore any arguments: the stdlib signature is
# `getpreferredencoding(do_setlocale=True)`, so a zero-argument replacement
# raises TypeError for callers such as `locale.getpreferredencoding(False)`.
locale.getpreferredencoding = lambda *args, **kwargs: "UTF-8"

In [11]:
# Delete the combined annotation file from the dataset root; the train/ and
# test/ folders each have their own annotation file.
!rm {name_ds}/{af}

## Download the processed dataset

In [12]:
# Recursively pack the dataset folder into <name_ds>.zip (quiet mode).
!zip -q -r {name_ds}.zip {name_ds}

In [13]:
from google.colab import files
# Pass the archive to Colab's download helper.
# NOTE(review): presumably only works inside a Google Colab runtime - confirm.
files.download(f"{name_ds}.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>