# Intro

There are many semantic segmentation tools available, and they all require image annotations in one of several specific formats. In this notebook we will create COCO annotations for the Sartorius dataset. Many conversion tools can convert from COCO to other target formats as well, so COCO is quite versatile.

In [4]:
from pycocotools.coco import COCO
from pycocotools.mask import encode, area, toBbox
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
from sklearn.model_selection import StratifiedKFold
import glob
from PIL import Image
from skimage import measure
import skimage.io as io
from shapely.geometry import Polygon, MultiPolygon
from tqdm import tqdm
import gc
from os.path import exists
from pathlib import Path

# Downloading and preparing the dataset

 - [create-coco-annotations-from-scratch](https://www.immersivelimit.com/create-coco-annotations-from-scratch)
 - [pycocotools](https://github.com/cocodataset/cocoapi/tree/master/PythonAPI)

In [5]:
# Root folder of the competition data, relative to the notebook.
data_dir = './sartorius-cell-instance-segmentation'
ROOT = Path(data_dir)

# Plain-string locations derived from data_dir.
TRAIN_CSV = data_dir + "/train.csv"
TRAIN_PATH = data_dir + "/train"
TEST_PATH = data_dir + "/test"

# Image size constants; presumably the native frame size in pixels.
# NOTE(review): not referenced by the cells visible in this notebook.
WIDTH, HEIGHT = 704, 520

# Target size for resized images.
# NOTE(review): IMAGE_RESIZE and TH are not used by the visible cells; confirm their purpose.
IMAGE_RESIZE = (224, 224)
TH = 40

In [6]:
# Load the training table. Later cells read its `id`, `cell_type`, `height`,
# `width` and `annotation` columns (one row per annotated object).
df = pd.read_csv(TRAIN_CSV)

Create a few lists with files, ids and cell type for later use:

In [35]:
# FILE_NAMES = sorted(list(Path(TRAIN_PATH).rglob('*png')))
# cell_type = []
# files = []
# for file_indx in FILE_NAMES:
#     files.append(file_indx.stem)
#     cell_type.append(df[df.id == str(file_indx.stem)].cell_type.iloc[0])

# Training image paths (as strings), built from TRAIN_PATH rather than a
# duplicated literal. glob returns files in arbitrary order, so sort them:
# the fixed-seed fold split further down is only reproducible if the input
# order is stable.
FILE_NAMES = sorted(glob.glob(f"{TRAIN_PATH}/*.png"))

# Image id = file name without extension. Path.stem is separator-agnostic,
# unlike splitting on '/'.
fids = [Path(name).stem for name in FILE_NAMES]

# Every row of an image carries the same cell type, so keep the first row per
# id and look each id up once instead of re-filtering df for every file.
_cell_type_by_id = df.drop_duplicates("id").set_index("id")["cell_type"]
cell_type = [_cell_type_by_id[fid] for fid in fids]

## Create COCO files

Conversion is pretty slow, going from binary masks to polygons.

Stratify on cell type and create one COCO .json file for train and test per fold.

In [104]:
# Cell-type name (as found in the cell_type column) -> COCO category id.
CATEGORIES = dict(shsy5y=1, astro=2, cort=3)
# ref: https://www.kaggle.com/inversion/run-length-decoding-quick-start
def rle_decode(mask_rle, mask, color=1):
    '''
    Paint a run-length-encoded annotation onto an existing 2-D mask.

    mask_rle: whitespace-separated string of "start length" pairs; starts are
              1-based offsets into the flattened mask
    mask: 2-D numpy array to draw on (written through a flattened view, so a
          contiguous input array is modified in place)
    color: value written to every pixel covered by a run
    Returns the mask reshaped to its original shape

    '''
    tokens = mask_rle.split()

    # Parse every token up front so a malformed string fails before any write.
    run_starts = [int(tok) - 1 for tok in tokens[0::2]]
    run_lengths = [int(tok) for tok in tokens[1::2]]

    flat = mask.reshape(mask.shape[0] * mask.shape[1])
    for begin, run in zip(run_starts, run_lengths):
        flat[begin:begin + run] = color

    return flat.reshape(mask.shape)

def create_segmentation(sub_mask):
    '''
    Trace the outline(s) of a binary mask and return them as COCO polygons.

    sub_mask: 2-D array, non-zero where the object is
    Returns a list of flat [x0, y0, x1, y1, ...] coordinate lists, one per
    contour that survives simplification (empty list if none does)
    '''
    # Surround the mask with one background pixel so objects touching the image
    # border still produce closed contours. The "- 1" below removes exactly
    # this offset; previously it was applied without any padding, which shifted
    # every polygon by one pixel.
    padded = np.pad(sub_mask, 1, mode='constant', constant_values=0)
    contours = measure.find_contours(padded, 0.5, positive_orientation='low')

    segmentations = []
    for contour in contours:
        # A polygon needs at least three points.
        if len(contour) <= 2:
            continue

        # Flip from (row, col) to (x, y) and subtract the padding pixel.
        xy = contour[:, ::-1] - 1

        # Make a polygon and simplify it.
        poly = Polygon(xy).simplify(1.0, preserve_topology=False)

        # simplify() can return an empty geometry or a multi-part geometry that
        # has no single exterior ring; such contours are skipped (this replaces
        # the former bare `except: pass`).
        if poly.is_empty or poly.geom_type != 'Polygon':
            continue

        coords = np.asarray(poly.exterior.coords, dtype=float)
        # Half-pixel contours on the border come out slightly negative; clamp them.
        segmentations.append(np.clip(coords, 0, 1e6).ravel().tolist())

    return segmentations

# https://www.kaggle.com/c/sartorius-cell-instance-segmentation/discussion/291371
def fill_hole(m):
    '''
    Return a copy of mask `m` with enclosed background regions set to 1.

    The mask is padded by 4 pixels, pixels below 0.5 are grouped into
    connected regions (connectivity=1), and every region whose label is beyond
    the two smallest label values is filled. Presumably those two are the
    object itself (label 0) and the outer background that reaches the padding.

    m: 2-D mask array; not modified
    '''
    margin = 4
    region_labels = measure.label(np.pad(m, margin) < 0.5, background=0, connectivity=1)
    label_values = np.unique(region_labels)

    result = m.copy()
    if len(label_values) <= 2:
        # Nothing beyond object + outer background: no holes to fill.
        return result

    # Drop the padding again so the labels line up with the input mask.
    interior = region_labels[margin:-margin, margin:-margin]
    result[np.isin(interior, label_values[2:])] = 1
    return result


# Folder (trailing slash required: it is concatenated as a string) that
# add_image checks for an optional '<image id>.png'. When the file exists,
# its red channel is AND-ed with every decoded mask of that image.
CLEAN_M = './clean-astro-mask/'


def create_single_mask(annotation, img_size, r=None):
    '''
    Build the uint8 mask of a single object.

    annotation: run-length string, decoded with rle_decode
    img_size: shape of the mask to allocate, e.g. [height, width]
    r: optional array AND-ed with the hole-filled mask; skipped when None
    Returns the decoded, hole-filled (and optionally restricted) mask
    '''
    canvas = np.zeros(img_size, dtype=np.uint8)
    filled = fill_hole(rle_decode(annotation, canvas))
    return filled if r is None else filled & r

def add_image(df, fid, fpath, tset, aid, status):
    '''
    Append one image and its object annotations to a COCO-style dict.

    df: DataFrame with id, height, width, cell_type and annotation columns
    fid: image id; selects the rows of df that belong to this image
    fpath: path of the image file (str or Path); only its file name is stored
    tset: dict with "images" and "annotations" lists, extended in place
    aid: id to assign to the next annotation
    status: folder prefix written in front of the file name
    Returns (tset, aid), where aid is the next unused annotation id
    '''
    # Filter once; the original re-filtered df twice for every annotation.
    rows = df[df.id == fid]
    h = rows.height.iloc[0]
    w = rows.width.iloc[0]

    # COCO image ids are 1-based and follow insertion order.
    idx = len(tset["images"]) + 1
    tset['images'].append({"height": int(h),
                           "width": int(w),
                           "id": int(idx),
                           # Path() makes plain-string paths acceptable too.
                           "file_name": f'{status}/{Path(fpath).name}'})

    # Check for a cleaned mask.
    ipath = CLEAN_M + fid + '.png'
    if exists(ipath):
        corr = plt.imread(ipath)
        # Extract the red channel.
        # NOTE(review): matplotlib reads PNGs as floats in [0, 1], so this cast
        # presumably keeps only fully saturated pixels - confirm that is intended.
        r = corr[:, :, 0].astype(np.uint8)
    else:
        r = None

    # Add each object as its own annotation.
    for cell_name, annotation in zip(rows.cell_type, rows.annotation):
        cat = CATEGORIES[cell_name]
        m = create_single_mask(annotation, [h, w], r)
        # Encode as RLE (pycocotools expects Fortran order) to get bbox and area.
        me = encode(np.asfortranarray(m))
        bbox = toBbox(me).astype(np.int32).tolist()
        a = area(me)
        # Polygon outline(s) of the mask.
        poly = create_segmentation(m)
        # Objects without any polygon are dropped and do not consume an id.
        if len(poly) > 0:
            tset["annotations"].append({"iscrowd": 0,
                                        "image_id": int(idx),
                                        "bbox": bbox,
                                        "segmentation": poly,
                                        "category_id": int(cat),
                                        "id": int(aid),
                                        "area": int(a)})
            aid += 1
    return tset, aid

In [105]:
def create_coco(files, idx, status='train'):
    '''
    Build a COCO-style annotation dict for a list of images.

    files: image paths, one per image
    idx: image ids, aligned with files (idx[i] belongs to files[i])
    status: folder prefix for each stored file name; defaults to 'train' so
            that two-argument calls keep working
    Returns a dict with "images", "categories" and "annotations" lists
    '''
    # Define the overall structure. Categories come from CATEGORIES so they
    # cannot drift from the ids add_image writes into the annotations.
    train_set = {
        "images": [],
        "categories": [
            {"supercategory": "cells", "id": cat_id, "name": name}
            for name, cat_id in CATEGORIES.items()
        ],
        "annotations": [],
    }

    anno_id = 1  # annotation ids start at 1 and run across all images
    for i in tqdm(range(len(files))):
        train_set, anno_id = add_image(df, idx[i], files[i], train_set, anno_id, status)
    return train_set

In [38]:
K_FOLDS = 5

# Stratify on cell type and write one train and one test COCO file per fold.
kf = StratifiedKFold(n_splits=K_FOLDS, shuffle=True, random_state=777)
for fold, (train_index, test_index) in enumerate(kf.split(FILE_NAMES, cell_type)):
    for split_name, indices in (("train", train_index), ("test", test_index)):
        # FILE_NAMES holds plain strings; the COCO builder wants Path objects.
        split_files = [Path(FILE_NAMES[i]) for i in indices]
        split_fids = [fids[i] for i in indices]
        # Both splits are drawn from the train folder, hence status 'train'
        # (create_coco was previously called here without this argument).
        split_set = create_coco(split_files, split_fids, 'train')
        with open('{}_fold_{}.json'.format(split_name, fold), 'w') as f:
            json.dump(split_set, f, indent=4)

        # Free the annotation dict before building the next one.
        del split_set
        gc.collect()

100%|██████████| 484/484 [13:47<00:00,  1.71s/it]
100%|██████████| 122/122 [03:49<00:00,  1.88s/it]
100%|██████████| 485/485 [14:24<00:00,  1.78s/it]
100%|██████████| 121/121 [03:24<00:00,  1.69s/it]
100%|██████████| 485/485 [14:11<00:00,  1.76s/it]
100%|██████████| 121/121 [03:43<00:00,  1.85s/it]
100%|██████████| 485/485 [14:21<00:00,  1.78s/it]
100%|██████████| 121/121 [03:30<00:00,  1.74s/it]
100%|██████████| 485/485 [14:33<00:00,  1.80s/it]
100%|██████████| 121/121 [03:24<00:00,  1.69s/it]


In [112]:
from sklearn.model_selection import train_test_split

In [113]:
# Recursively collect the PNG files, sorted for a stable order. Both folders
# now use the same '*.png' pattern ('*png' would also match names that merely
# end in "png" without a dot).
train_files = sorted(Path(TRAIN_PATH).rglob('*.png'))
test_files = sorted(Path(TEST_PATH).rglob('*.png'))
print(f'Number of pictures in train dir: {len(train_files)} pcs')
print()
print(f'Number of pictures in test dir: {len(test_files)} pcs')

Number of pictures in train dir: 606 pcs

Number of pictures in test dir: 3 pcs


In [114]:
# Split the dataset into train and val sets. The seed is fixed (same value as
# the K-fold cell) so a restart reproduces the same split and the same JSON files.
train_pics, val_pics = train_test_split(train_files, test_size=0.1, shuffle=True, random_state=777)

In [115]:
# Image id = file name without its extension.
train_pic_id = [pic.stem for pic in train_pics]
val_pic_id = [pic.stem for pic in val_pics]

# Report the size of the train and val sets, separated by a blank line.
print(
    f'Number of pictures in train set: {len(train_pics)}',
    '',
    f'Number of pictures in val set: {len(val_pics)}',
    sep='\n',
)

Number of pictures in train set: 545

Number of pictures in val set: 61


In [116]:
# Build the COCO dicts for both splits. The val images also live in the train
# folder, so both use the 'train' file-name prefix.
train_set, val_set = (
    create_coco(pics, pic_ids, 'train')
    for pics, pic_ids in ((train_pics, train_pic_id), (val_pics, val_pic_id))
)

100%|██████████| 545/545 [15:57<00:00,  1.76s/it]
100%|██████████| 61/61 [01:52<00:00,  1.84s/it]


In [120]:
# Annotate every training image in one COCO dict (no hold-out split).
all_pic_id = [pic.stem for pic in train_files]
all_set = create_coco(train_files, all_pic_id, 'train')

100%|██████████| 606/606 [17:12<00:00,  1.70s/it]


In [119]:
# Write the train and val annotation dicts as indented JSON files.
for out_name, coco_dict in (
    ('annotations_train_poly.json', train_set),
    ('annotations_val_poly.json', val_set),
):
    with open(out_name, 'w') as f:
        json.dump(coco_dict, f, indent=4)

# with open('annotations_all_poly.json', 'w') as f:
#         json.dump(all_set, f, indent=4)