Taken from https://www.kaggle.com/slawekbiel/positive-score-with-detectron-1-3-input-data/notebook - please upvote!

In [1]:
!pip install pycocotools

Collecting pycocotools
  Downloading pycocotools-2.0.4.tar.gz (106 kB)
[K     |████████████████████████████████| 106 kB 253 kB/s 
[?25h  Installing build dependencies ... [?25l- \ | / - \ | / - \ | / done
[?25h  Getting requirements to build wheel ... [?25l- \ | / done
[?25h    Preparing wheel metadata ... [?25l- \ | / done
Building wheels for collected packages: pycocotools
  Building wheel for pycocotools (PEP 517) ... [?25l- \ | / - \ | / - \ | done
[?25h  Created wheel for pycocotools: filename=pycocotools-2.0.4-cp37-cp37m-linux_x86_64.whl size=273639 sha256=70abeea03bde327695292367e7e0e41230e58f296bfc63c7eea06918de7faf8d
  Stored in directory: /root/.cache/pip/wheels/a3/5f/fa/f011e578cc76e1fc5be8dce30b3eb9fd00f337e744b3bba59b
Successfully built pycocotools
Installing collected packages: pycocotools
Successfully installed pycocotools-2.0.4


In [2]:
# from pycocotools.coco import COCO
import skimage.io as io
import matplotlib.pyplot as plt
from pathlib import Path
from PIL import Image

import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import json,itertools
from sklearn.model_selection import GroupKFold

In [3]:
# config
class CFG:
    """Notebook-wide settings shared by the cells below."""

    # Folder holding the competition input files. The trailing slash matters:
    # file names are appended to it by plain string concatenation.
    data_path: str = '../input/sartorius-cell-instance-segmentation/'
    # Number of folds used for the grouped cross-validation split.
    nfolds: int = 5

# Functions

In [4]:
# From https://www.kaggle.com/stainsby/fast-tested-rle
def rle_decode(mask_rle, shape):
    '''
    Decode a run-length-encoded mask string into a binary image.

    mask_rle: whitespace-separated integers read as alternating
              (start length) pairs; start is a 1-based position in the
              flattened image
    shape: (height, width) of the array to return
    Returns a uint8 numpy array of that shape, 1 - mask, 0 - background
    '''
    tokens = mask_rle.split()
    # Even positions carry the run starts, odd positions the run lengths.
    begins = np.asarray(tokens[0::2], dtype=int) - 1  # shift to 0-based
    run_lengths = np.asarray(tokens[1::2], dtype=int)
    stops = begins + run_lengths

    flat = np.zeros(shape[0] * shape[1], dtype=np.uint8)
    for begin, stop in zip(begins, stops):
        flat[begin:stop] = 1
    # reshape() uses NumPy's default row-major order, so the runs are read as
    # positions in a row-by-row flattening of the image.
    return flat.reshape(shape)

# From https://newbedev.com/encode-numpy-array-using-uncompressed-rle-for-coco-dataset
def binary_mask_to_rle(binary_mask):
    """Encode a mask as an uncompressed run-length dictionary.

    The mask is flattened in column-major (Fortran) order and split into runs
    of equal consecutive values. The returned ``counts`` list holds the run
    lengths; when the first pixel equals 1 a leading ``0`` is inserted so the
    list always starts with the length of a (possibly empty) background run.

    Parameters
    ----------
    binary_mask : numpy.ndarray
        Array providing ``ravel(order='F')`` and ``shape``.

    Returns
    -------
    dict
        ``{'counts': [int, ...], 'size': list(binary_mask.shape)}`` with plain
        Python integers, so the result can be passed to ``json.dump``.
    """
    size = list(binary_mask.shape)
    flat = binary_mask.ravel(order='F')
    if flat.size == 0:
        # No pixels means no runs at all.
        return {'counts': [], 'size': size}

    # A new run starts at every position whose value differs from the previous
    # one; finding those positions in one vectorised pass avoids iterating over
    # every pixel in Python.
    run_starts = np.flatnonzero(flat[1:] != flat[:-1]) + 1
    edges = np.concatenate(([0], run_starts, [flat.size]))
    counts = np.diff(edges).tolist()

    if flat[0] == 1:
        # The first run is foreground: record an empty background run first.
        counts.insert(0, 0)
    return {'counts': counts, 'size': size}

In [5]:
def coco_structure(train_df):
    """Build a COCO-style dataset dictionary from per-instance annotation rows.

    Parameters
    ----------
    train_df : pandas.DataFrame
        One row per annotated instance. The columns read here are ``id``
        (image identifier), ``width``, ``height``, ``cell_type`` and
        ``annotation`` (a string accepted by ``rle_decode``).

    Returns
    -------
    dict
        ``{'categories': [...], 'images': [...], 'annotations': [...]}``.
        Each annotation holds the ``binary_mask_to_rle`` encoding of its mask,
        an ``[x, y, width, height]`` bounding box, the mask's pixel count as
        ``area``, the image id, the category id, ``iscrowd`` set to 0 and the
        row's index label as ``id``.

    Raises
    ------
    ValueError
        If an annotation decodes to a mask without any foreground pixel, since
        no bounding box can be computed for it.
    """
    # Number the categories by sorted name rather than by order of appearance,
    # so that different subsets of the same table (e.g. the train and the
    # validation rows of a fold) give every cell type the same id.
    cat_ids = {
        name: number
        for number, name in enumerate(sorted(train_df.cell_type.unique()), start=1)
    }
    cats = [{'name': name, 'id': number} for name, number in cat_ids.items()]

    # One image entry per distinct image id, described by its first row.
    images = [
        {
            'id': image_id,
            'width': int(row.width),
            'height': int(row.height),
            'file_name': f'train/{image_id}.png',
        }
        for image_id, row in train_df.groupby('id').agg('first').iterrows()
    ]

    annotations = []
    # iterrows() is a generator without a length; pass the row count so the
    # progress bar can show completion instead of a bare counter.
    for idx, row in tqdm(train_df.iterrows(), total=len(train_df)):
        mk = rle_decode(row.annotation, (row.height, row.width))
        ys, xs = np.nonzero(mk)
        x1, x2 = xs.min(), xs.max()
        y1, y2 = ys.min(), ys.max()
        annotations.append({
            'segmentation': binary_mask_to_rle(mk),
            # +1 because both extreme pixels belong to the box.
            'bbox': [int(x1), int(y1), int(x2 - x1 + 1), int(y2 - y1 + 1)],
            'area': int(mk.sum()),
            'image_id': row.id,
            'category_id': cat_ids[row.cell_type],
            'iscrowd': 0,
            'id': idx,
        })
    return {'categories': cats, 'images': images, 'annotations': annotations}

# Data

Split into folds, create annotations

In [6]:
# Load the per-instance annotation table.
train_df = pd.read_csv(CFG.data_path + 'train.csv')

# Grouped K-fold: rows sharing an image id are never split across folds.
gkf = GroupKFold(n_splits = CFG.nfolds)

train_df["fold"] = -1
y = train_df.width.values

# split() yields positional row indices, so they are written through .iloc;
# label-based .loc would only be correct while the index is the default 0..n-1.
fold_col = train_df.columns.get_loc("fold")
for fold_number, (_, valid_pos) in enumerate(gkf.split(X=train_df, y=y, groups=train_df.id.values)):
    train_df.iloc[valid_pos, fold_col] = fold_number

# Every row must have left the -1 placeholder behind.
assert (train_df["fold"] >= 0).all(), "some rows were not assigned to a fold"

# Independent copy of the assignment used to select rows per fold; the "fold"
# column itself stays in train_df because the gt_fold.csv export reads it.
fold_id = train_df.fold.copy()

In [7]:
# Persist each annotation together with its image id and fold number.
train_df.to_csv('gt_fold.csv', columns=['id', 'fold', 'annotation'], index=False)

In [8]:
all_ids = train_df.id.unique()
# Only fold 4 is exported here; iterate over range(CFG.nfolds) for all folds.
for fold in range(4, 5):
    # Training annotations: every row that is not in the held-out fold.
    train_sample = train_df.loc[fold_id != fold]
    root = coco_structure(train_sample)

    with open(f'annotations_train_f{fold}.json', 'w', encoding='utf-8') as json_file:
        json.dump(root, json_file, ensure_ascii=True, indent=4)

    # Rows of the held-out fold.
    valid_sample = train_df.loc[fold_id == fold]

    print(f'fold {fold}: produced')

0it [00:00, ?it/s]

fold 4: produced


In [9]:
# Validation counterpart of the per-fold training export: write the COCO
# annotations of the held-out fold only.
for fold in range(4, 5):
    # Keep the held-out rows under their own name so the training subset
    # stored in `train_sample` is not overwritten with validation data.
    valid_sample = train_df.loc[fold_id == fold]
    root = coco_structure(valid_sample)

    with open('annotations_valid_f' + str(fold) + '.json', 'w', encoding='utf-8') as f:
        json.dump(root, f, ensure_ascii=True, indent=4)

    print('fold ' + str(fold) + ': produced')

0it [00:00, ?it/s]

fold 4: produced


In [10]:
# COCO annotations for the complete table, without any fold held out.
root = coco_structure(train_df)

with open('annotations_train.json', mode='w', encoding='utf-8') as json_file:
    json.dump(root, json_file, indent=4, ensure_ascii=True)
        


0it [00:00, ?it/s]