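**About**: This notebook prepares the training data: it converts the RLE annotations in `train.csv` into MMDetection-style metadata (bounding boxes, labels and COCO RLE masks) and saves the result as a CSV file.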


In [None]:
# %load_ext nb_black
%load_ext autoreload
%autoreload 2

In [None]:
# !pip install pycocotools
# !pip install mmdet
# !pip install mmcv-full==1.3.8
# NOT NECESSARY: pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/{cu_version}/{torch_version}/index.html


## Initialization

### Imports

In [None]:
import os
import sys
import cv2
import torch
import warnings
import pycocotools
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from collections import Counter
from tqdm.notebook import tqdm

sys.path.append("../src/")

In [None]:
from params import *
from utils.rle import *
from utils.plots import *
# from data.masks import *

In [None]:
import mmdet
mmdet.__version__

In [None]:
import mmcv
mmcv.__version__

## Data

In [None]:
# Load the raw training annotations from train.csv.
df = pd.read_csv(DATA_PATH + "train.csv")

In [None]:
# One row per image id: every other column becomes the list of its per-row values.
# NOTE: not idempotent - re-running this cell on the already-grouped frame wraps
# each value in another list, so reload `df` first.
df = df.groupby('id').agg(list).reset_index()

In [None]:
def _collapse_unique(values):
    """Return the single distinct value of `values`, or the array of distinct values if several."""
    distinct = np.unique(values)
    if len(distinct) == 1:
        return distinct[0]
    return distinct


# Every column from the third one onwards holds per-image lists after the
# groupby; reduce each list to its distinct value(s).
for col in df.columns[2:]:
    df[col] = df[col].apply(_collapse_unique)

In [None]:
# Preview the first rows of the per-image table.
df.head()

In [None]:
# Number of images per cell type.
sns.countplot(data=df, x='cell_type')
plt.show()

In [None]:
def rle_decode(mask_rle, shape):
    """
    Decode a run-length encoded string into a binary mask.

    Args:
        mask_rle (str): Space-separated "start length" pairs; starts are
            1-based positions in the flattened mask.
        shape (tuple): (height, width) of the array to return.

    Returns:
        np.ndarray: uint8 array of the given shape, 1 inside the runs and 0 elsewhere.
    """
    tokens = mask_rle.split()
    run_starts = np.asarray(tokens[0::2], dtype=int) - 1  # 1-based -> 0-based
    run_lengths = np.asarray(tokens[1::2], dtype=int)
    run_ends = run_starts + run_lengths

    flat_mask = np.zeros(shape[0] * shape[1], dtype=np.uint8)
    for begin, end in zip(run_starts, run_ends):
        flat_mask[begin:end] = 1

    # Runs index the row-major flattened mask, so a plain reshape restores the 2D layout.
    return flat_mask.reshape(shape)


def get_img_and_mask(img_path, annotation, width, height, mask_only=False):
    """
    Decode every RLE annotation of an image into a stack of instance masks.

    Despite its name, the function returns only the masks; `img_path` and
    `mask_only` are accepted but not used.

    Args:
        img_path (str): Unused.
        annotation (sequence of str): RLE strings, one per instance.
        width (int): Mask width.
        height (int): Mask height.
        mask_only (bool, optional): Unused. Defaults to False.

    Returns:
        np.ndarray: uint16 array of shape (len(annotation), height, width).
    """
    stacked = np.zeros((len(annotation), height, width), dtype=np.uint16)

    # Fill each pre-allocated plane with the decoded mask of the matching annotation.
    for plane, rle in zip(stacked, annotation):
        plane[...] = rle_decode(rle, (height, width))

    return stacked

In [None]:
# Visual sanity check of the decoded masks for one fixed sample.
# (Use `np.random.choice(len(df))` instead to inspect a random image.)
idx = 313

img_file = TRAIN_IMG_PATH + df['id'][idx] + ".png"
img = cv2.imread(img_file)
if img is None:
    # cv2.imread reports a missing/unreadable file by returning None, not by raising.
    raise FileNotFoundError(f"Could not read image: {img_file}")

rles = df['annotation'][idx]

# Mask size is taken from the image itself (height, width).
mask = rles_to_mask_fix(rles, img.shape[:2], single_channel=False, fix=True)

plt.figure(figsize=(15, 10))
plot_sample(img, mask, width=1)
plt.axis(False)
plt.title(f"{df['id'][idx]} - {df['cell_type'][idx]}")
plt.show()

## Generation

### Function

In [None]:
def prepare_mmdet_data(df, idx, fix=True):
    """
    Build the MMDetection-style annotation record for one image.

    Args:
        df (pd.DataFrame): Per-image table with "id", "annotation", "height",
            "width" and "cell_type" columns.
        idx (int): Row to process.
        fix (bool, optional): Forwarded to `rles_to_mask_fix`. Defaults to True.

    Returns:
        tuple: `(masks, meta)` where `masks` is the output of `rles_to_mask_fix`
            and `meta` is a dict holding the file name, image size, cell type and
            an "ann" dict with integer [x1, y1, x2, y2] boxes, one label per box
            (index of the cell type in `CELL_TYPES`) and the COCO RLE masks.
    """
    # Import the submodule explicitly: a bare `import pycocotools` does not
    # guarantee that `pycocotools.mask` has been loaded.
    import pycocotools.mask as coco_mask

    height, width = df[["height", "width"]].values[idx]
    cell_type = df['cell_type'][idx]

    rles = df['annotation'][idx]
    masks = rles_to_mask_fix(rles, (height, width), single_channel=False, fix=fix)

    # Re-encode each instance as COCO RLE. The encoder is declared for
    # Fortran-ordered uint8 input, so cast the boolean mask explicitly.
    rles = [
        coco_mask.encode(np.asfortranarray((m > 0).astype(np.uint8)))
        for m in masks
    ]

    # toBbox returns [x, y, w, h]; the reshape keeps a (0, 4) array when the
    # image has no instance, so the column arithmetic below stays valid.
    bboxes = np.array(
        [coco_mask.toBbox(rle) for rle in rles], dtype=float
    ).reshape(-1, 4)

    # [x, y, w, h] -> [x1, y1, x2, y2]
    bboxes[:, 2] += bboxes[:, 0]
    bboxes[:, 3] += bboxes[:, 1]

    meta = {
        'filename': df['id'][idx] + ".png",
        'width': int(width),
        'height': int(height),
        'cell_type': cell_type,
        'ann': {
            'bboxes': bboxes.astype(int).tolist(),
            'labels': [CELL_TYPES.index(cell_type)] * len(bboxes),
            'masks': rles
        }
    }

    return masks, meta

### Test

In [None]:
# Smoke-test `prepare_mmdet_data` on one fixed sample. `img`, `masks`, `meta`
# and `idx` are reused by the plotting cell that follows.
idx = 313

img_file = TRAIN_IMG_PATH + df['id'][idx] + ".png"
img = cv2.imread(img_file)
if img is None:
    # cv2.imread reports a missing/unreadable file by returning None, not by raising.
    raise FileNotFoundError(f"Could not read image: {img_file}")

masks, meta = prepare_mmdet_data(df, idx)

metas = [meta]

In [None]:
# Show the test sample with its instance masks merged by a max over the first
# axis, together with the generated bounding boxes.
plt.figure(figsize=(15, 10))

merged_mask = masks.max(0)
boxes = meta['ann']['bboxes']
plot_sample(img, merged_mask, boxes, width=1)

plt.axis(False)
sample_title = f"{df['id'][idx]} - {df['cell_type'][idx]}"
plt.title(sample_title)
plt.show()

### Run (no multiprocessing)


In [None]:
# Full single-process run: build the metadata of every image and save it.
FIX = True

df = pd.read_csv(DATA_PATH + "train.csv")
df = df.groupby('id').agg(list).reset_index()

# Reduce the per-image lists (third column onwards) to their distinct value(s).
for col in df.columns[2:]:
    df[col] = df[col].apply(lambda x: np.unique(x)[0] if len(np.unique(x)) == 1 else np.unique(x))

# Only the metadata is kept. Images are not read here: `prepare_mmdet_data`
# takes the image size from `df`, so loading every PNG would be wasted I/O.
metas = [
    prepare_mmdet_data(df, idx=idx, fix=FIX)[1]
    for idx in tqdm(range(len(df)))
]

meta_df = pd.DataFrame.from_dict(metas)
if len(meta_df) == len(df):
    out_name = "mmdet_data.csv" if FIX else "mmdet_data_nofix.csv"
    meta_df.to_csv(OUT_PATH + out_name, index=False)
else:
    # Make an incomplete run visible instead of silently skipping the save.
    warnings.warn(
        f"Expected {len(df)} metadata rows, got {len(meta_df)}; nothing was saved."
    )

### Run (multiprocessing)

In [None]:
from multiprocessing import Pool

FIX = False

df = pd.read_csv(DATA_PATH + "train.csv")
df = df.groupby('id').agg(list).reset_index()

# Reduce the per-image lists (third column onwards) to their distinct value(s).
for col in df.columns[2:]:
    df[col] = df[col].apply(lambda x: np.unique(x)[0] if len(np.unique(x)) == 1 else np.unique(x))


def prepare_mmdet_data_(i):
    """
    Pool worker: prepare the mmdet data of row `i`.

    Reads `df` and `FIX` from the notebook globals so that only the row index
    has to be sent to the worker process.

    Args:
        i (int): Row of `df` to process.

    Returns:
        tuple: `(masks, meta)` as returned by `prepare_mmdet_data`.
    """
    return prepare_mmdet_data(df, idx=i, fix=FIX)


# Create the pool last. With the `fork` start method the workers get a snapshot
# of the notebook namespace taken when the pool is created, so `df`, `FIX` and
# `prepare_mmdet_data_` must all exist (with their final values) at this point.
p = Pool(processes=4)


In [None]:

# Collect the metadata of every image from the worker pool (results arrive in
# row order with `imap`); the masks returned alongside are discarded.
try:
    metas = [
        meta
        for _, meta in tqdm(p.imap(prepare_mmdet_data_, range(len(df))), total=len(df))
    ]
finally:
    # Release the worker processes whether or not the run succeeded.
    # The pool cannot be reused afterwards: re-run the pool-creation cell
    # before executing this cell again.
    p.terminate()
    p.join()

meta_df = pd.DataFrame.from_dict(metas)
if len(meta_df) == len(df):
    out_name = "mmdet_data.csv" if FIX else "mmdet_data_nofix.csv"
    meta_df.to_csv(OUT_PATH + out_name, index=False)