In [1]:
import os
import imageio
import numpy as np
import pandas as pd
import geopandas as gp
from sklearn.model_selection import StratifiedShuffleSplit

In [2]:
# Root directory holding one sub-directory per dataset (passed to
# get_data_pathes / stratify in later cells).
DATA = '../data'

# Folder names expected inside each dataset directory; they mirror the default
# arguments of get_data_pathes.
IMAGES_PATH = 'images'
MASKS_PATH = 'masks'
INSTANCES_PATH = 'instance_masks'

# Image dimensions.
WIDTH, HEIGHT = 224, 224
# Misspelled original name, kept as an alias so existing references still work.
WIDHT = WIDTH
CHANNELS = 3

In [72]:
def get_data_pathes(
    datasets_path, images_path_name='images',
    masks_path_name='masks', instances_path_name='instance_masks'):
    """Build the image / mask / instance directory paths of every dataset.

    Every immediate sub-directory of ``datasets_path`` is treated as one
    dataset. The returned paths are only joined together; they are not
    checked for existence.

    Args:
        datasets_path: Directory whose immediate sub-directories are datasets.
        images_path_name: Name of the images folder inside each dataset.
        masks_path_name: Name of the masks folder inside each dataset.
        instances_path_name: Name of the instances folder inside each dataset.

    Returns:
        A list of ``(images_path, masks_path, instances_path)`` tuples, one
        per dataset, in the order ``os.walk`` reports the sub-directories
        (filesystem dependent, not sorted).

    Raises:
        FileNotFoundError: If ``datasets_path`` cannot be listed as a
            directory.
    """
    # Only the first entry produced by os.walk is needed. Taking it lazily
    # avoids traversing (and materialising) the whole dataset tree.
    top_level = next(os.walk(datasets_path), None)
    if top_level is None:
        # os.walk silently yields nothing for a missing / unreadable path.
        raise FileNotFoundError(
            'datasets directory not found: {!r}'.format(datasets_path))

    _, datasets, _ = top_level
    return [
        (
            os.path.join(datasets_path, dataset, images_path_name),
            os.path.join(datasets_path, dataset, masks_path_name),
            os.path.join(datasets_path, dataset, instances_path_name),
        )
        for dataset in datasets
    ]


def get_instances(instances_path):
    """List the names of the immediate sub-directories of ``instances_path``.

    Args:
        instances_path: Directory that holds one sub-directory per instance.

    Returns:
        A list of sub-directory names in the order ``os.walk`` reports them
        (filesystem dependent, not sorted).

    Raises:
        FileNotFoundError: If ``instances_path`` cannot be listed as a
            directory.
    """
    # Take only the first os.walk entry instead of walking the whole tree.
    top_level = next(os.walk(instances_path), None)
    if top_level is None:
        # os.walk silently yields nothing for a missing / unreadable path.
        raise FileNotFoundError(
            'instances directory not found: {!r}'.format(instances_path))
    return top_level[1]


def image2mask(image_path, image_type):
    """Read the file ``<image_path>.<image_type>`` with ``imageio.imread``.

    Despite its name, get_data uses this helper for images and masks alike.

    Args:
        image_path: File path without its extension.
        image_type: File extension without the leading dot.

    Returns:
        Whatever ``imageio.imread`` returns for that file.
    """
    file_name = '{path}.{ext}'.format(path=image_path, ext=image_type)
    return imageio.imread(file_name)


def get_data(images_path, masks_path, instances, image_type='jpeg', mask_type='png'):
    """Load the images and masks of the given instances into two arrays.

    Args:
        images_path: Directory containing ``<instance>.<image_type>`` files.
        masks_path: Directory containing ``<instance>.<mask_type>`` files.
        instances: Instance names (file names without extension).
        image_type: Extension of the image files.
        mask_type: Extension of the mask files.

    Returns:
        ``(X, y)``: ``X`` stacks the loaded images; ``y`` stacks the loaded
        masks and has one extra trailing axis of length 1.
    """
    # All images are read first, then all masks.
    images = []
    for name in instances:
        images.append(image2mask(os.path.join(images_path, name), image_type))
    X = np.array(images)

    masks = []
    for name in instances:
        masks.append(image2mask(os.path.join(masks_path, name), mask_type))
    y = np.expand_dims(np.array(masks), axis=-1)

    return X, y


def get_area(instance_path):
    """Return the median geometry area, divided by 100, of a vector file.

    Args:
        instance_path: Path handed to ``geopandas.read_file``.

    Returns:
        The median of ``area / 100`` over the file's ``geometry`` column.
    """
    # NOTE(review): the divisor 100 is not explained anywhere in view;
    # presumably a unit conversion - confirm.
    geometries = gp.read_file(instance_path)['geometry']
    scaled_areas = geometries.area / 100
    return scaled_areas.median()

    
def get_labels(distr):
    """Assign each value the index (0-3) of the quartile it falls into.

    Values strictly below the 25% quantile get 0, below the 50% quantile 1,
    below the 75% quantile 2, and everything else 3.

    Args:
        distr: NumPy array of values.

    Returns:
        An integer array with the same shape as ``distr``.
    """
    labels = np.full(distr.shape, 3)
    # Highest threshold first so that lower quartiles overwrite higher ones.
    for label, level in ((2, 0.75), (1, 0.5), (0, 0.25)):
        labels[distr < np.quantile(distr, level)] = label
    return labels


def stratify(datasets_path, test_size=0.2):
    """Create stratified train/test index splits from the first dataset.

    The instances of the first dataset returned by get_data_pathes are
    labelled by the quartile of their median geometry area (get_area +
    get_labels), and those labels drive a ``StratifiedShuffleSplit``.

    Args:
        datasets_path: Directory containing the dataset sub-directories.
        test_size: Share of samples assigned to the test part of each split.

    Returns:
        The generator produced by ``StratifiedShuffleSplit.split``; it yields
        ``(train_indices, test_indices)`` once per dataset found
        (``n_splits=len(datasets)``), with ``random_state=42``.

    Raises:
        ValueError: If no dataset directory is found under ``datasets_path``.
    """
    datasets = get_data_pathes(datasets_path)
    if not datasets:
        raise ValueError('no datasets found in {!r}'.format(datasets_path))

    # Only the instance geometries of the first dataset are needed here.
    _, _, instances_path = datasets[0]
    instances = get_instances(instances_path)
    areas = np.array([
        get_area(os.path.join(instances_path, name, name + '.geojson'))
        for name in instances])
    labels = get_labels(areas)

    splitter = StratifiedShuffleSplit(
        n_splits=len(datasets), test_size=test_size, random_state=42)

    # split() uses X only for the number of samples, so a placeholder avoids
    # reading every image and mask from disk just to build indices.
    placeholder = np.zeros(len(labels))
    return splitter.split(placeholder, labels)


def build_generator(datasets_path):
    """Yield the training images of every dataset under ``datasets_path``.

    The train indices come from the last split produced by
    ``stratify(datasets_path)`` and are applied to each dataset in turn.

    Args:
        datasets_path: Directory containing the dataset sub-directories.

    Yields:
        ``X[train_ix]`` for each dataset, where ``X`` is the image array
        returned by get_data.
    """
    # Compute the splits once, for the directory that was actually passed in
    # (not the notebook-level DATA constant). Only the last split is used.
    splits = list(stratify(datasets_path))
    train_ix, _test_ix = splits[-1]

    # NOTE(review): the indices are derived from the first dataset only; this
    # assumes every dataset has the same number of instances - confirm.
    for images_path, masks_path, instances_path in get_data_pathes(datasets_path):
        instances = get_instances(instances_path)
        X, _y = get_data(images_path, masks_path, instances)
        yield X[train_ix]
    

In [73]:
# Load the first dataset and split it with the stratified indices.
images_path, masks_path, instances_path = get_data_pathes(DATA)[0]
instances = get_instances(instances_path)
X, y = get_data(images_path, masks_path, instances)

# stratify() yields one split per dataset; only the last one is kept.
train_ix, test_ix = list(stratify(DATA))[-1]
X_train, y_train = X[train_ix], y[train_ix]
X_test, y_test = X[test_ix], y[test_ix]

# One shape per line: X_train, y_train, X_test, y_test.
print(*(part.shape for part in (X_train, y_train, X_test, y_test)), sep='\n')

(196, 224, 224, 3)
(196, 224, 224, 1)
(49, 224, 224, 3)
(49, 224, 224, 1)


In [71]:
def build_batch_generator(filenames, img_data_dir, shuffle, img_type, batch_size=32, seed=10):
    """Endlessly yield ``(images, masks)`` batches described by a DataFrame.

    For each row, the image is read from
    ``<img_data_dir>/images/<image_name>/<img_type>/<name>`` and the mask
    from the same location under ``masks``.

    Args:
        filenames: DataFrame with ``image_name`` and ``name`` columns.
        img_data_dir: Root directory holding the ``images`` and ``masks``
            folders.
        shuffle: If true, the rows are reshuffled at the start of every pass.
        img_type: Name of the sub-folder appended after ``image_name``.
        batch_size: Maximum number of rows per batch; the last batch of a
            pass may be smaller.
        seed: Seed of the random generator that drives the shuffling.

    Yields:
        ``(images, masks)`` as two ``float32`` arrays.

    Raises:
        ValueError: If ``filenames`` is empty (the endless loop would
            otherwise spin forever without yielding anything).
    """
    # NOTE(review): load_img / img_to_array are not imported in the visible
    # cells; presumably the Keras image helpers - confirm and import them in
    # the first cell.
    if len(filenames) == 0:
        raise ValueError('filenames is empty: there is nothing to generate')

    # One generator for the whole stream: every pass gets a different order,
    # yet the full sequence is reproducible for a given seed.
    rng = np.random.RandomState(seed)

    while True:
        if shuffle:
            filenames = filenames.sample(frac=1, random_state=rng)

        for start in range(0, len(filenames), batch_size):
            # iloc clips at the end of the frame, so the last batch is short.
            batch = filenames.iloc[start:start + batch_size]
            images = []
            masks = []

            for _, row in batch.iterrows():
                img_path = os.path.join(img_data_dir, "images", row['image_name'], '{}'.format(img_type))
                mask_path = os.path.join(img_data_dir, "masks", row['image_name'], '{}'.format(img_type))

                images.append(img_to_array(
                    load_img(os.path.join(img_path, row['name']), grayscale=False)))
                masks.append(img_to_array(
                    load_img(os.path.join(mask_path, row['name']), grayscale=False)))

            yield np.array(images, np.float32), np.array(masks, np.float32)

In [None]:
# NOTE(review): args, model, callbacks and ThreadsafeIter are not defined in
# any visible cell; this cell presumably comes from a training script -
# confirm they are created before it runs.
train_df = pd.read_csv(args.train_df)
val_df = pd.read_csv(args.val_df)

# One epoch is one full pass over the frame: ceil(n_rows / batch_size)
# batches, as an integer. The previous `len / batch_size + 1` was a float and
# requested an extra batch whenever the row count was a multiple of the
# batch size.
train_steps = int(np.ceil(len(train_df) / args.batch_size))
val_steps = int(np.ceil(len(val_df) / args.batch_size))

# The generators use the same batch size as the step computation above.
train_generator = build_batch_generator(
    filenames=train_df,
    img_data_dir=args.dataset_path,
    shuffle=True,
    img_type="png",
    batch_size=args.batch_size
)

val_generator = build_batch_generator(
    filenames=val_df,
    img_data_dir=args.dataset_path,
    shuffle=True,
    img_type="png",
    batch_size=args.batch_size
)
model.fit_generator(
    ThreadsafeIter(train_generator),
    steps_per_epoch=train_steps,
    epochs=args.epochs,
    validation_data=ThreadsafeIter(val_generator),
    validation_steps=val_steps,
    callbacks=callbacks,
    max_queue_size=50,
    workers=4)