In [1]:
DATA = '../data'
IMAGES_PATH = 'images'
MASKS_PATH = 'masks'
INSTANCES_PATH = 'instance_masks'
WIDHT , HEIGHT = 224, 224
CHANNELS = 3

In [108]:
import os
import re
import imageio
import argparse
import numpy as np
import pandas as pd
import geopandas as gp
from sklearn.model_selection import StratifiedShuffleSplit


def get_data_pathes(
    datasets_path, images_path_name='images',
    masks_path_name='masks', instances_path_name='instance_masks'
    ):

    datasets = list(os.walk(datasets_path))[0][1]
    data_pathes = []
    for dataset in datasets:
        data_pathes.append((
            os.path.join(datasets_path, dataset, images_path_name),
            os.path.join(datasets_path, dataset, masks_path_name),
            os.path.join(datasets_path, dataset, instances_path_name)))
    
    return data_pathes


def get_instances(instances_path):
    return list(os.walk(instances_path))[0][1]


def image2mask(image_path, image_type):
    return imageio.imread('{}.{}'.format(image_path, image_type))


def get_data(
    images_path, masks_path, instances,
    img_type='tiff', msk_type='png'
    ):

    X = np.array([
        image2mask(os.path.join(images_path, i), img_type) for i in instances])
    y = np.array([
        image2mask(os.path.join(masks_path, i), msk_type)for i in instances])
    y = y.reshape([*y.shape, 1])
    
    return X, y


def get_area(instance_path):
    return (gp.read_file(instance_path)['geometry'].area / 100).median()

    
def get_labels(distr):
    res = np.full(distr.shape, 3)
    res[distr < np.quantile(distr, 0.75)] = 2
    res[distr < np.quantile(distr, 0.5)] = 1
    res[distr < np.quantile(distr, 0.25)] = 0
    return res


def stratify(datasets_path, test_size, random_state):
    images_path, masks_path, instances_path = get_data_pathes(datasets_path)[0]
    instances = list(os.walk(instances_path))[0][1]

    X, _ = get_data(images_path, masks_path, instances)
    areas = np.array([
        get_area(os.path.join(instances_path, i, i + '.geojson')) for i in instances])
    labels = get_labels(areas)

    sss = StratifiedShuffleSplit(
        n_splits=1, test_size=test_size, random_state=random_state)
    
    return sss.split(X, labels)

In [213]:
def get_data_info(dataset_path):
    cols = [
        'name', 'channel', 'position',
        'image_path', 'mask_path', 'instance_path',
        'image_type', 'mask_type'
    ]
    data_info = pd.DataFrame(columns=cols)
    dataset = get_data_pathes(dataset_path)
    for subset in dataset:
        images_path, masks_path, instances_path = subset
        instances = list(os.walk(instances_path))[0][1]
        image_type = list(os.walk(images_path))[0][2][0].split('.')[-1]
        mask_type = list(os.walk(masks_path))[0][2][0].split('.')[-1]

        for i, instance in enumerate(instances):
            instance = instance.split('_')
            name = '_'.join(instance[:2])
            channel = '_'.join(instance[2:-2])
            position = '_'.join(instance[-2:])

            data_info = data_info.append(
                pd.DataFrame({
                    'name': name,
                    'channel': channel,
                    'position': position,
                    'image_path': images_path,
                    'mask_path': masks_path,
                    'instance_path': instances_path,
                    'image_type': image_type,
                    'mask_type': mask_type
                }, index=[0]),
                sort=True, ignore_index=True)
    
    return data_info


def filter_by_channel(data_info, channel_name):
    return data_info[data_info['channel'] == channel_name]


def stratified_split(dataset_path, test_size=0.2, random_state=42):      
    stratified_ix = stratify(dataset_path, test_size, random_state)
    data_info = get_data_info(dataset_path)
    filtered = filter_by_channel(
        data_info,
        data_info['channel'].values[0])
    train_df = pd.DataFrame(columns=data_info.columns)
    test_df = pd.DataFrame(columns=data_info.columns)
    
    for i, (train_ix, test_ix) in enumerate(stratified_ix):
        for channel in data_info['channel'].unique():
            row = data_info.iloc[train_ix]
            row = row.replace(
                row['channel'].values[0],
                channel)
            train_df = train_df.append(
                row, sort=False,
                ignore_index=True)
            
            row = data_info.iloc[test_ix]
            row = row.replace(
                row['channel'].values[0],
                channel)
            test_df = test_df.append(
                row, sort=False,
                ignore_index=True)
    
    return train_df, test_df  
    

In [214]:
train_df, test_df = stratified_split(DATA)

In [215]:
train_df.to_csv('../data/train_df.csv')
test_df.to_csv('../data/test_df.csv')

In [211]:
def build_batch_generator(
    files_df, batch_size=4,
    channels=['rgb', 'ndvi', 'ndvi_color', 'b2']
    ):
    
    if len(channels) == 0:
        raise Exception('You have to set at least 1 channel.')
        
    filtered = filter_by_channel(
        files_df,
        files_df['channel'].values[0])
    while True:
        for start in range(0, filtered.shape[0], batch_size):
            images = []
            masks = []
            end = min(start + batch_size, filtered.shape[0])
            train_batch = filtered.iloc[start:end]

            for _, file in train_batch.iterrows():
                res_image = []
                res_mask = []
                for channel in channels:
                    row = files_df[
                        (files_df['name']==file['name'])
                        & (files_df['channel']==channel)
                        & (files_df['position']==file['position'])
                    ]
                    filename = '_'.join([
                        row['name'].values[0],
                        channel,
                        row['position'].values[0]])
                    image_path = os.path.join(
                        row['image_path'].values[0],
                        '{}.{}'.format(filename, row['image_type'].values[0]))
                    mask_path = os.path.join(
                        row['mask_path'].values[0],
                        '{}.{}'.format(filename, row['mask_type'].values[0]))
                    print(channel, row['image_path'].values[0], sep='\n')
                    return
                    img = imageio.imread(image_path)
                    mask = imageio.imread(mask_path)

                    res_image.append(img)
                    res_mask.append(mask)
                print(res_image)
                return
            
            masks = np.array(masks, np.float32)
            masks = masks.reshape(*masks.shape, 1)
            
            images = np.array(images, np.float32)
            if images.ndim == 3:
                images = images.reshape(*images.shape, 1)

            yield images, masks

In [212]:
gen = build_batch_generator(train_df, 4)
j = 0
a = []
for i in gen:
    a.append(i)
    print(i[0].shape)
    print(i[1].shape)
    break

rgb
../data/20160103_66979721-be1b-4451-84e0-4a573236defd_ndvi/images


In [8]:
import matplotlib.pyplot as plt
plt.imshow(a[1][0][0,:,:,2])
plt.show()
plt.imshow(a[1][1][0,:,:,0])

IndexError: too many indices for array

In [78]:
train_generator = build_batch_generator(train_df, 4)
val_generator = build_batch_generator(test_df, 4)

In [79]:
import threading


def freeze_model(model, freeze_before_layer):
    if freeze_before_layer == "ALL":
        for l in model.layers:
            l.trainable = False
    else:
        freeze_before_layer_index = -1
        for i, l in enumerate(model.layers):
            if l.name == freeze_before_layer:
                freeze_before_layer_index = i
        for l in model.layers[:freeze_before_layer_index]:
            l.trainable = False

class ThreadsafeIter(object):
    def __init__(self, it):
        self.lock = threading.Lock()
        self.it = it.__iter__()

    def __iter__(self): return self

    def __next__(self):
        with self.lock:
            return next(self.it)

In [80]:
from segmentation_models import Unet
from segmentation_models.backbones import get_preprocessing
from segmentation_models.losses import bce_dice_loss
from segmentation_models.metrics import dice_score
import math 

BACKBONE = 'resnet50'
EPOCHS = 10
BATCH = 4
preprocess_input = get_preprocessing(BACKBONE)

model = Unet(BACKBONE, encoder_weights='imagenet')
model.compile('Adam', loss=bce_dice_loss, metrics=[dice_score])

In [81]:
model.fit_generator(
    ThreadsafeIter(train_generator),
    steps_per_epoch=math.ceil(train_df.shape[0] / BATCH),
    epochs = 10,
    validation_data=ThreadsafeIter(val_generator),
    validation_steps=math.ceil(test_df.shape[0] / BATCH),
    max_queue_size=50,
    workers=4
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fed25cb35c0>