In [33]:
import argparse
import collections
import os

import geopandas as gp
import imageio
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit
from torchvision import transforms

In [2]:
# Root directory that contains the dataset folders.
DATA_PATH = '../data'
# Names of the sub-folders joined below each dataset folder.
IMAGES_FOLDER = 'images'
MASKS_FOLDER = 'masks'
INSTANCES_FOLDER = 'instance_masks'
# File extensions (without the dot) appended by get_filepath.
IMAGE_TYPE = 'tiff'
MASK_TYPE = 'png'
INSTANCE_TYPE = 'geojson'
# Channel names; the first one is the default channel of get_data and stratify.
CHANNELS = ['rgb', 'ndvi', 'ndvi_color', 'b2']

In [65]:
def get_data_info(data_path=DATA_PATH):
    """Build an index of the instance folders of the first dataset.

    Every folder name found in the instances directory returned by
    ``get_data_pathes(data_path)`` is split on ``'_'``. Parts 0, 1, 3 and 4
    become the ``date``, ``name``, ``ix`` and ``iy`` columns; part 2 (the
    channel position in names built by ``get_fullname``) is not stored.

    Args:
        data_path: Root directory that contains the dataset folders.

    Returns:
        pandas.DataFrame: one row per instance folder, with a fresh integer
        index and the columns ``date``, ``ix``, ``iy``, ``name`` (the
        alphabetical order the former ``append(..., sort=True)`` produced).
        The values are the raw string parts of the folder names.
    """
    _, _, instances_path = get_data_pathes(data_path)

    # Collect plain records and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and re-copied the whole frame on every iteration.
    records = []
    for instance in get_folders(instances_path):
        name_parts = split_fullname(instance)
        records.append({
            'date': name_parts[0],
            'name': name_parts[1],
            'ix': name_parts[3],
            'iy': name_parts[4],
        })

    return pd.DataFrame(records, columns=['date', 'ix', 'iy', 'name'])


def get_data_pathes(
    data_path=DATA_PATH, images_folder=IMAGES_FOLDER,
    masks_folder=MASKS_FOLDER, instances_folder=INSTANCES_FOLDER
):
    """Return the image, mask and instance directories of the first dataset.

    The dataset is the first folder reported by ``get_folders(data_path)``.

    Args:
        data_path: Root directory that contains the dataset folders.
        images_folder: Name of the images sub-folder.
        masks_folder: Name of the masks sub-folder.
        instances_folder: Name of the instances sub-folder.

    Returns:
        tuple: ``(images_path, masks_path, instances_path)``.
    """
    first_dataset = get_folders(data_path)[0]
    dataset_root = os.path.join(data_path, first_dataset)

    return tuple(
        os.path.join(dataset_root, subfolder)
        for subfolder in (images_folder, masks_folder, instances_folder)
    )
    
    
def get_folders(path):
    """Return the names of the directories located directly inside ``path``.

    Args:
        path: Directory to inspect.

    Returns:
        list of str: sub-directory names (not full paths), in the order
        ``os.walk`` reports them.

    Raises:
        IndexError: if ``os.walk`` yields nothing for ``path`` (for example
            when the directory does not exist). The exception type matches
            what the previous ``list(os.walk(path))[0]`` lookup raised.
    """
    # Only the first os.walk() entry is needed; materialising the whole
    # generator would traverse every nested directory below ``path``.
    try:
        _, folders, _ = next(os.walk(path))
    except StopIteration:
        raise IndexError(
            'no directory listing available for {!r}'.format(path)
        ) from None
    return folders


def split_fullname(fullname):
    """Split an underscore-joined name into the list of its parts."""
    separator = '_'
    parts = fullname.split(separator)
    return parts


def get_fullname(*name_parts):
    """Join the string form of every given part with underscores."""
    return '_'.join([str(part) for part in name_parts])


def get_filepath(*path_parts, file_type):
    """Join ``path_parts`` into a path and append ``.<file_type>``.

    Args:
        *path_parts: Path components, combined with ``join_pathes``.
        file_type: Extension without the leading dot (keyword-only).

    Returns:
        str: the joined path followed by a dot and ``file_type``.
    """
    base_path = join_pathes(*path_parts)
    return '{}.{}'.format(base_path, file_type)


def join_pathes(*pathes):
    """Combine the given path components with ``os.path.join``."""
    joined = os.path.join(*pathes)
    return joined


def stratify(
    data_info, data_path=DATA_PATH, 
    test_size=0.2, random_state=42,
    channel=CHANNELS[0], instance_type=INSTANCE_TYPE,
    instances_folder=INSTANCES_FOLDER
):
    """Create one stratified train/test split of the rows of ``data_info``.

    For every row the instance file
    ``<data_path>/<date>_<name>_<channel>/<instances_folder>/<instance>/<instance>.<instance_type>``
    is passed to ``get_area``; the resulting values are binned by
    ``get_labels`` and those bins are the stratification labels.

    Args:
        data_info: DataFrame with ``date``, ``name``, ``ix`` and ``iy`` columns.
        data_path: Root directory that contains the dataset folders.
        test_size: Forwarded to ``StratifiedShuffleSplit``.
        random_state: Forwarded to ``StratifiedShuffleSplit``.
        channel: Channel name used to build the dataset and instance names.
        instance_type: Extension of the instance files.
        instances_folder: Name of the instances sub-folder.

    Returns:
        The generator returned by ``StratifiedShuffleSplit.split``; it yields
        a single ``(train_indices, test_indices)`` pair (``n_splits=1``).
    """
    areas = []
    for _, row in data_info.iterrows():
        dataset = get_fullname(row['date'], row['name'], channel)
        instance_name = get_fullname(
            row['date'], row['name'], channel, row['ix'], row['iy']
        )
        # The instance file lives in a folder that carries the same name.
        instance_path = get_filepath(
            data_path,
            dataset,
            instances_folder,
            instance_name,
            instance_name,
            file_type=instance_type
        )
        areas.append(get_area(instance_path))

    labels = get_labels(np.array(areas))

    sss = StratifiedShuffleSplit(
        n_splits=1,
        test_size=test_size,
        random_state=random_state
    )

    # The split is derived from the labels alone, so a zero placeholder of
    # matching length replaces the former get_data() call, which read every
    # image and mask from disk and ignored ``data_path`` and ``channel``.
    placeholder = np.zeros(len(labels))

    return sss.split(placeholder, labels)


def get_data(
    data_info, channel=CHANNELS[0], data_path=DATA_PATH,
    image_folder=IMAGES_FOLDER, mask_folder=MASKS_FOLDER,
    image_type=IMAGE_TYPE, mask_type=MASK_TYPE
):
    """Read the image and the mask of every row of ``data_info``.

    Files are looked up under ``<data_path>/<date>_<name>_<channel>/`` in
    ``image_folder`` and ``mask_folder``; the file name is
    ``<date>_<name>_<channel>_<ix>_<iy>`` plus the matching extension.

    Args:
        data_info: DataFrame with ``date``, ``name``, ``ix`` and ``iy`` columns.
        channel: Channel name used in the dataset and file names.
        data_path: Root directory that contains the dataset folders.
        image_folder: Name of the images sub-folder.
        mask_folder: Name of the masks sub-folder.
        image_type: Extension of the image files.
        mask_type: Extension of the mask files.

    Returns:
        tuple: ``(x, y)`` numpy arrays stacking the images and the masks in
        row order; ``y`` gets one extra trailing axis of length 1.
    """
    images, masks = [], []
    for _, row in data_info.iterrows():
        date, name = row['date'], row['name']
        dataset = get_fullname(date, name, channel)
        filename = get_fullname(date, name, channel, row['ix'], row['iy'])

        image_path = get_filepath(
            data_path, dataset, image_folder, filename,
            file_type=image_type
        )
        mask_path = get_filepath(
            data_path, dataset, mask_folder, filename,
            file_type=mask_type
        )

        images.append(read_tensor(image_path))
        masks.append(read_tensor(mask_path))

    x = np.array(images)
    y = np.array(masks)
    # Append a trailing axis of length 1 to the stacked masks.
    y = y.reshape(y.shape + (1,))

    return x, y


def read_tensor(filepath):
    """Read the file at ``filepath`` with ``imageio.imread`` and return it."""
    tensor = imageio.imread(filepath)
    return tensor


def get_area(instance_path):
    """Return the median geometry area of an instance file, divided by 100.

    Args:
        instance_path: Path of a file readable by ``geopandas.read_file``.

    Returns:
        The median of the per-geometry ``area`` values after dividing each
        by 100.
    """
    # NOTE(review): the meaning of the /100 scaling is not visible here.
    geometries = gp.read_file(instance_path)['geometry']
    scaled_areas = geometries.area / 100
    return scaled_areas.median()


def get_labels(distr):
    """Bin the values of ``distr`` into quartile labels 0..3.

    Args:
        distr: numpy array of values.

    Returns:
        numpy array of the same shape: 0 for values below the 0.25 quantile,
        1 below the median, 2 below the 0.75 quantile and 3 otherwise.
    """
    labels = np.full(distr.shape, 3)
    # Apply the thresholds from highest to lowest so that each lower
    # quantile overwrites the label assigned by the previous one.
    for label, quantile in ((2, 0.75), (1, 0.5), (0, 0.25)):
        below = distr < np.quantile(distr, quantile)
        labels[below] = label
    return labels


def stratified_split(data_info, data_path=DATA_PATH, 
                     test_size=0.2, random_state=42,
                     channel=CHANNELS[0], instance_type=INSTANCE_TYPE,
                     instances_folder=INSTANCES_FOLDER):
    """Split ``data_info`` into stratified train/test frames and save them.

    The row indices come from ``stratify``. Both frames are also written as
    ``train_df.csv`` and ``test_df.csv`` directly under ``data_path``
    (without the index column).

    Args:
        data_info: DataFrame with ``date``, ``name``, ``ix`` and ``iy`` columns.
        data_path: Root directory of the datasets; also where the CSV files
            are written.
        test_size: Forwarded to ``stratify``.
        random_state: Forwarded to ``stratify``.
        channel: Forwarded to ``stratify``.
        instance_type: Forwarded to ``stratify``.
        instances_folder: Forwarded to ``stratify``.

    Returns:
        tuple: ``(train_df, test_df)`` row subsets of ``data_info``.
    """
    # Forward the caller's arguments; previously the module-level defaults
    # were passed here, so every parameter of this function was ignored.
    splits = stratify(
        data_info, data_path=data_path,
        test_size=test_size, random_state=random_state,
        channel=channel, instance_type=instance_type,
        instances_folder=instances_folder
    )

    # stratify() configures a single split, so take that one pair directly.
    train_ix, test_ix = next(iter(splits))
    train_df = data_info.iloc[train_ix]
    test_df = data_info.iloc[test_ix]

    train_df.to_csv(
        get_filepath(data_path, 'train_df', file_type='csv'),
        index=False
    )
    test_df.to_csv(
        get_filepath(data_path, 'test_df', file_type='csv'),
        index=False
    )

    return train_df, test_df


def get_input_pair(data_info_row, channels=CHANNELS, data_path=DATA_PATH,
              image_folder=IMAGES_FOLDER, mask_folder=MASKS_FOLDER,
              image_type=IMAGE_TYPE, mask_type=MASK_TYPE):
    """Load the multi-channel image and the mask for one index record.

    One image per channel is read from
    ``<data_path>/<date>_<name>_<channel>/<image_folder>/`` and the images are
    concatenated along their last axis. The mask is read from the dataset
    folder of the LAST entry of ``channels``.

    Args:
        data_info_row: Mapping with ``date``, ``name``, ``ix`` and ``iy`` keys.
        channels: Channel names to stack, in order.
        data_path: Root directory that contains the dataset folders.
        image_folder: Name of the images sub-folder.
        mask_folder: Name of the masks sub-folder.
        image_type: Extension of the image files.
        mask_type: Extension of the mask files.

    Returns:
        dict: ``{'features': image, 'targets': mask}``, both converted with
        ``torchvision.transforms.ToTensor``.

    Raises:
        ValueError: if ``channels`` is empty.
    """
    channels = list(channels)
    if not channels:
        # Previously this surfaced as an UnboundLocalError further down.
        raise ValueError('channels must contain at least one channel name')

    date, name = data_info_row['date'], data_info_row['name']
    ix, iy = data_info_row['ix'], data_info_row['iy']

    def build_path(channel, folder, file_type):
        # <data_path>/<date>_<name>_<channel>/<folder>/<date>_<name>_<channel>_<ix>_<iy>.<ext>
        return get_filepath(
            data_path,
            get_fullname(date, name, channel),
            folder,
            get_fullname(date, name, channel, ix, iy),
            file_type=file_type
        )

    image_tensors = []
    for channel in channels:
        image_tensor = read_tensor(build_path(channel, image_folder, image_type))
        if image_tensor.ndim == 2:
            # Give 2-D images a trailing axis so they can be concatenated
            # along axis 2 with the other images.
            image_tensor = image_tensor.reshape(*image_tensor.shape, 1)

        image_tensors.append(image_tensor)

    # Same lookup as before, now explicit: the original reused the loop
    # variables left over from the final iteration.
    mask_path = build_path(channels[-1], mask_folder, mask_type)

    image = transforms.ToTensor()(np.concatenate(image_tensors, axis=2))
    mask = transforms.ToTensor()(read_tensor(mask_path))

    return {'features': image, 'targets': mask}


def create_loaders(train_df, val_df):
    """Create the ``train`` and ``valid`` data loaders.

    Args:
        train_df: Training records handed to ``UtilsFactory.create_loader``.
        val_df: Validation records handed to ``UtilsFactory.create_loader``.

    Returns:
        collections.OrderedDict: loaders under the keys ``"train"`` and
        ``"valid"``, in that order.
    """
    # NOTE(review): ``UtilsFactory`` and ``args`` are not defined in the
    # visible part of the notebook; they must be provided before this
    # function is called.
    # ``get_input_pair`` replaces the former ``get_image``, which is not
    # defined in this notebook; it maps one record to a dict with
    # 'features' and 'targets'.
    train_loader = UtilsFactory.create_loader(
        train_df,
        open_fn=get_input_pair,
        batch_size=args.batch_size,
        num_workers=args.num_workers,
        shuffle=True)

    # NOTE(review): the validation loader is shuffled as well - confirm
    # that this is intended.
    valid_loader = UtilsFactory.create_loader(
        val_df,
        open_fn=get_input_pair,
        batch_size=args.batch_size,
        num_workers=args.num_workers,
        shuffle=True)

    loaders = collections.OrderedDict()
    loaders["train"] = train_loader
    loaders["valid"] = valid_loader

    return loaders

In [47]:
# Build the index (one row per instance folder) and preview the first rows.
data_info = get_data_info()
data_info.head()

Unnamed: 0,date,ix,iy,name
0,20160103,20,20,66979721-be1b-4451-84e0-4a573236defd
1,20160103,26,13,66979721-be1b-4451-84e0-4a573236defd
2,20160103,12,22,66979721-be1b-4451-84e0-4a573236defd
3,20160103,16,5,66979721-be1b-4451-84e0-4a573236defd
4,20160103,30,16,66979721-be1b-4451-84e0-4a573236defd


In [48]:
# Load every image/mask pair for the default channel and check the stacked image shape.
x, y = get_data(data_info)
x.shape

(245, 224, 224, 3)

In [49]:
# Stratified train/test split; also writes train_df.csv and test_df.csv under DATA_PATH.
train_df, test_df = stratified_split(data_info)

In [50]:
# Convert both frames to lists of per-row dicts.
# NOTE(review): this rebinds train_df/test_df from DataFrame to list, so
# re-running this cell fails; separate names would keep the cell re-runnable.
train_df = train_df.to_dict('records')
test_df = test_df.to_dict('records')

In [67]:
# Build the features/targets pair for the second training record.
pair = get_input_pair(train_df[1])

In [71]:
# Inspect the shape of the stacked feature tensor.
pair['features'].shape

torch.Size([9, 224, 224])