This notebook splits each original TIFF file into tiles,  
assigns the tiles to folds using GroupKFold grouped by the name of the original file,  
and saves this information.

# Imports

In [1]:
import cv2
import numpy as np
import pandas as pd

import rasterio
from sklearn.model_selection import GroupKFold

from tqdm.notebook import tqdm

from utils import (
    rle_decode,
    make_slices,
    load_image_from_slice,
    is_null_image,
)

from params import *

import warnings
warnings.filterwarnings('ignore')

# Load dataset
The train file contains the RLE-encoded label for each TIFF file name.

In [2]:
# Read the training table from TRAIN_CSV. The tiling cell below reads each
# row's 'id' (TIFF file name without extension) and 'encoding' (RLE mask).
df_train = pd.read_csv(TRAIN_CSV)

# Split images and save

In [None]:
total = 0  # Number of slices examined across all TIFF files.
valid = 0  # Number of slices actually written to disk.
# Per-tile channel statistics, collected so that the dataset-wide mean and
# standard deviation can be derived afterwards for image normalization.
image_sum = []  # Per-channel mean of each saved tile (pixels divided by 255).
image_sqsum = []  # Per-channel mean of the squared scaled pixels of each saved tile.

for _, row in df_train.iterrows():
    tiff_id = row['id']

    # One output folder per TIFF id, holding its image tiles and mask tiles.
    image_path = TILE_PATH / tiff_id / 'images'
    mask_path = TILE_PATH / tiff_id / 'masks'
    image_path.mkdir(parents=True, exist_ok=True)
    mask_path.mkdir(parents=True, exist_ok=True)

    tiff_file = TRAIN_PATH / (tiff_id + '.tiff')

    # Open the TIFF through a context manager so the dataset handle is
    # released before the next file is processed.
    with rasterio.open(tiff_file) as dataset:
        # Compute the tile coordinates and decode the RLE label into a
        # full-size mask for this TIFF.
        slices = make_slices(dataset)
        mask_tot = rle_decode(row['encoding'], dataset.shape)

        tk0 = tqdm(enumerate(slices, 1), total=len(slices))
        # The file name is constant for the whole bar, so set it once.
        tk0.set_postfix(file=tiff_id)

        # `tile_slice` (not `slice`) so the builtin is not shadowed for the
        # rest of the notebook session.
        for idx, tile_slice in tk0:
            total += 1
            image = load_image_from_slice(dataset, tile_slice)

            # Skip tiles that `is_null_image` rejects (per the original
            # note: tiles whose pixels are mostly white or black).
            if is_null_image(image):
                continue
            valid += 1

            # Save the image tile in the TIFF id folder.
            # NOTE(review): cv2.imwrite treats 3-channel arrays as BGR;
            # confirm the channel order returned by `load_image_from_slice`.
            cv2.imwrite(f"{image_path}/{tiff_id}_{idx}.png", image)

            # Save the mask cropped at the same coordinates as the image.
            # Cast explicitly to uint8 instead of handing a float array to
            # the PNG encoder and relying on an implicit conversion.
            mask = mask_tot[tile_slice[0]:tile_slice[1], tile_slice[2]:tile_slice[3]]
            cv2.imwrite(
                f"{mask_path}/{tiff_id}_{idx}.png",
                (mask * 255.).astype(np.uint8),
            )

            # Record per-channel statistics of the scaled tile.
            scaled = image / 255.
            image_sum.append(scaled.mean(axis=(0, 1)))
            image_sqsum.append((scaled ** 2).mean(axis=(0, 1)))

# Dataset-wide statistics: std = sqrt(E[x^2] - E[x]^2), per channel.
# NOTE(review): averaging per-tile means equals the pixel-weighted mean only
# if every saved tile has the same number of pixels -- confirm that
# `make_slices` yields equal-sized tiles.
image_mean = np.mean(image_sum, axis=0)
image_std = np.sqrt(np.mean(image_sqsum, axis=0) - image_mean**2)
print(f"{valid} / {total} images saved")
print(f"Mean: {image_mean} \nStdev: {image_std}")

# Create Folds

Collect the image path and TIFF id of each saved tile to build a dataframe, then run GroupKFold grouped by the original file name. Grouping is needed because tiles from the same original TIFF file appearing in both the training and validation folds would cause data leakage.

In [None]:
# Gather every saved image tile. The glob pattern matches the layout
# TILE_PATH/<tiff id>/images/<tile name>.png.
files_list = list(TILE_PATH.glob('./*/images/*.png'))
ids, ids_idx, ids_path = [], [], []

for tile_file in tqdm(files_list, total=len(files_list)):
    ids_path.append(tile_file)
    # File name without the extension identifies the individual tile.
    ids_idx.append(tile_file.stem)
    # The folder two levels above the file is named after the TIFF id.
    ids.append(tile_file.parent.parent.name)

In [None]:
# One row per saved tile: source TIFF id, tile name and tile path.
df_fold = pd.DataFrame(
    {
        'id': ids,
        'id_idx': ids_idx,
        'file_path': ids_path,
    }
)

gkf = GroupKFold(n_splits=N_SPLITS)
# -1 marks rows that have not been assigned to a fold yet.
df_fold['fold'] = -1

# Grouping by 'id' keeps all tiles of one TIFF inside the same fold; only the
# validation indices of each split are needed to label the rows.
fold_splits = gkf.split(df_fold, groups=df_fold['id'])
for fold_no, (_, val_rows) in enumerate(fold_splits):
    df_fold.loc[val_rows, 'fold'] = fold_no

fold_csv = DATA_PATH / f'df_fold_{WINDOW}_{OVERLAP}.csv'
df_fold.to_csv(fold_csv, index=False)