In [1]:
import cv2
import numpy as np
import pandas as pd

import rasterio
from sklearn.model_selection import GroupKFold

from tqdm.notebook import tqdm

from utils import (
    rle_decode,
    make_slices,
    load_image_from_slice,
    is_null_image,
)

from params import *

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Training metadata table; the tiling loop below reads its 'id' and 'encoding' columns.
df_train = pd.read_csv(TRAIN_CSV)

In [3]:
# Cut every training TIFF into tiles, write the non-null tiles and their masks
# to disk as PNGs, and accumulate statistics used to report mean / stdev.
total = 0         # tiles visited, including those rejected by is_null_image
valid = 0         # tiles actually written to disk
image_sum = []    # per-tile mean of the pixels scaled by 1/255, over axes (0, 1)
image_sqsum = []  # per-tile mean of the squared scaled pixels, over axes (0, 1)

for _, row in df_train.iterrows():
    image_path = TILE_PATH / row['id'] / 'images'
    mask_path = TILE_PATH / row['id'] / 'masks'
    # mkdir(exist_ok=True) is already idempotent, so no is_dir() pre-check is
    # needed; calling it on the instance also avoids depending on the name
    # `Path` being in scope.
    image_path.mkdir(parents=True, exist_ok=True)
    mask_path.mkdir(parents=True, exist_ok=True)

    tiff_file = TRAIN_PATH / (row['id'] + '.tiff')
    # The context manager closes the raster handle once this file's tiles are
    # written, instead of leaking one open dataset per training image.
    with rasterio.open(tiff_file) as dataset:
        slices = make_slices(dataset)
        # Decode the full-resolution mask once per file; tiles are cropped from it.
        mask_tot = rle_decode(row['encoding'], dataset.shape)

        # The postfix is constant for the whole file, so set it once here
        # rather than forcing a progress-bar refresh on every tile.
        tk0 = tqdm(
            enumerate(slices, 1),
            total=len(slices),
            postfix={'file': row['id']},
        )
        # `bounds` (not `slice`) to avoid shadowing the builtin.
        for idx, bounds in tk0:
            total += 1
            image = load_image_from_slice(dataset, bounds)
            if is_null_image(image):
                continue

            valid += 1
            cv2.imwrite(f"{image_path}/{row['id']}_{idx}.png", image)

            # bounds is indexed as (row_start, row_stop, col_start, col_stop).
            mask = mask_tot[bounds[0]:bounds[1], bounds[2]:bounds[3]]
            cv2.imwrite(f"{mask_path}/{row['id']}_{idx}.png", mask*255.)

            scaled = image / 255.
            image_sum.append(scaled.mean(axis=(0, 1)))
            image_sqsum.append((scaled**2).mean(axis=(0, 1)))

# Var[x] = E[x^2] - E[x]^2, using the average of the per-tile means.
# NOTE(review): this equals the true dataset statistic only if every saved tile
# has the same number of pixels -- confirm against make_slices.
image_mean = np.mean(image_sum, axis=0)
image_std = np.sqrt(np.mean(image_sqsum, axis=0) - image_mean**2)
print(f"{valid} / {total} images saved")
print(f"Mean: {image_mean} \nStdev: {image_std}")

  0%|          | 0/1015 [00:00<?, ?it/s]

  0%|          | 0/1750 [00:00<?, ?it/s]

  0%|          | 0/315 [00:00<?, ?it/s]

  0%|          | 0/2184 [00:00<?, ?it/s]

  0%|          | 0/1610 [00:00<?, ?it/s]

  0%|          | 0/595 [00:00<?, ?it/s]

  0%|          | 0/1131 [00:00<?, ?it/s]

  0%|          | 0/589 [00:00<?, ?it/s]



  0%|          | 0/1892 [00:00<?, ?it/s]



In [None]:
# Index every tile image found under TILE_PATH/<id>/images/.
# Path.glob() yields entries in arbitrary, filesystem-dependent order, so the
# paths are sorted to make the resulting table (and the CSV written from it)
# reproducible across runs and machines.
files_list = sorted(TILE_PATH.glob('./*/images/*.png'))
ids = []       # name of the directory two levels above each PNG
ids_idx = []   # PNG file name without its extension
ids_path = []  # full path to the PNG

for pth in tqdm(files_list, total=len(files_list)):
    ids.append(pth.parts[-3])
    ids_idx.append(pth.stem)
    ids_path.append(pth)

In [None]:
# One row per tile; 'id' is the grouping key so that all tiles sharing an id
# end up in the same validation fold.
df_fold = pd.DataFrame(
    {
        'id': ids,
        'id_idx': ids_idx,
        'file_path': ids_path,
    }
)

# Collect the validation-fold number for each row in a plain array first
# (-1 marks "not assigned"), then attach it as the 'fold' column.
fold_labels = np.full(len(df_fold), -1, dtype='int64')

gkf = GroupKFold(n_splits=N_SPLITS)
fold_splits = gkf.split(df_fold, groups=df_fold['id'])
for fold_no, (_, val_rows) in enumerate(fold_splits):
    fold_labels[val_rows] = fold_no

df_fold['fold'] = fold_labels

out_csv = DATA_PATH / f'df_fold_{WINDOW}_{OVERLAP}.csv'
df_fold.to_csv(out_csv, index=False)