In [None]:
!pip install numpy pandas tqdm joblib pillow

In [None]:
!apt-get install libvips-dev -y --no-install-recommends --download-only -o dir::cache='./'

!mkdir ./libvips
!mv ./archives/* ./libvips
!rm -rf ./archives
!ls ./libvips

!yes | dpkg -i ./libvips/*.deb

!pip install pyvips
!pip wheel pyvips
!mkdir pyvips
!mv *.whl ./pyvips

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import pyvips
import numpy as np
from PIL import Image
import random
import glob
import multiprocessing as mproc
from tqdm.auto import tqdm
from joblib import Parallel, delayed
import gc, time

In [None]:
# Folder holding the source PNG images to be tiled.
DATASET_IMAGES = "/home/input/test"

# Tell libvips to spill to disk instead of memory once an image exceeds 9GB.
os.environ['VIPS_DISC_THRESHOLD'] = '9gb'

# Sorted list of every PNG directly inside the dataset folder.
png_pattern = os.path.join(DATASET_IMAGES, '*.png')
ls = sorted(glob.iglob(png_pattern))
print(f"found images: {len(ls)}")

# Recursively list every file below the dataset folder as a quick sanity check.
for dirname, _, filenames in os.walk(DATASET_IMAGES):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

In [None]:
def extract_tiles(raw_dir, dest_dir, size: int = 512, scale: float = 2, drop_thr: float = 0.7) -> tuple:
    """Cut one image into square tiles and save the non-empty ones as PNG files.

    The image is walked on a ``size`` x ``size`` grid, row by row. Edge tiles
    smaller than a grid cell are zero-padded to full size. Pixels that are zero
    in every channel are painted white, and a tile is skipped when the share of
    near-white pixels (channel mean > 250) is at least ``drop_thr``. Kept tiles
    are resized by ``scale`` (Lanczos) and written to ``dest_dir`` as
    ``{name}_{k:06}_{col}-{row}.png``, where ``k`` is the tile's position in
    the grid list and ``col``/``row`` are 1-based grid coordinates.

    Args:
        raw_dir: Path of the source image file (a file path despite the name).
        dest_dir: Existing directory that receives the tile PNGs.
        size: Edge length, in source pixels, of each grid cell.
        scale: Factor applied to ``size`` to get the saved tile edge length.
        drop_thr: Fraction of background pixels at or above which a tile is skipped.

    Returns:
        ``(files, idxs)``: the list of written tile paths, and the list of all
        grid cells as ``(y, y + size, x, x + size)`` tuples (skipped ones included).
    """
    name, _ = os.path.splitext(os.path.basename(raw_dir))  # image name without extension
    im = pyvips.Image.new_from_file(raw_dir)
    w = h = size
    # Output size does not depend on the tile, so compute it once.
    new_size = int(size * scale), int(size * scale)
    # https://stackoverflow.com/a/47581978/4521646
    idxs = [(y, y + h, x, x + w) for y in range(0, im.height, h) for x in range(0, im.width, w)]
    files = []
    for k, (y, y_, x, x_) in enumerate(idxs):
        tile = im.crop(x, y, min(w, im.width - x), min(h, im.height - y)).numpy()

        # A 2-D array (single-band image) gets a channel axis so that the
        # channel slicing and the axis=2 reductions below work for it as well.
        is_gray = tile.ndim == 2
        if is_gray:
            tile = tile[..., np.newaxis]
        tile = tile[..., :3]  # drop alpha / extra bands

        # Zero-pad edge tiles up to the full (h, w) cell.
        if tile.shape[:2] != (h, w):
            padded = np.zeros((h, w, tile.shape[2]), dtype=tile.dtype)
            padded[:tile.shape[0], :tile.shape[1], :] = tile
            tile = padded

        # Paint all-zero pixels (padding included) white, then treat
        # near-white pixels as background.
        mask_bg = np.sum(tile, axis=2) == 0
        tile[mask_bg, :] = 255
        mask_bg = np.mean(tile, axis=2) > 250
        # Skip tiles whose background ratio reaches drop_thr.
        if np.sum(mask_bg) >= (mask_bg.size * drop_thr):
            continue

        p_img = os.path.join(dest_dir, f"{name}_{k:06}_{int(x_ / w)}-{int(y_ / h)}.png")
        # Pillow cannot build an image from an (h, w, 1) array; hand it (h, w).
        pixels = tile[..., 0] if is_gray else tile
        Image.fromarray(pixels).resize(new_size, Image.LANCZOS).save(p_img)
        files.append(p_img)
    return files, idxs

def prune_tiles(files: list, max_samples: float = 1.0):
    """Randomly keep a subset of tile files and delete the others from disk.

    Args:
        files: Paths of the tile files. The list itself is not modified.
        max_samples: An ``int`` is the absolute number of files to keep; any
            other number is the fraction of ``len(files)`` to keep (truncated
            towards zero).

    Returns:
        The paths that were kept, in shuffled order.

    Raises:
        OSError: If a file selected for deletion cannot be removed.
    """
    keep_count = max_samples if isinstance(max_samples, int) else int(len(files) * max_samples)
    # Shuffle a copy so the caller's list keeps its order and contents.
    shuffled = list(files)
    random.shuffle(shuffled)
    for file_path in shuffled[keep_count:]:
        os.remove(file_path)
    return shuffled[:keep_count]

def extract_prune_tiles(
    idx_path_img, dest_dir: str = "/home/output",
    size: int = 512, scale: float = 2.0, drop_thr: float = 0.7, max_samples: float = None
):
    """Tile one image into its own sub-folder and optionally thin out the result.

    Args:
        idx_path_img: ``(index, image_path)`` pair, e.g. one item of ``enumerate(paths)``.
        dest_dir: Root output folder; tiles go to ``dest_dir/<image name>``.
        size: Forwarded to ``extract_tiles``.
        scale: Forwarded to ``extract_tiles``.
        drop_thr: Forwarded to ``extract_tiles``.
        max_samples: When truthy, forwarded to ``prune_tiles`` to limit how
            many tiles are kept; when falsy, every extracted tile is kept.
    """
    position, image_path = idx_path_img
    print(f"processing #{position}: {image_path}")

    # Each image gets a dedicated folder named after the file (without extension).
    stem = os.path.splitext(os.path.basename(image_path))[0]
    tile_dir = os.path.join(dest_dir, stem)
    os.makedirs(tile_dir, exist_ok=True)

    saved_tiles, _ = extract_tiles(image_path, tile_dir, size, scale, drop_thr)
    if max_samples:
        prune_tiles(saved_tiles, max_samples)

    # Force a collection and pause briefly before this worker takes the next image.
    gc.collect()
    time.sleep(1)

In [None]:
# Tile every PNG found in DATASET_IMAGES using a pool of worker processes.
ls = sorted(glob.glob(os.path.join(DATASET_IMAGES, '*.png')))
print(f"found images: {len(ls)}")

CHUNK_START = 270  # skip the first images so the work can be done in smaller chunks
N_WORKERS = 3      # number of parallel worker processes

ls = ls[CHUNK_START:]

# imap_unordered yields results as workers finish, so completion order does
# not follow input order. The index passed to each task is relative to the chunk.
pool = mproc.Pool(N_WORKERS)
try:
    with tqdm(total=len(ls)) as tqdm_bar:
        for _ in pool.imap_unordered(extract_prune_tiles, enumerate(ls)):
            tqdm_bar.update()
    pool.close()
except BaseException:
    # A failing task (or an interrupt) must not leave worker processes behind.
    pool.terminate()
    raise
finally:
    pool.join()