In [3]:
import os
import time
from collections import Counter
import numpy as np
import cv2
from tqdm import tnrange, tqdm_notebook

# Root directory under which the derived tile dataset is written.
ship_dir = "data/"
# Source images are read from ./train_768 (relative to the working directory,
# not to ship_dir); tiles are written to <ship_dir>/train_256.
train_768_dir = os.path.join('.', 'train_768')
train_256_dir = os.path.join(ship_dir, 'train_256')
# exist_ok=True keeps this cell re-runnable once the output directory exists.
os.makedirs(train_256_dir, exist_ok=True)

# os.listdir returns entries in arbitrary order; sorting makes the processing
# order (and therefore the periodic progress output further down) reproducible
# across runs and machines.
img_ids = sorted(os.listdir(train_768_dir))
len(img_ids)

192555

## Create the image tiles

In [None]:
# Split every source image into a tiles_per_side x tiles_per_side grid of
# tile_size x tile_size crops and write each crop to train_256_dir as
# '<name>_<row><col><ext>'.
# NOTE(review): if the sources are JPEGs, cv2.imwrite re-encodes each tile, so
# tiles may not be pixel-identical to the parent region - confirm with the
# sanity-check section of this notebook.
tiles_per_side = 3
tile_size = 256
unreadable_ids = []

for img_id in tqdm_notebook(img_ids):
    img = cv2.imread(os.path.join(train_768_dir, img_id))
    if img is None:
        # cv2.imread reports failure by returning None instead of raising;
        # remember the file and move on rather than crashing on the slice below.
        unreadable_ids.append(img_id)
        continue

    # splitext splits on the last dot only, so names containing extra dots no
    # longer break the two-way unpacking. fileext keeps its leading dot.
    filebase, fileext = os.path.splitext(img_id)
    for i in range(tiles_per_side):
        for j in range(tiles_per_side):
            tile_id = f'{filebase}_{i}{j}{fileext}'
            tile = img[i * tile_size:(i + 1) * tile_size,
                       j * tile_size:(j + 1) * tile_size, :]
            tile_path = os.path.join(train_256_dir, tile_id)
            # imwrite returns False when nothing was written; fail loudly so an
            # incomplete tile set is not mistaken for a finished run.
            if not cv2.imwrite(tile_path, tile):
                raise OSError(f'Failed to write tile {tile_path}')

if unreadable_ids:
    print(f'Skipped {len(unreadable_ids)} unreadable file(s), e.g. {unreadable_ids[:5]}')

## Check that tiles are exact copies of their parent image (sanity check on JPEG compression)

In [6]:
def fuzzy_diff(tile1, tile2):
    """Return the total element-wise gap between two equally shaped arrays.

    For every position the element-wise maximum of the two inputs is taken and
    each input is subtracted from it. One of the two gaps is always zero, so
    their sum is the absolute difference at that position. Subtracting from the
    maximum keeps every intermediate non-negative, which avoids wrap-around
    when the inputs are unsigned integer arrays.

    Args:
        tile1: First array.
        tile2: Second array, same shape as ``tile1`` (``np.stack`` requires it).

    Returns:
        The sum of the absolute differences over all elements.
    """
    upper = np.stack([tile1, tile2]).max(axis=0)
    gap = (upper - tile2) + (upper - tile1)
    return gap.sum()


# Re-read every tile from disk and compare it with the matching region of its
# parent image. Tallies kept:
#   img_matches[k]      - number of images with exactly k identical tiles
#   tile_matches[(i,j)] - number of identical tiles per grid position
#   diff_counts[b]      - differing tiles, bucketed by fuzzy_diff(...) // 1000
#   problem_files       - parents or tiles that could not be read or compared
tiles_per_side = 3
tile_size = 256
report_every = 1000

img_matches = Counter()
tile_matches = Counter()
diff_counts = Counter()
problem_files = []

idx = 0  # stays 0 when img_ids is empty, so the summary below still works
for idx, img_id in enumerate(tqdm_notebook(img_ids), start=1):
    img = cv2.imread(os.path.join(train_768_dir, img_id))
    if img is None:
        # cv2.imread returns None on failure; such parents are recorded and
        # left out of img_matches.
        problem_files.append(img_id)
    else:
        # splitext splits on the last dot only; fileext keeps its leading dot.
        filebase, fileext = os.path.splitext(img_id)
        n_matches = 0
        for i in range(tiles_per_side):
            for j in range(tiles_per_side):
                tile_id = f'{filebase}_{i}{j}{fileext}'
                tile = cv2.imread(os.path.join(train_256_dir, tile_id))
                parent = img[i * tile_size:(i + 1) * tile_size,
                             j * tile_size:(j + 1) * tile_size, :]
                if tile is None or tile.shape != parent.shape:
                    # Missing, unreadable, or wrongly sized tile: it cannot be
                    # compared pixel by pixel, so record it instead of crashing.
                    problem_files.append(tile_id)
                    continue
                if np.array_equal(tile, parent):
                    tile_matches[(i, j)] += 1
                    n_matches += 1
                else:
                    diff = fuzzy_diff(tile, parent)
                    # int() keeps the Counter keys plain Python ints rather
                    # than numpy scalars, which keeps the printed tallies clean.
                    diff_counts[int(diff) // 1000] += 1

        img_matches[n_matches] += 1

    if idx % report_every == 0:
        print(f'{idx:>6} {img_matches}')
        print(f'{len(diff_counts):>6} {diff_counts}')

# Final summary: without it the last partial batch (fewer than report_every
# images) would never be reported.
if idx % report_every != 0:
    print(f'{idx:>6} {img_matches}')
    print(f'{len(diff_counts):>6} {diff_counts}')
if problem_files:
    print(f'{len(problem_files)} file(s) could not be compared, e.g. {problem_files[:5]}')
