In [None]:
import os
from collections import Counter
import numpy as np
import cv2
from tqdm import tqdm_notebook
from sdcdup.utils import get_project_root
from sdcdup.utils import get_tile
from sdcdup.utils import fuzzy_diff

%load_ext dotenv
%dotenv

# Number of tiles extracted from every parent image.
# NOTE(review): presumably a 3x3 grid of 256px tiles per 768px image, judging by the
# directory names below -- confirm against sdcdup.utils.get_tile.
n_tiles = 9


def _require_env(name):
    """Return the value of environment variable `name`.

    Raises:
        RuntimeError: if the variable is not set. Without this check os.getenv
            returns None and os.path.join fails with an opaque TypeError.
    """
    value = os.getenv(name)
    if value is None:
        raise RuntimeError(f'Environment variable {name} is not set; check your .env file.')
    return value


project_root = get_project_root()
# Parent images are read from RAW_DATA_DIR; generated tiles go to PROCESSED_DATA_DIR.
train_image_dir = os.path.join(project_root, _require_env('RAW_DATA_DIR'), 'train_768')
train_tile_dir = os.path.join(project_root, _require_env('PROCESSED_DATA_DIR'), 'train_256')

os.makedirs(train_tile_dir, exist_ok=True)

# os.listdir returns entries in arbitrary order; sort so every run processes the
# images in the same order.
img_ids = sorted(os.listdir(train_image_dir))
len(img_ids)

## Create the image tiles

In [None]:
# Cut every parent image into n_tiles tiles and write each one to train_tile_dir
# as '<parent base name>_<tile index><parent extension>'.
for img_id in tqdm_notebook(img_ids):
    img = cv2.imread(os.path.join(train_image_dir, img_id))
    if img is None:
        # cv2.imread returns None instead of raising when an entry cannot be decoded
        # (sub-directories, hidden files, corrupt images), so skip it explicitly.
        print(f'Skipping unreadable file: {img_id}')
        continue
    # splitext splits on the last dot only, so names containing extra dots (or none)
    # no longer break the unpacking; fileext keeps its leading dot.
    filebase, fileext = os.path.splitext(img_id)
    for idx in range(n_tiles):
        tile_id = f'{filebase}_{idx}{fileext}'
        tile = get_tile(img, idx)
        # cv2.imwrite reports failure through its return value; do not ignore it.
        if not cv2.imwrite(os.path.join(train_tile_dir, tile_id), tile):
            raise IOError(f'Failed to write tile {tile_id}')

## Double check that tiles are near identical copies of parent
This is just a sanity check on JPEG compression. Running it is optional; it counts how many saved tiles are pixel-for-pixel identical to the tiles cut from the parent image, from which the percentage of exactly converted tiles can be derived.

In [None]:
# Re-read every saved tile and compare it with the tile cut directly from the parent.
n_exact_matches = 0                       # total number of tiles identical to their source
# Histogram over "number of exact tiles per image". That number ranges over
# 0..n_tiles inclusive, so n_tiles + 1 bins are needed (n_tiles bins raised
# IndexError as soon as one image had all of its tiles match).
per_image_matches = [0]*(n_tiles + 1)
exact_matches = [0]*n_tiles               # exact-match count per tile index
# Non-exact tiles, bucketed by fuzzy_diff(...) // 10000.
# NOTE(review): the meaning/units of fuzzy_diff's return value are not visible here --
# confirm against sdcdup.utils.fuzzy_diff.
diff_counts = Counter()
for ii, img_id in enumerate(tqdm_notebook(img_ids)):
    img = cv2.imread(os.path.join(train_image_dir, img_id))
    if img is None:
        # cv2.imread returns None for entries it cannot decode; nothing to compare.
        print(f'Skipping unreadable file: {img_id}')
        continue
    # splitext splits on the last dot only; fileext keeps its leading dot.
    filebase, fileext = os.path.splitext(img_id)
    n_matches_per_image = 0
    for idx in range(n_tiles):
        tile_id = f'{filebase}_{idx}{fileext}'
        tile_path = os.path.join(train_tile_dir, tile_id)
        tile = cv2.imread(tile_path)
        if tile is None:
            raise FileNotFoundError(
                f'Could not read tile {tile_path}; create the image tiles before running this check.')
        # Cut the reference tile once and reuse it for both comparisons.
        expected = get_tile(img, idx)
        if np.array_equal(tile, expected):
            n_exact_matches += 1
            exact_matches[idx] += 1
            n_matches_per_image += 1
        else:
            diff = fuzzy_diff(tile, expected)
            diff_counts[int(diff // 10000)] += 1

    per_image_matches[n_matches_per_image] += 1

    # Periodic progress report: images seen, total exact tiles, per-index and per-image counts.
    if ii % 1000 == 0:
        print(f'{ii:>6} {n_exact_matches:>6} {exact_matches} {per_image_matches}')

print(f'{diff_counts}')