ref: https://www.pyimagesearch.com/2014/05/26/opencv-python-k-means-color-clustering/

In [7]:
import os
import time
import h5py
import cv2
from collections import Counter
from collections import OrderedDict
from collections import defaultdict
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from PIL import Image
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor

from sklearn.cluster import KMeans
from sklearn.cluster import MiniBatchKMeans

from skimage.segmentation import find_boundaries
from skimage.transform import resize
from skimage.filters import gaussian
from skimage.util import montage

from utils import quick_stats
from utils import get_gaussian_mask

%matplotlib inline
%reload_ext autoreload
%autoreload 2

# Font-size presets (in points) used for the matplotlib defaults below.
SMALL_SIZE = 10
MEDIUM_SIZE = 12
BIGGER_SIZE = 16
BIGGEST_SIZE = 20

# (rc group, settings) pairs handed to plt.rc one group at a time.
for _rc_group, _rc_settings in (
    ('font', {'size': BIGGEST_SIZE}),              # default text size
    ('axes', {'titlesize': BIGGEST_SIZE,           # axes title
              'labelsize': BIGGEST_SIZE}),         # x and y axis labels
    ('xtick', {'labelsize': BIGGER_SIZE}),         # x tick labels
    ('ytick', {'labelsize': BIGGER_SIZE}),         # y tick labels
    ('legend', {'fontsize': MEDIUM_SIZE}),         # legend text
    ('figure', {'titlesize': BIGGEST_SIZE}),       # figure title
):
    plt.rc(_rc_group, **_rc_settings)

# Prefix of the .h5 / .csv artefact names built from it further down.
MASK_METHOD = 'fullmasks'
# Edge length (pixels) that images and masks are resized to in this notebook.
CROP_SIZE = 768
ship_dir = "/media/Borg_LS/DATA/geos/airbus/input/"
PATH = Path(ship_dir)
# Create both output folders up front: downsample_img saves into
# train_{CROP_SIZE} and downsample_msk saves into train_masks_{CROP_SIZE}.
# Without the second folder every mask save fails.
(PATH/f"train_{CROP_SIZE}").mkdir(exist_ok=True)
(PATH/f"train_masks_{CROP_SIZE}").mkdir(exist_ok=True)

def montage_rgb(x):
    """Build one montage per index of axis 3 of ``x`` and stack them on a new last axis.

    ``montage`` is called on ``x[:, :, :, i]`` for every ``i`` in
    ``range(x.shape[3])``.

    A padded variant was previously tried and is kept here for reference:
    ``montage(x[:, :, :, i], padding_width=10, fill=[255, 255, 255])``.
    """
    per_channel = [montage(x[:, :, :, i]) for i in range(x.shape[3])]
    return np.stack(per_channel, -1)


def montage_pad(x, *args, **kwargs):
    """Call ``montage`` on ``x`` with ``padding_width=10``; any extra arguments are forwarded."""
    return montage(x, padding_width=10, *args, **kwargs)

def _under_ship_dir(name):
    """Return ``name`` joined onto ``ship_dir``."""
    return os.path.join(ship_dir, name)


# Inputs at the fixed 768-pixel size.
train_image_768_dir = _under_ship_dir("train_768")
train_mask_768_dir = _under_ship_dir("train_masks_768")
h5_file_768 = _under_ship_dir(f"{MASK_METHOD}_768.h5")

# Locations that follow the configured CROP_SIZE.
train_chip_dir = _under_ship_dir(f"train_{CROP_SIZE}")
train_mask_dir = _under_ship_dir(f"train_masks_{CROP_SIZE}")
h5_file = _under_ship_dir(f"{MASK_METHOD}_{CROP_SIZE}.h5")

# CSV tables.
clean_mask_file = _under_ship_dir("train_ship_segmentations_v2_cleaned.csv")
train_hard_negs_file = _under_ship_dir(f"background_{CROP_SIZE}.csv")
train_ships_file = _under_ship_dir(f"{MASK_METHOD}_768_down_{CROP_SIZE}.csv")


In [8]:
# Load the cleaned segmentation table; the steps below use its 'ImageId'
# and 'EncodedPixels' columns.
masks = pd.read_csv(clean_mask_file)
# Leave this one image id out of everything that follows.
masks = masks[masks['ImageId'] != '6384c3e78.jpg']
# Number of table rows per image, merged back onto every row as 'counts'.
unique_img_ids = masks.groupby('ImageId').size().reset_index(name='counts')
masks = pd.merge(masks, unique_img_ids)
# Keep the count only where 'EncodedPixels' holds a string; every other row
# gets 0. The test runs once per column value instead of through a row-wise
# DataFrame.apply(axis=1), which builds a Series object for every row.
has_encoding = masks['EncodedPixels'].map(lambda px: isinstance(px, str)).astype(bool)
masks['counts'] = masks['counts'].where(has_encoding, 0)
masks = masks.drop(columns=['EncodedPixels'])
print(masks.shape[0], 'masks found')
print(masks['ImageId'].value_counts().shape[0])
masks.head()

229696 masks found
191987


Unnamed: 0,ImageId,counts
0,00003e153.jpg,0
1,0001124c7.jpg,0
2,000155de5.jpg,1
3,000194a2d.jpg,5
4,000194a2d.jpg,5


In [39]:
def downsample_img(img_ids):
    """Resize one image to ``CROP_SIZE`` x ``CROP_SIZE`` and save it.

    Parameters
    ----------
    img_ids : sequence
        ``(source_file_name, target_file_name)``. The source is opened from
        ``PATH/train_{chip_size}`` and the result is saved to
        ``PATH/train_{CROP_SIZE}``.

    Notes
    -----
    Reads the globals ``PATH``, ``CROP_SIZE`` and ``chip_size`` at call time;
    ``chip_size`` is assigned by other cells and must exist before this runs.
    """
    src_path = PATH/f"train_{chip_size}"/img_ids[0]
    tgt_path = PATH/f"train_{CROP_SIZE}"/img_ids[1]
    # Close the source handle deterministically (and before writing, since the
    # target may be the very same path) instead of leaving it to the GC.
    with Image.open(src_path) as src_img:
        resized = src_img.resize((CROP_SIZE, CROP_SIZE))
    resized.save(tgt_path)

def downsample_msk(img_ids):
    """Resize one mask image to ``CROP_SIZE`` x ``CROP_SIZE`` and save it.

    Parameters
    ----------
    img_ids : sequence
        ``(source_file_name, target_file_name)``. The source is opened from
        ``PATH/train_masks_{chip_size}`` and the result is saved to
        ``PATH/train_masks_{CROP_SIZE}``.

    Notes
    -----
    Reads the globals ``PATH``, ``CROP_SIZE`` and ``chip_size`` at call time;
    ``chip_size`` is assigned by other cells and must exist before this runs.
    """
    src_path = PATH/f"train_masks_{chip_size}"/img_ids[0]
    tgt_path = PATH/f"train_masks_{CROP_SIZE}"/img_ids[1]
    # Close the source handle deterministically (and before writing, since the
    # target may be the very same path) instead of leaving it to the GC.
    with Image.open(src_path) as src_msk:
        resized = src_msk.resize((CROP_SIZE, CROP_SIZE))
    resized.save(tgt_path)

## Downsample background images

In [49]:
# Background rows: images whose 'counts' is 0, re-indexed from 0.
negs_df = masks.loc[masks['counts'] == 0].reset_index(drop=True)

# Source chip size read by downsample_img / downsample_msk.
chip_size = 768
src_img_ids = negs_df['ImageId'].values
# Target file names equal the source names here. To suffix them with the
# chip size instead:
# tgt_img_ids = [img_id.replace(".jpg", f"_{chip_size}.jpg") for img_id in src_img_ids]
# img_ids = list(zip(src_img_ids, tgt_img_ids))
img_ids = [(img_id, img_id) for img_id in src_img_ids]

In [5]:
# Executor.map returns a lazy iterator, and an exception raised in a worker is
# only re-raised when its result is retrieved. Draining the iterator with
# list() makes a failed downsample stop the cell instead of passing silently.
with ProcessPoolExecutor(16) as e:
    list(e.map(downsample_img, img_ids))

## Downsample 768x768 ship images and masks

In [14]:
# Ship rows: 'counts' above 0, exact duplicate rows removed, re-indexed from 0.
ships_df = (
    masks[masks['counts'] > 0]
    .drop_duplicates()
    .reset_index(drop=True)
)

# Source chip size read by downsample_img / downsample_msk.
chip_size = 768
src_img_ids = ships_df['ImageId'].values
# Target file names equal the source names here. To suffix them with the
# chip size instead:
# tgt_img_ids = [img_id.replace(".jpg", f"_{chip_size}.jpg") for img_id in src_img_ids]
# img_ids = list(zip(src_img_ids, tgt_img_ids))
img_ids = [(img_id, img_id) for img_id in src_img_ids]

In [18]:
# Row count first, then a preview of the first rows.
print(ships_df.shape[0])
ships_df.head()

41988


Unnamed: 0,ImageId,counts,ClusterId
0,000155de5.jpg,1,0
1,000194a2d.jpg,5,0
2,0002756f7.jpg,2,0
3,000532683.jpg,2,0
4,00053c6ba.jpg,1,0


In [16]:
# Add a placeholder cluster label (0 for every row) unless the column exists.
if "ClusterId" not in ships_df:
    null_labels = np.zeros((len(ships_df),), dtype=int)
    # Assign the array itself rather than pd.Series(null_labels): a Series is
    # aligned on index labels, so a ships_df index other than 0..n-1 would
    # leave NaN in the rows that do not match.
    ships_df["ClusterId"] = null_labels

ships_df.head()

Unnamed: 0,ImageId,counts,ClusterId
0,000155de5.jpg,1,0
1,000194a2d.jpg,5,0
2,0002756f7.jpg,2,0
3,000532683.jpg,2,0
4,00053c6ba.jpg,1,0


In [17]:
# Write the ship table, without the row index, under ship_dir.
ships_df.to_csv(
    os.path.join(ship_dir, f"{MASK_METHOD}_768.csv"),
    index=False,
)

In [41]:
# Executor.map returns a lazy iterator, and an exception raised in a worker is
# only re-raised when its result is retrieved. Draining each iterator with
# list() makes a failed downsample stop the cell instead of passing silently.
with ProcessPoolExecutor(16) as e:
    list(e.map(downsample_img, img_ids))
with ThreadPoolExecutor(16) as e:
    list(e.map(downsample_msk, img_ids))

### Create new dataframe for downsampled images and write masks to h5 store.

In [4]:
# Per-image statistics keyed by the enumerate index used in the loop below.
# NOTE: the statements that would fill these dicts are commented out inside
# the loop, so all seven are still empty when this cell finishes.
n_pixels = defaultdict(int)
r_mean = defaultdict(float)
g_mean = defaultdict(float)
b_mean = defaultdict(float)
r_std = defaultdict(float)
g_std = defaultdict(float)
b_std = defaultdict(float)

t1 = time.time()
n_ships = len(ships_df)
with h5py.File(h5_file, 'a') as mask_h5:
    with h5py.File(h5_file_768, 'r') as mask_768:
        for i, (src_img_id, tgt_img_id) in enumerate(img_ids):
            msk = mask_768[src_img_id][:]
            # Resize to CROP_SIZE x CROP_SIZE; the `> 0.5` comparison makes the stored mask boolean.
            down_msk = resize(msk, (CROP_SIZE, CROP_SIZE), mode='reflect', anti_aliasing=False) > 0.5
            # The target file is opened in append mode and create_dataset fails
            # when the name is already present, so drop any mask left by an
            # earlier run first. This keeps the cell re-runnable.
            if tgt_img_id in mask_h5:
                del mask_h5[tgt_img_id]
            mask_h5.create_dataset(tgt_img_id, data=down_msk)

#             in_image = cv2.cvtColor(cv2.imread(os.path.join(train_chip_dir, tgt_img_id)), cv2.COLOR_BGR2RGB) / 255.
#             r0, g0, b0 = np.mean(in_image, axis=(0, 1))
#             r1, g1, b1 = np.std(in_image, axis=(0, 1))

#             n_pixels[i] = np.sum(down_msk)
#             r_mean[i] = r0
#             g_mean[i] = g0
#             b_mean[i] = b0
#             r_std[i] = r1
#             g_std[i] = g1
#             b_std[i] = b1

            # Progress line every 5000 masks and after the last one, with the
            # seconds elapsed since the previous progress line.
            if i == n_ships - 1 or (i + 1) % 5000 == 0:
                print(i, tgt_img_id, time.time() - t1)
                t1 = time.time()

4999 1eb76f320.jpg 13.589272022247314
9999 3dfa5f509.jpg 14.010015726089478
14999 5bebe4366.jpg 13.560949802398682
19999 7a32c2645.jpg 13.81346869468689
24999 99139f48e.jpg 13.56414270401001
29999 b79699dcc.jpg 14.367603778839111
34999 d60637a3a.jpg 14.468406915664673
39999 f4213aeac.jpg 13.112466812133789
41987 fffdd2377.jpg 6.40229606628418


In [44]:
# Target file names come from the (source, target) pairs in img_ids. The only
# assignments of tgt_img_ids for the ship table are commented out, so using
# that name directly fails on a fresh kernel or picks up a stale list of a
# different length from the chip cell.
tgt_img_ids = [tgt_img_id for _, tgt_img_id in img_ids]
ships_df['ImageId2'] = pd.Series(tgt_img_ids, index=ships_df.index)

# Each dict is keyed by row position; building the Series on ships_df.index
# leaves NaN wherever a dict has no entry for a row (every row, while the
# dicts are empty).
for _stat_name, _stat_values in (
    ('r_mean', r_mean),
    ('g_mean', g_mean),
    ('b_mean', b_mean),
    ('r_std', r_std),
    ('g_std', g_std),
    ('b_std', b_std),
    ('n_pixels', n_pixels),
):
    ships_df[_stat_name] = pd.Series(_stat_values, index=ships_df.index)

In [45]:
# Replace the source id column with the target ids stored in 'ImageId2'.
# Guarded on 'ImageId2' so a second run of this cell is a no-op: unguarded,
# it would drop the freshly renamed 'ImageId' column and then rename nothing,
# silently losing the ids.
if 'ImageId2' in ships_df:
    ships_df = ships_df.drop(columns=['ImageId'])
    # index=str also maps every row label through str().
    ships_df = ships_df.rename(index=str, columns={"ImageId2": "ImageId"})

In [46]:
# Write the downsampled ship table, without the row index, to train_ships_file.
ships_df.to_csv(
    train_ships_file,
    index=False,
)

### Downsample image chips

In [5]:
# Chip sizes whose masks are downsampled to CROP_SIZE by the loop below.
# A lookup with an empty default keeps chip_sizes defined for every CROP_SIZE.
# The if/elif chain matched none of its branches for the configured value 768,
# which left the name undefined or stale from an earlier run.
chip_sizes = {
    96: [192, 384],
    192: [384],
    384: [],
}.get(CROP_SIZE, [])

In [6]:
# For every source chip size: read its table and mask store, then write each
# mask, resized to CROP_SIZE, into the target store under a name suffixed
# with the source chip size.
with h5py.File(h5_file, 'a') as tgt_h5:
    for chip_size in chip_sizes:
        # Reset per chip size. NOTE: the statements that would fill these
        # dicts are commented out in the loop below, so they stay empty.
        n_pixels = defaultdict(int)
        r_mean = defaultdict(float)
        g_mean = defaultdict(float)
        b_mean = defaultdict(float)
        r_std = defaultdict(float)
        g_std = defaultdict(float)
        b_std = defaultdict(float)

        chip_file_name = os.path.join(ship_dir, f"{MASK_METHOD}_{chip_size}.csv")
        chip_df = pd.read_csv(chip_file_name)
        src_img_ids = chip_df['ImageId'].values
        tgt_img_ids = [img_id.replace(".jpg", f"_{chip_size}.jpg") for img_id in src_img_ids]
        img_ids = list(zip(src_img_ids, tgt_img_ids))
#         with ThreadPoolExecutor(16) as e: e.map(downsample_img, img_ids)
#         with ThreadPoolExecutor(16) as e: e.map(downsample_msk, img_ids)
        src_h5_file = os.path.join(ship_dir, f"{MASK_METHOD}_{chip_size}.h5")
        with h5py.File(src_h5_file, 'r') as src_h5:
            t1 = time.time()
            n_chips = len(chip_df)
            for i, (src_img_id, tgt_img_id) in enumerate(img_ids):
                msk = src_h5[src_img_id][:]
                # Resize to CROP_SIZE x CROP_SIZE; the `> 0.5` comparison makes the stored mask boolean.
                down_msk = resize(msk, (CROP_SIZE, CROP_SIZE), mode='reflect', anti_aliasing=False) > 0.5
                # The target file is opened in append mode and create_dataset
                # fails when the name is already present, so drop any mask left
                # by an earlier run first. This keeps the cell re-runnable.
                if tgt_img_id in tgt_h5:
                    del tgt_h5[tgt_img_id]
                tgt_h5.create_dataset(tgt_img_id, data=down_msk)

#                 in_image = cv2.cvtColor(cv2.imread(os.path.join(train_chip_dir, tgt_img_id)), cv2.COLOR_BGR2RGB) / 255.
#                 r0, g0, b0 = np.mean(in_image, axis=(0, 1))
#                 r1, g1, b1 = np.std(in_image, axis=(0, 1))

#                 n_pixels[i] = np.sum(down_msk)
#                 r_mean[i] = r0
#                 g_mean[i] = g0
#                 b_mean[i] = b0
#                 r_std[i] = r1
#                 g_std[i] = g1
#                 b_std[i] = b1

                # Progress line every 1000 masks and after the last one, with
                # the seconds elapsed since the previous progress line.
                if i == n_chips - 1 or (i + 1) % 1000 == 0:
                    print(i, tgt_img_id, time.time() - t1)
                    t1 = time.time()

#         chips_df = pd.DataFrame({'ImageId': pd.Series(tgt_img_ids)})
#         chips_df['r_mean'] = chip_df['r_mean']
#         chips_df['g_mean'] = chip_df['g_mean']
#         chips_df['b_mean'] = chip_df['b_mean']
#         chips_df['r_std'] = chip_df['r_std']
#         chips_df['g_std'] = chip_df['g_std']
#         chips_df['b_std'] = chip_df['b_std']
#         chips_df['counts'] = chip_df['counts']
#         chips_df['n_pixels'] = pd.Series(n_pixels)
#         if 'ClusterId' in chip_df:
#             chips_df['ClusterId'] = chip_df['ClusterId']

#         train_chips_file = os.path.join(ship_dir, f"{MASK_METHOD}_{chip_size}_down_{CROP_SIZE}.csv")
#         chips_df.to_csv(train_chips_file, index=False)

999 0aa980023_1_192.jpg 1.8720440864562988
1999 1661ed73e_1_192.jpg 2.006088972091675
2999 242207574_3_192.jpg 2.0502736568450928
3999 346307d01_1_192.jpg 1.8277504444122314
4999 45f01f7ca_1_192.jpg 1.810166358947754
5999 5d8ce8ab9_7_192.jpg 1.5730881690979004
6999 7a119fa4c_1_192.jpg 1.8473634719848633
7999 a453f5a1d_1_192.jpg 1.8479399681091309
8999 ef9bd66b9_1_192.jpg 2.0708322525024414
9158 ffbbdef34_7_192.jpg 0.3400712013244629
999 1759957b1_2_384.jpg 1.9918549060821533
1999 30d934a6e_4_384.jpg 2.1106560230255127
2999 4cdd94d31_3_384.jpg 1.9560949802398682
3999 6e71d4259_6_384.jpg 1.8878164291381836
4999 935d1bfc4_1_384.jpg 1.8200135231018066
5999 be1720fc4_1_384.jpg 1.962536334991455
6999 ee94d427e_5_384.jpg 1.9616806507110596
7348 ffdfe7893_1_384.jpg 0.6404290199279785


In [27]:
# Last five rows of chip_df, the table most recently read from a chip CSV.
chip_df.tail(5)

Unnamed: 0,ImageId0,counts,ImageId,r_mean,g_mean,b_mean,r_std,g_std,b_std,x_min,x_max,y_min,y_max,height,width,area,n_pixels,ship_length,pixel_weight,ClusterId_100
7344,ffbbdef34.jpg,7,ffbbdef34_5.jpg,82.982598,86.722955,54.792643,10.435363,13.182183,20.522512,177,207,168,216,49,31,1519,645,57,1.115417,51
7345,ffbbdef34.jpg,7,ffbbdef34_7.jpg,90.440864,100.636325,76.932075,14.954044,20.820246,34.345033,161,224,181,204,24,64,1536,1326,68,2.293091,77
7346,ffc2a6de6.jpg,1,ffc2a6de6_1.jpg,49.531264,40.106533,24.312832,37.234598,37.122349,36.833301,95,290,167,217,51,196,9996,7563,202,13.078918,12
7347,ffd9998b5.jpg,4,ffd9998b5_2.jpg,52.816481,45.787238,15.134535,13.992977,15.962718,18.974591,190,195,186,199,14,6,84,66,15,0.114136,18
7348,ffdfe7893.jpg,4,ffdfe7893_1.jpg,47.640408,40.34592,11.588203,7.129896,7.665147,8.761654,148,237,176,208,33,90,2970,1779,95,3.076477,40


In [28]:
# Last five rows of chips_df.
# NOTE(review): the only visible assignment of chips_df is in commented-out
# code, so this fails on a fresh kernel - confirm where it should come from.
chips_df.tail(5)

Unnamed: 0,ImageId,r_mean,g_mean,b_mean,r_std,g_std,b_std,counts,n_pixels,ClusterId_100
7344,ffbbdef34_5_384.jpg,82.982598,86.722955,54.792643,10.435363,13.182183,20.522512,7,772,51
7345,ffbbdef34_7_384.jpg,90.440864,100.636325,76.932075,14.954044,20.820246,34.345033,7,942,77
7346,ffc2a6de6_1_384.jpg,49.531264,40.106533,24.312832,37.234598,37.122349,36.833301,1,1854,12
7347,ffd9998b5_2_384.jpg,52.816481,45.787238,15.134535,13.992977,15.962718,18.974591,4,825,18
7348,ffdfe7893_1_384.jpg,47.640408,40.34592,11.588203,7.129896,7.665147,8.761654,4,422,40
