# Images to .npy
Since Colab runs slowly through folders of images due to Google security checks, this is a separate notebook that preprocesses the Data Science Bowl 2018 images and outputs .npy files containing the image datasets.

In [None]:
import os
import sys
import random
import warnings

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from tqdm import tqdm
from itertools import chain
from skimage.io import imread, imshow, imread_collection, concatenate_images
from skimage.transform import resize
from skimage.util import crop, pad
from skimage.morphology import label

# Side length (pixels) and channel count of every stored image.
IMG_SIZE, IMG_CHANNELS = 256, 3

# Dataset folders, given relative to the notebook's working directory.
TRAIN_PATH, TEST_PATH = 'stage1_train/', 'stage2_test/'

# Hide UserWarnings raised from skimage modules.
warnings.filterwarnings(action='ignore', category=UserWarning, module='skimage')

In [None]:
# Each sample lives in its own sub-folder named after the image id.
# - Sorting makes the id order (and so the row order of the arrays built from it)
#   reproducible across runs and machines.
# - os.listdir raises FileNotFoundError for a missing dataset folder, instead of the
#   bare StopIteration that next(os.walk(...)) produces.
train_ids = sorted(name for name in os.listdir(TRAIN_PATH)
                   if os.path.isdir(os.path.join(TRAIN_PATH, name)))
test_ids = sorted(name for name in os.listdir(TEST_PATH)
                  if os.path.isdir(os.path.join(TEST_PATH, name)))

In [None]:
# Number of training sample folders found under TRAIN_PATH.
len(train_ids)

# Load in and Crop the Images
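Note: cropping the images is superior to resizing for object detection in the training set, so each training image and its masks are cropped to the top-left `IMG_SIZE` × `IMG_SIZE` region. Test images are resized instead.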

In [None]:
%%time
X_train = np.zeros((len(train_ids), IMG_SIZE, IMG_SIZE, IMG_CHANNELS), dtype=np.uint8)
Y_train = np.zeros((len(train_ids), IMG_SIZE, IMG_SIZE, 1), dtype=np.bool)
print('Loading and cropping train images and masks')
sys.stdout.flush()
for n, image_id in tqdm(enumerate(train_ids), total=len(train_ids)):
    path = TRAIN_PATH + image_id
    img = imread(path + '/images/' + image_id  + '.png')[0:IMG_SIZE,0:IMG_SIZE,:IMG_CHANNELS]
    X_train[n] = img
    mask = np.zeros((IMG_SIZE, IMG_SIZE, 1), dtype=np.bool)
    for mask_file in next(os.walk(path + '/masks/'))[2]:
        mask_ = imread(path + '/masks/' + mask_file)[0:IMG_SIZE,0:IMG_SIZE]
        mask_ = np.expand_dims(resize(mask_, (IMG_SIZE, IMG_SIZE), mode='constant', 
                                      preserve_range=True), axis=-1)
        mask = np.maximum(mask, mask_)
        Y_train[n] = mask

# Build the test array. Test images are resized (not cropped) to IMG_SIZE x IMG_SIZE.
# sizes_test gets exactly one [height, width] entry per id, so it stays index-aligned
# with test_ids and X_test even when an image cannot be loaded. In that case the entry
# is [0, 0] and the X_test row stays all zeros.
X_test = np.zeros((len(test_ids), IMG_SIZE, IMG_SIZE, IMG_CHANNELS), dtype=np.uint8)
sizes_test = []
print('Loading test images')
sys.stdout.flush()
for n, image_id in tqdm(enumerate(test_ids), total=len(test_ids)):
    path = TEST_PATH + image_id
    size = [0, 0]
    try:
        img = imread(path + '/images/' + image_id + '.png')
        if img.ndim == 2:
            # Grayscale image: add a channel axis. The single channel is broadcast
            # across all IMG_CHANNELS when assigned into X_test below.
            img = img[:, :, np.newaxis]
        img = img[:, :, :IMG_CHANNELS]
        size = [img.shape[0], img.shape[1]]
        img = resize(img, (IMG_SIZE, IMG_SIZE), mode='constant', preserve_range=True)
        X_test[n] = img
    except Exception as exc:
        # Best effort: report the failure and continue. Catching Exception (not a bare
        # except) lets KeyboardInterrupt still stop the loop.
        print(" Problem with: " + path + " (" + repr(exc) + ")")
    sizes_test.append(size)


# Check Training Data

In [None]:
# Spot-check one random training sample: show the image, then its merged mask.
# randrange excludes the upper bound. randint(0, len(train_ids)) includes it, so it
# could return len(train_ids) and index one past the end of X_train.
x = random.randrange(len(train_ids))
imshow(X_train[x])
plt.show()
imshow(np.squeeze(Y_train[x]))
plt.show()

# Save into .npy

In [None]:
# Saving is disabled as written: every np.save call below is commented out.
# Uncomment the lines for the arrays you want to write; the file names are relative,
# so they are resolved against the current working directory.
# np.save('DSB_X_Train_256_Clean_Crop.npy', X_train)
# np.save('DSB_X_Testv2_256.npy', X_test)
# np.save('DSB_Y_Train_256_Clean_Crop.npy', Y_train)
# np.save('DSB_Train_ids.npy', train_ids)
# np.save('DSB_Testv2_ids_256.npy', test_ids)
# np.save('DSB_Testv2_Sizes_256.npy', sizes_test)