# 1. Data loading
https://www.kaggle.com/ihelon/hubmap-exploratory-data-analysis

In [1]:
import os, shutil
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn
import cv2
import tifffile
from PIL import Image
from sklearn.model_selection import train_test_split

In [15]:
# Global variables
BASE_PATH = "data/"
TRAIN_PATH = os.path.join(BASE_PATH, "train")
WIDTH = 350
HEIGHT = 350
STRIDE = 200
# Create pathes to folders with images and delete everything in there
base_path_image_has_FTU = 'data/my_data/FTU/images/'    
base_path_mask_has_FTU = 'data/my_data/FTU/masks/'
base_path_no_FTU = 'data/my_data/no_FTU/'
#Datbases for images
images =[]
masks = []

print(os.listdir(BASE_PATH))

['.ipynb_checkpoints', 'HuBMAP-20-dataset_information.csv', 'my_data', 'sample_submission.csv', 'test', 'train', 'train.csv']


### Utility functions

In [3]:
# https://www.kaggle.com/paulorzp/rle-functions-run-lenght-encode-decode
def rle2mask(mask_rle, shape):
    '''
    mask_rle: run-length as string formated (start length)
    shape: (width,height) of array to return 
    Returns numpy array, 1 - mask, 0 - background

    '''
    s = mask_rle.split()
    starts, lengths = [
        np.asarray(x, dtype=int) for x in (s[0:][::2], s[1:][::2])
    ]
    starts -= 1
    ends = starts + lengths
    img = np.zeros(shape[0] * shape[1], dtype=np.uint8)
    for lo, hi in zip(starts, ends):
        img[lo : hi] = 1
    return img.reshape(shape).T


def read_image(image_id, scale=None, verbose=1):
    image = tifffile.imread(
        os.path.join(BASE_PATH, f"train/{image_id}.tiff")
    )
    if len(image.shape) == 5:
        image = image.squeeze().transpose(1, 2, 0)
    
    mask = rle2mask(
        df_train[df_train["id"] == image_id]["encoding"].values[0], 
        (image.shape[1], image.shape[0])
    )
    
    if verbose:
        print(f"[{image_id}] Image shape: {image.shape}")
        print(f"[{image_id}] Mask shape: {mask.shape}")
    
    if scale:
        new_size = (image.shape[1] // scale, image.shape[0] // scale)
        image = cv2.resize(image, new_size)
        mask = cv2.resize(mask, new_size)
        
        if verbose:
            print(f"[{image_id}] Resized Image shape: {image.shape}")
            print(f"[{image_id}] Resized Mask shape: {mask.shape}")
        
    return image, mask


def read_test_image(image_id, scale=None, verbose=1):
    image = tifffile.imread(
        os.path.join(BASE_PATH, f"test/{image_id}.tiff")
    )
    if len(image.shape) == 5:
        image = image.squeeze().transpose(1, 2, 0)
    
    if verbose:
        print(f"[{image_id}] Image shape: {image.shape}")
    
    if scale:
        new_size = (image.shape[1] // scale, image.shape[0] // scale)
        image = cv2.resize(image, new_size)
        
        if verbose:
            print(f"[{image_id}] Resized Image shape: {image.shape}")
        
    return image

def plot_image_and_mask(image, mask, image_id):
    plt.figure(figsize=(16, 10))
    
    plt.subplot(1, 3, 1)
    plt.imshow(image)
    plt.title(f"Image {image_id}", fontsize=18)
    
    plt.subplot(1, 3, 2)
    plt.imshow(image)
    plt.imshow(mask, cmap="hot", alpha=0.5)
    plt.title(f"Image {image_id} + mask", fontsize=18)    
    
    plt.subplot(1, 3, 3)
    plt.imshow(mask, cmap="hot")
    plt.title(f"Mask", fontsize=18)    
    
    plt.show()
    
    
def plot_grid_image_with_mask(image, mask):
    plt.figure(figsize=(16, 16))
    n_cols = 4
    n_rows = 4
    col_start = 0
    col_w = 2500
    row_start = 0
    row_w = 2500
    for i in range(n_cols):
        for j in range(n_rows):
            plt.subplot(n_cols, n_rows, n_rows * i + j + 1)
            sub_image = image[
                col_start + i * col_w : col_start + (i + 1) * col_w, 
                row_start + j * row_w : row_start + (j + 1) * row_w, 
                :
            ]
            sub_mask = mask[
                col_start + i * col_w : col_start + (i + 1) * col_w, 
                row_start + j * row_w : row_start + (j + 1) * row_w, 
            ]
            plt.imshow(sub_image)
            plt.imshow(sub_mask, cmap="hot", alpha=0.5)
            plt.axis("off")
    plt.show()
    

def plot_slice_image_and_mask(image, mask, start_h, end_h, start_w, end_w):
    plt.figure(figsize=(16, 5))
    
    sub_image = image[start_h:end_h, start_w:end_w, :]
    sub_mask = mask[start_h:end_h, start_w:end_w]
    
    plt.subplot(1, 3, 1)
    plt.imshow(sub_image)
    plt.axis("off")
    
    plt.subplot(1, 3, 2)
    plt.imshow(sub_image)
    plt.imshow(sub_mask, cmap="hot", alpha=0.5)
    plt.axis("off")
    
    plt.subplot(1, 3, 3)
    plt.imshow(sub_mask, cmap="hot")
    plt.axis("off")
    
    plt.show()

In [4]:
# Train df
df_train = pd.read_csv(
    os.path.join(BASE_PATH, "train.csv")
)
df_train['id']

0    2f6ecfcdf
1    aaa6a05cc
2    cb2d976f4
3    0486052bb
4    e79de561c
5    095bf7a1f
6    54f2eec69
7    1e2425f28
Name: id, dtype: object

In [5]:
image_ids = np.array(df_train['id'])

# 2. Saving images to files

** Basic idea: **
 1. Get 300x300 piece
 2. Get from this piece this number (sum of second cahnnel of a piece): *np.sum(image[tsart_h:end_h, start_w:end_w, 1]/255)/100000*
 3. If this number is higher than **0.2** and lower than **0.7** (this threshold can be modified) than this piece has part of a kidney. 
 4. If last point is True than sum over mask of this piece. If sum is more than **10** (this threshold can be modified) than save as FTU
     else save as no_FTU.

In [6]:
'''
    Deletes every file in the given folder.
    It's neeeded to clear my_data folders during development
'''
def delete_all_in_folder(folder):
    for filename in os.listdir(folder):
        file_path = os.path.join(folder, filename)
        try:
            if os.path.isfile(file_path) or os.path.islink(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except Exception as e:
            print('Failed to delete %s. Reason: %s' % (file_path, e))

In [7]:
"""
    Checks  wheter image has full-zero rows or cols
    
    INPUT:
        image(np.array): original image
        
    OUTPUT:
        True/False: wheter image has full-zero rows or cols
"""
def check_for_full_zero(image):
    if ((image[0,:,:] == 0).all()) | ((image[:,0,:] == 0).all()) | ((image[image.shape[0]-1,:,:] == 0).all()) | ((image[:,image.shape[1]-1,:] == 0).all()):
        return True
    return False

In [8]:
"""
    Slices and saves to files image and mask into pieces with given size and stride
    
    INPUT:
        image(np.array): full image of a kidney
        mask(np.array): full mask of a kidney
        stride(int): stride with which slices should be created (how big is a step between slices)
        slice_width(int): width of a single slice
        slice_height(int): height of a single slice
"""
def cut_and_save_image_and_mask_to_file(image, mask, slice_height=400, slice_width = 400, stride=200, save = False):
    num_h = int((image.shape[0] - slice_height)/stride)     #number of slices horizontally
    num_w = int((image.shape[1] - slice_width)/stride)      #number of slices vertically
    
    #Variables for databases
    zeros = np.zeros((HEIGHT, WIDTH)) 
    
    for i in range(0, num_h):
        for j in range(0, num_w):
            i_stride = i*stride    # i-th index for image slice with respect to stride
            j_stride = j*stride    # j-th index for image slice with respect to stride
            temp_image_slice = image[i_stride:i_stride+slice_height, j_stride:j_stride+slice_width, :]
            
            #Check temp_image_slice for a threshold
            first_threshold = np.sum(temp_image_slice[:,:,1]/255)/100000
            if ((first_threshold > 0.2) & (first_threshold < 0.7) & (not(check_for_full_zero(temp_image_slice)))):
                                
                has_FTU = False   #varieble to check wether slice has FTU or not
                #Check second threshold
                temp_mask_slice = mask[i_stride:i_stride+slice_height, j_stride:j_stride+slice_width]
                second_threshold = np.sum(temp_mask_slice)
                if (second_threshold > 50):
                    has_FTU = True
                    
                #Save slice
                if (has_FTU):
                    #Adding image to database
                    images.append(temp_image_slice)
                    masks.append(temp_mask_slice)
                    
                    if (save):
                        #Saving image to file
                        global index_FTU
                        temp_mask_slice = temp_mask_slice*255    #converting mask image from [0..1] encoding to [0..255]
                        slice_name = base_path_image_has_FTU + 'image_has_FTU_' + str(index_FTU) + ".jpeg"   
                        maks_name = base_path_mask_has_FTU + 'mask_has_FTU_' + str(index_FTU) + ".jpeg" 
                        Image.fromarray(temp_image_slice).save(slice_name)
                        Image.fromarray(temp_mask_slice).save(maks_name)
                        index_FTU+=1
                else:
                    if (save):
                        #Saving image to file
                        global index_no_FTU
                        slice_name = base_path_no_FTU + 'image_no_FTU_' + str(index_no_FTU) + ".jpeg"   
                        temp_mask_slice = temp_mask_slice*255    #converting mask image from [0..1] encoding to [0..255]
                        Image.fromarray(temp_image_slice).save(slice_name)
                        index_no_FTU+=1
                    
                    #Adding image to database
                    images.append(temp_image_slice)
                    masks.append(zeros)

In [13]:
'''
    Uses read_image and cut_and_save_image_and_mask_to_file functions to save images to files
'''
def get_images_cut_and_saved(slice_height=300, slice_width=300, stride=100, save = False):
    #If save is called than clean the saving directories
    if (save):
        delete_all_in_folder(base_path_image_has_FTU)
        delete_all_in_folder(base_path_mask_has_FTU)
        delete_all_in_folder(base_path_no_FTU)
    for image_id in image_ids:
        print(f'Working with image {image_id}.')
        image, mask = read_image(image_id)
        start_time_for_one_image = time.time()
        cut_and_save_image_and_mask_to_file(image, mask, slice_height, slice_width, stride, save = save)
        # Outputting how long it took to cut and save particular image
        print(f"Time taken to cut and save image {image_id}: {round((time.time() - start_time_for_one_image))}s" )
        print()

In [16]:
# Indexes of images in folders
index_no_FTU = 0
index_FTU = 0

get_images_cut_and_saved(WIDTH, HEIGHT, STRIDE, True)

Working with image 2f6ecfcdf.
[2f6ecfcdf] Image shape: (31278, 25794, 3)
[2f6ecfcdf] Mask shape: (31278, 25794)
Time taken to cut and save image 2f6ecfcdf: 49s

Working with image aaa6a05cc.
[aaa6a05cc] Image shape: (18484, 13013, 3)
[aaa6a05cc] Mask shape: (18484, 13013)
Time taken to cut and save image aaa6a05cc: 19s

Working with image cb2d976f4.
[cb2d976f4] Image shape: (34940, 49548, 3)
[cb2d976f4] Mask shape: (34940, 49548)
Time taken to cut and save image cb2d976f4: 119s

Working with image 0486052bb.
[0486052bb] Image shape: (25784, 34937, 3)
[0486052bb] Mask shape: (25784, 34937)
Time taken to cut and save image 0486052bb: 65s

Working with image e79de561c.
[e79de561c] Image shape: (16180, 27020, 3)
[e79de561c] Mask shape: (16180, 27020)
Time taken to cut and save image e79de561c: 54s

Working with image 095bf7a1f.
[095bf7a1f] Image shape: (38160, 39000, 3)
[095bf7a1f] Mask shape: (38160, 39000)
Time taken to cut and save image 095bf7a1f: 118s

Working with image 54f2eec69.
[5

In [17]:
len(images)

73191

In [18]:
#Save a new database
images = np.array(images)
masks = np.array(masks)
with open('X.npy', 'wb') as f:
    np.save(f, images)
with open('Y.npy', 'wb') as q:
    np.save(q, masks)

MemoryError: Unable to allocate 25.1 GiB for an array with shape (73191, 350, 350, 3) and data type uint8

In [None]:
index_FTU