# Creating The Dataset (& Saving it to NPY Files)

In [None]:
# General, System & support
from os import listdir
from os.path import isfile, join
from tqdm import tqdm

# Computational
import numpy as np
import cv2 as cv
from matplotlib import pyplot as plt

from skimage import filters
from skimage import data, exposure, img_as_float

In [None]:
# Print the installed OpenCV (cv2) version so it can be checked for compatibility
print(cv.__version__)

In [None]:
# Build the lists of image and mask directories, one pair per species.
# Every class (numbered from 1 to 23) has its own subfolder whose name ends in the
# two-digit, zero-padded class number (e.g. data/fish_01 and data/mask_01).

data_path = 'data/'
img_folder_name = 'fish_'
msk_folder_name = 'mask_'

image_folders = []
mask_folders = []

for class_id in tqdm(range(1, 24)):
    # Pad single-digit class numbers to two characters: 1 -> '01', 23 -> '23'
    suffix = str(class_id).zfill(2)

    image_folders.append(data_path + img_folder_name + suffix)
    mask_folders.append(data_path + msk_folder_name + suffix)

print(image_folders)
print(mask_folders)

In [None]:
# Read every fish image together with its mask, store both in the array `data`,
# and store the class number of each sample in the array `species`.
# The dataset has 27,370 images
data = np.zeros((27370,100,100,4), dtype = np.float16)
species = np.zeros((27370,1), dtype = int)

# Index of the next free row in `data` / `species` (also the count of samples stored so far)
num = 0

# Iterating over the different folders (one for each species) that contain the images and masks
for i, (image_folder, mask_folder) in enumerate(zip(image_folders, mask_folders)):

    # listdir returns names in arbitrary order, so both listings are sorted to make
    # entry j of the images line up with entry j of the masks.
    # NOTE(review): assumes image and mask file names sort into the same order - confirm against the dataset naming.
    image_file_names = sorted(f for f in listdir(image_folder) if isfile(join(image_folder, f)))
    mask_file_names = sorted(f for f in listdir(mask_folder) if isfile(join(mask_folder, f)))

    # Iterating and reading the images in the subfolder (then storing them in the array data)
    for j in tqdm(range(len(image_file_names))):

        img_file_name = join(image_folder, image_file_names[j])
        mask_file_name = join(mask_folder, mask_file_names[j])

        img = cv.imread(img_file_name)
        mask_org = cv.imread(mask_file_name)

        # OpenCV does not raise when a file cannot be read; imread returns None instead.
        # Check before the arrays are used, and skip the sample so the rows stored so far
        # stay contiguous (rows past the final `num` are then left as zeros).
        if img is None or mask_org is None:
            print("error")
            continue

        # The mask is reduced to a single channel and scaled to the range [0, 1]
        mask_org_gray = cv.cvtColor(mask_org, cv.COLOR_BGR2GRAY)
        mask_org_gray_resized = cv.resize(mask_org_gray, (100,100)) / 255.0

        # Resizing the images to the size required by the network (100 x 100)
        img_resized = cv.resize(img, (100,100)) / 255.0

        # The network accepts 100 x 100 x 4 arrays
        # (the first three channels are the colour channels as read by cv.imread and the last channel is the mask)
        data[num, :, :, 0:3] = img_resized
        data[num, :, :, 3] = mask_org_gray_resized

        # Classes are numbered from 1, while the folder index i starts at 0
        species[num,0] = i+1
        num +=1

# Number of samples actually stored
print(num)

In [None]:
# Persist both arrays as npy files (npy files make loading/saving the data fast, compact, and efficient)
for npy_file_name, array_to_save in (('data.npy', data), ('species.npy', species)):
    np.save(npy_file_name, array_to_save)