# Creating The Augmented Dataset (Saving it to NPY Files)
This script will create the dataset that will contain all images (original and augmented) and save them to an .npy file. This file is the one that will be used in the script that train the classification model.

In [1]:
# General, System & support
from os import listdir
from os.path import isfile, join, exists
from tqdm import tqdm

# Computational
import numpy as np
import cv2 as cv
from matplotlib import pyplot as plt

from skimage import filters
from skimage import data, exposure, img_as_float

In [2]:
# Creating a list of the directories of the images and masks corresponding to the different species

data_path = 'augmented_data/'
img_folder_name = 'fish_'
msk_folder_name = 'mask_'

image_folders = []
mask_folders = []

for i in tqdm(range(26)):

    if i < 9:
      img_name = data_path + img_folder_name + '0' + str(i+1)
      msk_name = data_path + msk_folder_name + '0' + str(i+1)

    else:
      img_name = data_path + img_folder_name  + str(i+1)
      msk_name = data_path + msk_folder_name + str(i+1)

    # print(img_name)
    # print(msk_name)
    
    if exists(img_name):
        image_folders.append(img_name)
        mask_folders.append(msk_name)
    else:
        print(img_name)

print(image_folders)
print(mask_folders)

100%|█████████████████████████████████████████████████████████████████████████████████| 26/26 [00:00<00:00, 324.94it/s]

augmented_data/fish_20
['augmented_data/fish_01', 'augmented_data/fish_02', 'augmented_data/fish_03', 'augmented_data/fish_04', 'augmented_data/fish_05', 'augmented_data/fish_06', 'augmented_data/fish_07', 'augmented_data/fish_08', 'augmented_data/fish_09', 'augmented_data/fish_10', 'augmented_data/fish_11', 'augmented_data/fish_12', 'augmented_data/fish_13', 'augmented_data/fish_14', 'augmented_data/fish_15', 'augmented_data/fish_16', 'augmented_data/fish_17', 'augmented_data/fish_18', 'augmented_data/fish_19', 'augmented_data/fish_21', 'augmented_data/fish_22', 'augmented_data/fish_23', 'augmented_data/fish_24', 'augmented_data/fish_25', 'augmented_data/fish_26']
['augmented_data/mask_01', 'augmented_data/mask_02', 'augmented_data/mask_03', 'augmented_data/mask_04', 'augmented_data/mask_05', 'augmented_data/mask_06', 'augmented_data/mask_07', 'augmented_data/mask_08', 'augmented_data/mask_09', 'augmented_data/mask_10', 'augmented_data/mask_11', 'augmented_data/mask_12', 'augmented_da




In [7]:
# Making sure the original data and sugmented data have the same subdirectories and species aligned
i = 0 
for image_folder in image_folders:
    image_folder = image_folder.split('/')[1:]
    image_folders[i] = 'data/' + image_folder[0]
    i +=1

i = 0
for mask_folder in mask_folders:
    mask_folder = mask_folder.split('/')[1:]
    mask_folders[i] = 'data/' + mask_folder[0]
    i +=1

print(image_folders)
print(mask_folders)

['data/fish_01', 'data/fish_02', 'data/fish_03', 'data/fish_04', 'data/fish_05', 'data/fish_06', 'data/fish_07', 'data/fish_08', 'data/fish_09', 'data/fish_10', 'data/fish_11', 'data/fish_12', 'data/fish_13', 'data/fish_14', 'data/fish_15', 'data/fish_16', 'data/fish_17', 'data/fish_18', 'data/fish_19', 'data/fish_21', 'data/fish_22', 'data/fish_23', 'data/fish_24', 'data/fish_25', 'data/fish_26']
['data/mask_01', 'data/mask_02', 'data/mask_03', 'data/mask_04', 'data/mask_05', 'data/mask_06', 'data/mask_07', 'data/mask_08', 'data/mask_09', 'data/mask_10', 'data/mask_11', 'data/mask_12', 'data/mask_13', 'data/mask_14', 'data/mask_15', 'data/mask_16', 'data/mask_17', 'data/mask_18', 'data/mask_19', 'data/mask_21', 'data/mask_22', 'data/mask_23', 'data/mask_24', 'data/mask_25', 'data/mask_26']


In [8]:
image_folder

['fish_26']

In [9]:
num = 0
# Iterating over the different folders (one for each species) that contain the imges and masks
for i in range(len(image_folders)):
    
    image_file_names = [f for f in listdir(image_folders[i]) if isfile(join(image_folders[i], f))]
    mask_file_names = [f for f in listdir(mask_folders[i]) if isfile(join(mask_folders[i], f))]
    
    for j in tqdm(range(len(image_file_names))):
        num+=1

num

100%|████████████████████████████████████████████████████████████████████████████| 422/422 [00:00<00:00, 418142.28it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████| 128/128 [00:00<?, ?it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 88/88 [00:00<00:00, 88768.34it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 72/72 [00:00<00:00, 71953.75it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████| 74/74 [00:00<?, ?it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████| 204/204 [00:00<?, ?it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████| 41/41 [00:00<?, ?it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 42/42 [00:00<00:00, 41983.02it/s]
100%|███████████████████████████████████

1303

In [10]:
# Accessing fish images and masks then saving them to an array called data
data = np.zeros((num,100,100,4), dtype = np.float16)
species = np.zeros((num,1), dtype = int)

# error_vector = np.zeros(24971)
num = 0
# Iterating over the different folders (one for each species) that contain the imges and masks
for i in range(len(image_folders)):
    
    image_file_names = [f for f in listdir(image_folders[i]) if isfile(join(image_folders[i], f))]
    mask_file_names = [f for f in listdir(mask_folders[i]) if isfile(join(mask_folders[i], f))]
    
    for j in tqdm(range(len(image_file_names))):
        
        img_file_name = image_folders[i] + "/" + image_file_names[j]
        mask_file_name = mask_folders[i] + "/" + mask_file_names[j]

        img = cv.imread(img_file_name)
        
        mask_org = cv.imread(mask_file_name)
        mask_org_gray = cv.cvtColor(mask_org, cv.COLOR_BGR2GRAY)

        mask_org_gray_resized = cv.resize(mask_org_gray, (100,100)) / 255.0
        img_resized = cv.resize(img, (100,100)) / 255.0
        
        data[num, :, :, 0:3] = img_resized
        data[num, :, :, 3] = mask_org_gray_resized
        
        species[num,0] = i+1
        num +=1
        if img.any() == None:
            print("error")
        
        if mask_org.any() == None:
            print("error")
        

100%|████████████████████████████████████████████████████████████████████████████████| 422/422 [00:10<00:00, 41.36it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 128/128 [00:02<00:00, 44.00it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 88/88 [00:02<00:00, 38.66it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 72/72 [00:01<00:00, 39.41it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 74/74 [00:01<00:00, 44.61it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 204/204 [00:04<00:00, 47.17it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 41/41 [00:02<00:00, 17.93it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 42/42 [00:01<00:00, 35.52it/s]
100%|███████████████████████████████████

# Confirming the number of images per class

In [11]:
unique_elements, counts_elements = np.unique(species, return_counts=True)
print("Frequency of unique values of the species:")
print(np.asarray((unique_elements, counts_elements)))

# np.unique(species_aug)

Frequency of unique values of the species:
[[  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18
   19  20  21  22  23  24  25]
 [422 128  88  72  74 204  41  42   2   3  15  24  74   2   7   3   2   5
    4   6  14   3  25  31  12]]


# Saving The images to the npy file

In [12]:
np.save('data_nature.npy', data)
np.save('species_nature.npy', species)