# Creating The Augmented Dataset (Saving it to NPY Files)
This script creates the dataset containing all images (original and augmented) and saves it to .npy files. These files are the ones used by the script that trains the classification model.

In [None]:
# General, System & support
from os import listdir
from os.path import isfile, join, exists
from tqdm import tqdm

# Computational
import numpy as np
import cv2 as cv
from matplotlib import pyplot as plt

from skimage import filters
from skimage import data, exposure, img_as_float

In [None]:
# Build two parallel lists holding the image and mask directories of each
# species. Folder names carry a zero-padded two-digit species number, e.g.
# 'augmented_data/fish_01' and 'augmented_data/mask_01'.

data_path = 'augmented_data/'
img_folder_name = 'fish_'
msk_folder_name = 'mask_'
num_species = 26  # species folders are numbered 01..num_species

image_folders = []
mask_folders = []

# No progress bar here: the loop is a handful of existence checks, and the
# prints below would interleave with the bar output.
for species_id in range(1, num_species + 1):

    suffix = str(species_id).zfill(2)
    img_name = data_path + img_folder_name + suffix
    msk_name = data_path + msk_folder_name + suffix

    # A species is kept only when BOTH of its folders exist, so that the two
    # lists stay aligned and every entry can be listed later on.
    # The path of the missing folder is printed so it can be investigated.
    if not exists(img_name):
        print(img_name)
    elif not exists(msk_name):
        print(msk_name)
    else:
        image_folders.append(img_name)
        mask_folders.append(msk_name)

print(image_folders)
print(mask_folders)

In [None]:
# Count the regular files found in all species image folders. The total is
# used to size the arrays that will hold the dataset.
num = 0
for folder in image_folders:
    # Only regular files are counted; sub-directories are ignored.
    num += sum(1 for f in listdir(folder) if isfile(join(folder, f)))

num

In [None]:
# Load every image/mask pair into one array and record the label of each
# sample.
#
# data_aug[k, :, :, 0:3] : image resized to 100x100 and divided by 255
#                          (channel order as returned by cv.imread)
# data_aug[k, :, :, 3]   : mask converted to grayscale, resized to 100x100
#                          and divided by 255
# species_aug[k, 0]      : 1-based position of the sample's folder in
#                          image_folders
# NOTE(review): the label is the position in image_folders, not the number in
# the folder name; the two differ if a species folder was skipped - confirm
# this is intended.
img_size = (100, 100)

# `num` holds the total image count at this point and fixes the array sizes.
data_aug = np.zeros((num, img_size[1], img_size[0], 4), dtype=np.float16)
species_aug = np.zeros((num, 1), dtype=int)

# From here on `num` is reused as the running write index into the arrays.
num = 0
# Iterating over the different folders (one for each species) that contain
# the images and masks
for i, (img_dir, msk_dir) in enumerate(zip(image_folders, mask_folders)):

    # listdir returns names in arbitrary order, so both listings are sorted to
    # make the index-based pairing deterministic.
    # Assumes an image and its mask sort to the same position (e.g. their file
    # names share a common suffix) - TODO confirm against the dataset.
    image_file_names = sorted(f for f in listdir(img_dir) if isfile(join(img_dir, f)))
    mask_file_names = sorted(f for f in listdir(msk_dir) if isfile(join(msk_dir, f)))

    # Fail early with a clear message instead of an IndexError mid-loop.
    if len(mask_file_names) < len(image_file_names):
        raise ValueError(
            "fewer masks than images: " + msk_dir + " has "
            + str(len(mask_file_names)) + " files, " + img_dir + " has "
            + str(len(image_file_names))
        )

    for image_file, mask_file in tqdm(list(zip(image_file_names, mask_file_names))):

        img_file_name = join(img_dir, image_file)
        mask_file_name = join(msk_dir, mask_file)

        # cv.imread signals an unreadable file by returning None rather than
        # raising, so the result is checked before it is used.
        img = cv.imread(img_file_name)
        if img is None:
            raise OSError("could not read image: " + img_file_name)

        mask_org = cv.imread(mask_file_name)
        if mask_org is None:
            raise OSError("could not read mask: " + mask_file_name)

        mask_org_gray = cv.cvtColor(mask_org, cv.COLOR_BGR2GRAY)

        mask_org_gray_resized = cv.resize(mask_org_gray, img_size) / 255.0
        img_resized = cv.resize(img, img_size) / 255.0

        data_aug[num, :, :, 0:3] = img_resized
        data_aug[num, :, :, 3] = mask_org_gray_resized

        species_aug[num, 0] = i + 1
        num += 1
        

# Confirming the number of images per class

In [None]:
# Histogram of the species labels with exactly one bin per integer label, so
# that each bar is the sample count of a single class.
labels = species_aug.ravel()
if labels.size:
    # Edges at k - 0.5 and k + 0.5 centre every bin on an integer label k.
    bin_edges = np.arange(labels.min() - 0.5, labels.max() + 1.5)
else:
    # Nothing to align to when there are no samples.
    bin_edges = 'auto'
plt.hist(labels, bins=bin_edges)
plt.show()

In [None]:
# Tabulate how many samples carry each species label: the first printed row
# lists the distinct labels, the second row their counts.
unique_elements, counts_elements = np.unique(species_aug, return_counts=True)
label_table = np.vstack((unique_elements, counts_elements))
print("Frequency of unique values of the species:")
print(label_table)

# Saving the images and labels to .npy files

In [None]:
# Write the image/mask array and the label array to disk, one .npy file each.
for npy_name, npy_array in (
    ('data_aug_nature.npy', data_aug),
    ('species_aug_nature.npy', species_aug),
):
    np.save(npy_name, npy_array)