# Fish Classification Code
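This notebook loads the `data.npy` and `data_aug.npy` files (the original and the augmented image datasets) together with their label files `species.npy` and `species_aug.npy`, which are used to train and test the models. The files are expected in the `data/` subfolder.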

This code should be run on a GPU (it was implemented in the datahub - py37-tf2 kernel).
It first implements the original model proposed by Rathi et al. (https://ieeexplore.ieee.org/document/8593044), and then implements our improved model in the section that follows.

In [None]:
import tensorflow as tf 
import cv2 as cv 
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd 
import seaborn as sb

from tensorflow.keras import backend as K
from tensorflow.keras import datasets, layers, models

import os
import time
os.environ["KERAS_BACKEND"] = "plaidml.keras.backend"

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Support Packages
from tqdm import tqdm
from os import listdir
from os.path import isfile, join

## Import data

In [None]:
# For The Data Hub
# Loading the dataset from the npy files.
# Set USE_AUGMENTED to False to train and test on the original dataset only.
# The choice is made BEFORE the label conversion below, so both configurations
# end up with the same uint8, shifted-by-one labels.
USE_AUGMENTED = True

x_train_orig = np.load('data/data.npy')
labels_orig = np.load('data/species.npy')

if USE_AUGMENTED:
    x_train_aug = np.load('data/data_aug.npy')
    labels_aug = np.load('data/species_aug.npy')

    # Stack the original and augmented samples (and their labels) along the sample axis
    x_train = np.concatenate((x_train_orig, x_train_aug), axis=0)
    labels = np.concatenate((labels_orig, labels_aug), axis=0)

    # Deleting extra variables to free up memory
    del x_train_aug, labels_aug
else:
    x_train = x_train_orig
    labels = labels_orig

# Convert the labels to integers and shift them down by one
# (the stored species ids are presumably 1-based -- TODO confirm), so they can
# be used as class indices with the sparse categorical cross-entropy loss.
labels = labels.astype(np.uint8)
labels = labels - 1
print(labels.dtype)

# Deleting extra variables to free up memory
del x_train_orig, labels_orig

## Creating the training, validation, and testing datasets

In [None]:
# Splitting the data into training, validation, and testing sets in two stages
# (roughly 64% / 16% / 20% of all samples).
# NOTE(review): the split is random over the combined array; if data_aug.npy holds
# augmented copies of images from data.npy, variants of the same image can end up in
# both the training and the testing sets -- confirm whether this is intended.

# Stage 1: hold out 20% of all samples as the testing set.
X_remaining, X_test, Y_remaining, Y_test = train_test_split(
    x_train, labels, train_size=0.8, random_state=42, shuffle=True
)

# Stage 2: hold out 20% of the remaining samples as the validation set.
X_train, X_validation, Y_train, Y_validation = train_test_split(
    X_remaining, Y_remaining, train_size=0.8, random_state=42, shuffle=True
)

# Deleting the intermediate and the unsplit arrays to free up memory
del X_remaining, Y_remaining
del labels, x_train

# Reproducing the original CNN 

This section reproduces the model described in Rathi et al. (https://ieeexplore.ieee.org/document/8593044). The initial dataset comes from: http://groups.inf.ed.ac.uk/f4k/GROUNDTRUTH/RECOG/. To prepare the data, scripts 1, 2, and 3 must be run before running this code.

In [None]:
# The architecture of the model:

def fish_classification_nn(input_shape=(100, 100, 4), num_classes=23):
    """Build the (uncompiled) CNN that reproduces the model of Rathi et al.

    The network stacks three Conv2D(5x5, 'same') + MaxPooling2D(5x5, 'same')
    blocks with 32, 64 and 32 filters, followed by two fully connected layers
    (200 and 100 units, each followed by 20% dropout) and a softmax output.

    Args:
        input_shape: Shape of one input sample as (height, width, channels).
            Defaults to (100, 100, 4), the shape this notebook was written for.
        num_classes: Number of units in the softmax output layer. Defaults to 23.

    Returns:
        A `tf.keras` Sequential model; the caller is responsible for compiling it.
    """
    return models.Sequential([
        # Feature extractor: with the default 100x100 input, each 5x5 pooling
        # (stride equal to the pool size) shrinks the feature map: 100 -> 20 -> 4 -> 1.
        layers.Conv2D(32, (5, 5), activation='relu', input_shape=input_shape, padding='same'),
        layers.MaxPooling2D((5, 5), padding='same'),

        layers.Conv2D(64, (5, 5), activation='relu', padding='same'),
        layers.MaxPooling2D((5, 5), padding='same'),

        layers.Conv2D(32, (5, 5), activation='relu', padding='same'),
        layers.MaxPooling2D((5, 5), padding='same'),

        # Classifier head
        layers.Flatten(),
        layers.Dense(200, activation='relu'),
        layers.Dropout(0.2),
        layers.Dense(100, activation='relu'),
        layers.Dropout(0.2),
        layers.Dense(num_classes, activation='softmax'),
    ])

# Generating the summary of the model
rathi_fish_class = fish_classification_nn()
rathi_fish_class.summary()

# Compile and train the model.
# The labels are integer class ids, hence the sparse categorical cross-entropy loss.
rathi_fish_class.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

rathi_fish_class_history = rathi_fish_class.fit(X_train, Y_train, validation_data=(X_validation, Y_validation),
                                                epochs=100, batch_size=10, shuffle=True)

# Plotting the accuracy and loss for the training and validation data (vs. EPOCH).
# Each panel sets its own title after plt.subplot(); no title is set on the bare
# figure, because that would implicitly create an extra full-figure axes which,
# depending on the matplotlib version, is either discarded again or left drawn
# underneath the two panels.
plt.figure(figsize=(15, 7))

# Left panel: accuracy
plt.subplot(121)
plt.plot(rathi_fish_class_history.history['val_accuracy'])
plt.plot(rathi_fish_class_history.history['accuracy'])
plt.ylabel('Accuracy', fontsize=14)
plt.xlabel('Epoch No.', fontsize=14)
plt.title('The Accuracy Vs. Epoch No.', fontsize=14)
plt.legend(('Validation Set Accuracy', 'Training Set Accuracy'), fontsize=14, loc=4)

# Right panel: loss
plt.subplot(122)
plt.plot(rathi_fish_class_history.history['val_loss'])
plt.plot(rathi_fish_class_history.history['loss'])
plt.ylabel('Loss', fontsize=14)
plt.xlabel('Epoch No.', fontsize=14)
plt.title('The Loss Vs. Epoch No.', fontsize=14)
plt.legend(('Validation Set Loss', 'Training Set Loss'), fontsize=14, loc=1)
plt.show()

## Evaluating the model on the testing dataset
We evaluate the model on the testing dataset and generate the normalized confusion matrix and the classification report (with precision, recall, f1-score, and support).

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

# Predicting the classes of the testing dataset.
# Sequential.predict_classes() is not available in newer TensorFlow releases;
# taking the argmax over the softmax outputs yields the same class ids.
Y_pred = np.argmax(rathi_fish_class.predict(X_test), axis=-1)

# Flatten the ground truth so the code works for labels of shape (N,) or (N, 1)
Y_true = np.ravel(Y_test)

# One id per unit of the network's 23-way softmax output
classes = list(range(23))

# Confusion matrix (rows: true class, columns: predicted class).
# Passing `labels` keeps the matrix 23x23 even when a class is absent from
# both the test labels and the predictions.
con_mat = confusion_matrix(Y_true, Y_pred, labels=classes)

# Normalize every row to the range 0..1: divide each row by the number of test
# samples of that true class, then round to 2 decimals. Rows of classes without
# any test sample are left at zero to avoid dividing by zero.
sum_of_classes = con_mat.sum(axis=1)
con_mat_norm = np.zeros(con_mat.shape)
np.divide(con_mat, sum_of_classes[:, None], out=con_mat_norm, where=sum_of_classes[:, None] != 0)
con_mat_norm = np.around(con_mat_norm, decimals=2)

con_mat_df = pd.DataFrame(con_mat_norm, index=classes, columns=classes)

figure = plt.figure(figsize=(15, 10))
sb.set(font_scale=1.3)
sb.heatmap(con_mat_df, annot=True, cmap='inferno', annot_kws={"size": 12}, square=True)
plt.ylabel('True Class')
plt.xlabel('Predicted Class')
# Called after the axis labels are set so that they are included in the layout
plt.tight_layout()
plt.show()

## Test set accuracy
# Fraction of test samples whose predicted class equals the true class
accuracy = np.mean(Y_true == Y_pred)
print("The Accuracy for Rathi's Model is: %.2f%%" %(accuracy*100))
print(classification_report(Y_true, Y_pred, digits=3))

# Improved model

This model attempts to improve upon the one used in Rathi et al. (2018) by adding dropout layers at 40% between the Conv2D and MaxPooling layers and by implementing only 2 Conv2D layers instead of 3. This is mainly to combat overfitting and to improve generalization and overall accuracy (as well as the accuracy across all classes).

In [None]:
def fish_classification_nn(input_shape=(100, 100, 4), num_classes=23):
    """Build the (uncompiled) improved fish-classification CNN.

    The network uses two Conv2D(5x5) blocks with 64 and 32 filters (default,
    i.e. unpadded, convolutions), each followed by 40% dropout and a
    MaxPooling2D(5x5, 'same') layer. The classifier head has two fully
    connected layers (200 and 100 units, each followed by 40% dropout) and a
    softmax output.

    Note: this definition reuses the name `fish_classification_nn`, so it
    replaces any earlier function of that name in the running session.

    Args:
        input_shape: Shape of one input sample as (height, width, channels).
            Defaults to (100, 100, 4), the shape this notebook was written for.
        num_classes: Number of units in the softmax output layer. Defaults to 23.

    Returns:
        A `tf.keras` Sequential model; the caller is responsible for compiling it.
    """
    return models.Sequential([
        # Feature extractor: dropout is applied to the convolution output
        # before it is pooled.
        layers.Conv2D(64, (5, 5), activation='relu', input_shape=input_shape),
        layers.Dropout(0.4),
        layers.MaxPooling2D((5, 5), padding='same'),

        layers.Conv2D(32, (5, 5), activation='relu'),
        layers.Dropout(0.4),
        layers.MaxPooling2D((5, 5), padding='same'),

        # Classifier head
        layers.Flatten(),
        layers.Dense(200, activation='relu'),
        layers.Dropout(0.4),
        layers.Dense(100, activation='relu'),
        layers.Dropout(0.4),
        layers.Dense(num_classes, activation='softmax'),
    ])


# Compile and train the model.
# The labels are integer class ids, hence the sparse categorical cross-entropy loss.
fish_classification = fish_classification_nn()
fish_classification.summary()

fish_classification.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
fish_classification_history = fish_classification.fit(X_train, Y_train, validation_data=(X_validation, Y_validation),
                                                      epochs=100, batch_size=50, shuffle=True)


# Plotting the accuracy and loss for the training and validation data (vs. EPOCH).
# Each panel sets its own title after plt.subplot(); no title is set on the bare
# figure, because that would implicitly create an extra full-figure axes which,
# depending on the matplotlib version, is either discarded again or left drawn
# underneath the two panels.
plt.figure(figsize=(15, 7))

# Left panel: accuracy
plt.subplot(121)
plt.plot(fish_classification_history.history['val_accuracy'])
plt.plot(fish_classification_history.history['accuracy'])
plt.ylabel('Accuracy', fontsize=14)
plt.xlabel('Epoch No.', fontsize=14)
plt.title('The Accuracy Vs. Epoch No.', fontsize=14)
plt.legend(('Validation Set Accuracy', 'Training Set Accuracy'), fontsize=14, loc=4)

# Right panel: loss
plt.subplot(122)
plt.plot(fish_classification_history.history['val_loss'])
plt.plot(fish_classification_history.history['loss'])
plt.ylabel('Loss', fontsize=14)
plt.xlabel('Epoch No.', fontsize=14)
plt.title('The Loss Vs. Epoch No.', fontsize=14)
plt.legend(('Validation Set Loss', 'Training Set Loss'), fontsize=14, loc=1)
plt.show()

## Evaluating the model on the testing dataset
We evaluate the model on the testing dataset and generate the normalized confusion matrix and the classification report (with precision, recall, f1-score, and support).

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

# Predicting the classes of the testing dataset.
# Sequential.predict_classes() is not available in newer TensorFlow releases;
# taking the argmax over the softmax outputs yields the same class ids.
Y_pred = np.argmax(fish_classification.predict(X_test), axis=-1)

# Flatten the ground truth so the code works for labels of shape (N,) or (N, 1)
Y_true = np.ravel(Y_test)

# One id per unit of the network's 23-way softmax output
classes = list(range(23))

# Confusion matrix (rows: true class, columns: predicted class).
# Passing `labels` keeps the matrix 23x23 even when a class is absent from
# both the test labels and the predictions.
con_mat = confusion_matrix(Y_true, Y_pred, labels=classes)

# Normalize every row to the range 0..1: divide each row by the number of test
# samples of that true class, then round to 2 decimals. Rows of classes without
# any test sample are left at zero to avoid dividing by zero.
sum_of_classes = con_mat.sum(axis=1)
con_mat_norm = np.zeros(con_mat.shape)
np.divide(con_mat, sum_of_classes[:, None], out=con_mat_norm, where=sum_of_classes[:, None] != 0)
con_mat_norm = np.around(con_mat_norm, decimals=2)

con_mat_df = pd.DataFrame(con_mat_norm, index=classes, columns=classes)

figure = plt.figure(figsize=(15, 10))
sb.set(font_scale=1.3)
sb.heatmap(con_mat_df, annot=True, cmap='inferno', annot_kws={"size": 12}, square=True)
plt.ylabel('True Class')
plt.xlabel('Predicted Class')
# Called after the axis labels are set so that they are included in the layout
plt.tight_layout()
plt.show()

## Test set accuracy
# Fraction of test samples whose predicted class equals the true class
accuracy = np.mean(Y_true == Y_pred)
print("The Accuracy for the Improved Model is: %.2f%%" %(accuracy*100))
print(classification_report(Y_true, Y_pred, digits=3))