**Introduction**

* In this notebook, we are going to make use of the K-Nearest Neighbours classifier on our dataset titled "The Nature Conservancy Fisheries Monitoring" that is available on Kaggle

**Importing the required libraries**

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

**Importing the ML and Deep Learning libraries**

In [None]:
import cv2
import os
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.utils.np_utils import to_categorical

**Defining a function to extract raw pixels from images**

In [None]:
# Extracts raw pixel array from images
def get_image_vector(image,size=(64,64)):
    return cv2.resize(image,size).flatten()

**Extracting the color histogram from images**

In [None]:
# Extracts the color histogram from images
def extract_color_histogram(image,bins = (8,8,8)):
    hsv = cv2.cvtColor(image,cv2.COLOR_BGR2HSV)
    hist = cv2.calcHist([hsv],[0,1,2],None,bins,[0,256,0,256,0,256])
    cv2.normalize(hist,hist)
    return hist.flatten()

**Checking the working directory**

In [None]:
pwd

'C:\\Users\\prash\\Desktop\\FinalProject'

**Setting the correct train and test directories to read the images**

In [None]:
train_dir = "train\\train\\"
test_dir = "test_stg1\\test_stg1\\"

**Obtaining the class names from the training directory containing the training set images**

In [None]:
classes = os.listdir(train_dir)

**Displaying the class names**

In [None]:
classes

['ALB', 'BET', 'DOL', 'LAG', 'NoF', 'OTHER', 'SHARK', 'YFT']

**Defining a for loop to obtain the labels associated with each of the training images in our dataset**

In [None]:
# Get the labels

labels = []
for c in classes:
    l = [c]*len(os.listdir(train_dir+c+'\\'))
    labels.extend(l)

**Displaying the corresponding labels**

In [None]:
labels

['ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',
 'ALB',


**Printing the length of the labels variable**

In [None]:
len(labels)

3777

**Defining a for loop to read the training image paths**

In [None]:
# Get the training data paths

image_path_list = []
for c in classes:
    fish_images = [train_dir+c+'\\'+item for item in os.listdir(train_dir+c+'\\')]
    image_path_list.extend(fish_images)

**Displaying the image path list**

In [None]:
image_path_list

['train\\train\\ALB\\img_00003.jpg',
 'train\\train\\ALB\\img_00010.jpg',
 'train\\train\\ALB\\img_00012.jpg',
 'train\\train\\ALB\\img_00015.jpg',
 'train\\train\\ALB\\img_00019.jpg',
 'train\\train\\ALB\\img_00020.jpg',
 'train\\train\\ALB\\img_00029.jpg',
 'train\\train\\ALB\\img_00032.jpg',
 'train\\train\\ALB\\img_00037.jpg',
 'train\\train\\ALB\\img_00038.jpg',
 'train\\train\\ALB\\img_00039.jpg',
 'train\\train\\ALB\\img_00041.jpg',
 'train\\train\\ALB\\img_00043.jpg',
 'train\\train\\ALB\\img_00045.jpg',
 'train\\train\\ALB\\img_00055.jpg',
 'train\\train\\ALB\\img_00057.jpg',
 'train\\train\\ALB\\img_00074.jpg',
 'train\\train\\ALB\\img_00085.jpg',
 'train\\train\\ALB\\img_00090.jpg',
 'train\\train\\ALB\\img_00097.jpg',
 'train\\train\\ALB\\img_00110.jpg',
 'train\\train\\ALB\\img_00121.jpg',
 'train\\train\\ALB\\img_00130.jpg',
 'train\\train\\ALB\\img_00134.jpg',
 'train\\train\\ALB\\img_00136.jpg',
 'train\\train\\ALB\\img_00154.jpg',
 'train\\train\\ALB\\img_00156.jpg',
 

**We then perform the encoding of the image labels by calling the LabelEncoder() function and calling the fit_transform method on our labels**

In [None]:
# Encode the labels

labels = LabelEncoder().fit_transform(labels)

**We then create a features variable comprising of the color histogram from images**

In [None]:
# Get the color histograms from the images

features = []
for i,image_path in enumerate(image_path_list):
    image = cv2.imread(image_path)
    hist = extract_color_histogram(image)
    features.append(hist)
    if(i%1000==0):
        print(str(i)+ "  completed")

0  completed
1000  completed
2000  completed
3000  completed


**Splitting our features into train and test set features, labels into train and test set labels, with a test size sample of 0.25 and a random state value set to 42**

In [None]:
X_train,X_test,y_train,y_test = train_test_split(features,labels,test_size = 0.25, random_state = 42)

**Calling the KNN classifier with number of neighbours as 2 as it gave us the best result after training our model for different values of K**

In [None]:
model = KNeighborsClassifier(n_neighbors = 2, n_jobs = -1)

**Fitting our KNN model to the train data**

In [None]:
model.fit(X_train,y_train)

KNeighborsClassifier(n_jobs=-1, n_neighbors=2)

**Calculating the prediction accuracy of our model**

In [None]:
accuracy = model.score(X_test,y_test)

**Displaying the accuracy**

In [None]:
print(accuracy)

0.9259259259259259


**Predicting the probability of the classification output on the test set data**

In [None]:
preds_validation = model.predict_proba(X_test)

**Importing the log_loss function from sklearn.metrics**

In [None]:
from sklearn.metrics import log_loss

**Calculating the log loss value**

In [None]:
log_loss(y_test,preds_validation)

1.510369068479778

**The Log Loss metric takes into account the probabilities underlying your models, and not only the final output of the classification. The bolder the probabilities, the better will be your Log Loss — closer to zero. It is a measure of uncertainty , so a low Log Loss means a low uncertainty/entropy of your model.**

**Reading the test set images from the image directory**

In [None]:
test_files = [im for im in os.listdir(test_dir)]

**Displaying the first entry in the test set image**

In [None]:
test_dir +'\\'+ test_files[0]

'test_stg1\\test_stg1\\\\img_00005.jpg'

**Reading 1000 test set images**

In [None]:
test_features = []
for i,image_path in enumerate(test_files):
    image = cv2.imread(test_dir + '\\'+ image_path)
    hist = extract_color_histogram(image)
    test_features.append(hist)
    if(i%1000==0):
        print(str(i)+ "  completed")

0  completed


**Calculating the prediction probability on the test set images and displaying it**

In [None]:
preds = model.predict_proba(test_features)
preds

array([[0. , 0. , 0. , ..., 0. , 0. , 1. ],
       [1. , 0. , 0. , ..., 0. , 0. , 0. ],
       [1. , 0. , 0. , ..., 0. , 0. , 0. ],
       ...,
       [1. , 0. , 0. , ..., 0. , 0. , 0. ],
       [1. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0.5, 0. , 0. , ..., 0.5, 0. , 0. ]])

**You can use the below code for creating a submission dataframe to upload it as a csv file for the Kaggle competition**

In [None]:
submission1 = pd.DataFrame(preds, columns= os.listdir("train\\train\\"))
submission1.insert(0, 'image', test_files)
submission1.head()

Unnamed: 0,image,ALB,BET,DOL,LAG,NoF,OTHER,SHARK,YFT
0,img_00005.jpg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,img_00007.jpg,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,img_00009.jpg,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,img_00018.jpg,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,img_00027.jpg,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.5


In [None]:
clipped_preds = np.clip(preds,(1-0.82)/7,0.82)

submission2 = pd.DataFrame(clipped_preds, columns= os.listdir("train\\train\\"))
submission2.insert(0, 'image', test_files)
submission2.head()

Unnamed: 0,image,ALB,BET,DOL,LAG,NoF,OTHER,SHARK,YFT
0,img_00005.jpg,0.025714,0.025714,0.025714,0.025714,0.025714,0.025714,0.025714,0.82
1,img_00007.jpg,0.82,0.025714,0.025714,0.025714,0.025714,0.025714,0.025714,0.025714
2,img_00009.jpg,0.82,0.025714,0.025714,0.025714,0.025714,0.025714,0.025714,0.025714
3,img_00018.jpg,0.82,0.025714,0.025714,0.025714,0.025714,0.025714,0.025714,0.025714
4,img_00027.jpg,0.5,0.025714,0.025714,0.025714,0.025714,0.025714,0.025714,0.5


In [None]:
submission2.to_csv("K_neighbors_submission.csv",index = False)

**Conclusion**

* We have made use of the KNN classifier to classify our fish images. We evaluated it using the test set images and displayed the prediction probabilites for it.

**License**

MIT License

Copyright (c) 2020 [ Prasham Shah, Priyanka Bandekar ]

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.