** K-NN Classifier algorithm simply relies on the distance between feature vectors, much like building an image search engine — only this time, we have the labels associated with each image so we can predict and return an actual category for the image. **

**Simply put, the k-NN algorithm classifies unknown data points by finding the most common class among the k-closest examples. Each data point in the k closest examples casts a vote and the category with the most votes wins! **

In [1]:
# import the necessary packages
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import cv2
import os

from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import train_test_split
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import log_loss



In [2]:
# resize the image to a fixed size, then flatten the image into
# a list of raw pixel intensities
def image_to_feature_vector(image, size=(64, 64)):
    return cv2.resize(image, size).flatten()

** The image_to_feature_vector  method is an extremely naive function that simply takes an input image  and resizes it to a fixed width and height (size), and then flattens the RGB pixel intensities into a single list of numbers. **

** This means that our input image  will be shrunk to 64 x 64 pixels, and given three channels for each Red, Green, and Blue component respectively, our output “feature vector” will be a list of 64 x 64 x 3 = 12,288 numbers. **

In [3]:
# Extracts the color histogram from images
def extract_color_histogram(image,bins = (8,8,8)):
    hsv = cv2.cvtColor(image,cv2.COLOR_BGR2HSV)
    hist = cv2.calcHist([hsv],[0,1,2],None,bins,[0,256,0,256,0,256])
    cv2.normalize(hist,hist)
    return hist.flatten()

In [4]:
train_dir = "fish/train/"
test_dir = "fish/test_stg1/"

In [5]:
classes = os.listdir(train_dir)
classes

['BET', 'YFT', 'SHARK', 'LAG', 'NoF', '.DS_Store', 'DOL', 'ALB', 'OTHER']

In [6]:
del classes[5]
classes = sorted(classes)
classes

['ALB', 'BET', 'DOL', 'LAG', 'NoF', 'OTHER', 'SHARK', 'YFT']

In [7]:
# Get the training data paths
image_paths = []
for species in classes:
    image_paths.extend(train_dir + species + '/' + img for img in os.listdir(train_dir + species + '/'))
print('Images in Training Dataset:',len(image_paths))

Images in Training Dataset: 3777


In [8]:
# Get the labels
labels = []
for species in classes:
    l = [species]*len(os.listdir(train_dir + species + '/'))
    labels.extend(l)

In [9]:
# Encode the labels
labels = LabelEncoder().fit_transform(labels)

In [10]:
# Extracting the color histograms from the images
features = []
for i,image_path in enumerate(image_paths):
    image = cv2.imread(image_path)
    hist = extract_color_histogram(image)
    features.append(hist)
    if i == (len(image_paths)-1):
        print(str(i+1)+ "  completed")
    elif(i%1000==0):
        print(str(i)+ "  completed")

0  completed
1000  completed
2000  completed
3000  completed
3777  completed


### Partition the data into training and testing splits, using 80% of the data for training and the remaining 20% for testing.

In [11]:
X_train,X_test,y_train,y_test = train_test_split(features,labels,test_size = 0.20, random_state = 36)

In [67]:
'''log_loss_scorer = make_scorer(log_loss, greater_is_better = False, needs_proba = True)

In [68]:
'''grid_search = GridSearchCV(KNeighborsClassifier(), param_grid = {"n_neighbors":[1,2,3,4,5]}, cv = 5)

In [69]:
'''grid_search.fit(X_train,y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_neighbors': [1, 2, 3, 4, 5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [70]:
'''grid_search.score(X_test,y_test)

0.9484126984126984

In [71]:
'''print(grid_search.best_score_)

0.9347898047004303


In [72]:
'''print(grid_search.best_estimator_)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=1, p=2,
           weights='uniform')


In [73]:
'''print(grid_search.best_params_)

{'n_neighbors': 1}


In [19]:
# train and evaluate a k-NN classifer on the raw pixel intensities

print("[INFO] evaluating histogram accuracy...")
model = KNeighborsClassifier(n_neighbors = 3, n_jobs = -1)
model.fit(X_train,y_train)

[INFO] evaluating histogram accuracy...


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=-1, n_neighbors=3, p=2,
           weights='uniform')

In [20]:
accuracy = model.score(X_test,y_test)
print("[INFO] raw pixel accuracy: {:.2f}%".format(accuracy * 100))

[INFO] raw pixel accuracy: 90.87%


In [21]:
prediction = model.predict_proba(X_test)

In [22]:
from sklearn.metrics import log_loss
log_loss(y_test, prediction)

1.4961649870166667

In [23]:
test_image_name = []
test_image_name.extend(img for img in os.listdir(test_dir))

test_image_paths=[]
test_image_paths.extend(test_dir + img for img in os.listdir(test_dir))
print('Images in Test Dataset:',len(test_image_paths))

Images in Test Dataset: 1000


In [24]:
# Get the color histograms from the images
test_features = []
for i,image_path in enumerate(test_image_paths):
    image = cv2.imread(image_path)
    hist = extract_color_histogram(image)
    test_features.append(hist)
    if len(test_features) == 1000:
        print(str(i+1)+ "  completed")

1000  completed


In [25]:
prediction = model.predict_proba(test_features)

In [None]:
#predictions = grid_search.predict_proba(test_features)

In [26]:
submission1 = pd.DataFrame(prediction, columns= classes)
submission1.insert(0, 'image', test_image_paths)
submission1.head()

Unnamed: 0,image,ALB,BET,DOL,LAG,NoF,OTHER,SHARK,YFT
0,fish/test_stg1/img_00943.jpg,0.333333,0.0,0.0,0.0,0.666667,0.0,0.0,0.0
1,fish/test_stg1/img_05979.jpg,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,fish/test_stg1/img_03312.jpg,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,fish/test_stg1/img_05927.jpg,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,fish/test_stg1/img_05958.jpg,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [28]:
clipped_predictions = np.clip(prediction,(1-0.82)/7,0.82)

submission2 = pd.DataFrame(clipped_predictions, columns= classes)
submission2.insert(0, 'image', test_image_name)
submission2.head()

Unnamed: 0,image,ALB,BET,DOL,LAG,NoF,OTHER,SHARK,YFT
0,img_00943.jpg,0.333333,0.025714,0.025714,0.025714,0.666667,0.025714,0.025714,0.025714
1,img_05979.jpg,0.82,0.025714,0.025714,0.025714,0.025714,0.025714,0.025714,0.025714
2,img_03312.jpg,0.82,0.025714,0.025714,0.025714,0.025714,0.025714,0.025714,0.025714
3,img_05927.jpg,0.82,0.025714,0.025714,0.025714,0.025714,0.025714,0.025714,0.025714
4,img_05958.jpg,0.025714,0.025714,0.025714,0.025714,0.025714,0.82,0.025714,0.025714


In [29]:
submission2.to_csv("K_neighbors_submission.csv",index = False)