In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt

import multiprocessing
from multiprocessing.pool import ThreadPool

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report,accuracy_score
from skimage.feature import hog
from skimage import color
from sklearn.cluster import KMeans
import cv2

import os


In [None]:
# Directory holding the training data. `localPath` is not used in this cell;
# it is kept in case other cells rely on it.
dataPath = 'all'
localPath = ''

# The image file is read with encoding="bytes" and later indexed as images[:, 1],
# so it is presumably a pickled object array (TODO confirm against the data file).
# NumPy >= 1.16.3 refuses to load such files unless allow_pickle=True is explicit.
# SECURITY: unpickling can execute arbitrary code -- only load files you trust.
images = np.load(dataPath + '/train_images.npy', encoding="bytes", allow_pickle=True)
labels = pd.read_csv(dataPath + '/train_labels.csv')

In [None]:
# Turn column 1 of `images` into one DataFrame row per image and attach the
# category as a `label` column, so features and labels are split together.
allData = pd.DataFrame(np.array(list(images[:, 1]))).assign(label=labels['Category'])
num_classes = len(labels['Category'].unique())
valueCounts = labels['Category'].value_counts()

# Stratified 85/15 split. The fixed random_state makes the split -- and so
# every score computed from it -- reproducible across kernel restarts.
xTrain, xValid = train_test_split(
    allData,
    stratify=labels['Category'],
    test_size=0.15,
    random_state=42,
)
trainInds = xTrain.index
validInds = xValid.index

# Every row is reshaped into a 100x100 single-channel image.
xTrainRaw = xTrain.drop('label', axis=1).values.reshape((xTrain.shape[0], 100, 100, 1))
xValidRaw = xValid.drop('label', axis=1).values.reshape((xValid.shape[0], 100, 100, 1))

# Read the targets from the split frames themselves. The earlier
# labels.iloc[index, 1] lookup used index *labels* as *positions*, which is
# only correct while `labels` keeps its default 0..n-1 index.
yTrain = xTrain['label'].values
yValid = xValid['label'].values

In [None]:
# Encode the category values as integer class ids. The encoder is fitted on
# every distinct category in `labels`, then applied to both splits.
lb = LabelEncoder().fit(labels['Category'].unique())
yTrainFinal, yValidFinal = (lb.transform(targets) for targets in (yTrain, yValid))

In [None]:
def preProcessImage(image, cutoff=127, areaCutoff=14, maxContours=4, fliplr=False):
    """Keep only the pixels inside the bounding boxes of the largest bright blobs.

    The image is binarised at ``cutoff``, contours are extracted from the
    binary image, and the bounding rectangles of the biggest contours are
    combined into a mask. Pixels outside the mask are set to zero.

    Parameters
    ----------
    image : array-like
        Single-channel image; it is cast to ``uint8`` before processing.
    cutoff : int
        Threshold passed to ``cv2.threshold`` (binary thresholding, max value 255).
    areaCutoff : number
        A contour contributes to the mask only if its area is strictly larger
        than this value.
    maxContours : int
        Only the ``maxContours`` largest contours (by area) are considered.
    fliplr : bool
        If true, the masked image is mirrored left-to-right before returning.

    Returns
    -------
    numpy.ndarray
        ``uint8`` array with the same shape as the (cast) input image.
    """
    image = np.uint8(image)
    _, thresh = cv2.threshold(image, cutoff, 255, cv2.THRESH_BINARY)

    # findContours returns (image, contours, hierarchy) in OpenCV 3.x but
    # (contours, hierarchy) in 2.x and 4.x; the contour list is always the
    # second-to-last element, so index from the end instead of unpacking.
    contours = cv2.findContours(thresh, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)[-2]

    mask = np.zeros(image.shape, np.uint8)
    largest_contours = sorted(contours, key=cv2.contourArea, reverse=True)
    for contour in largest_contours[:maxContours]:
        if cv2.contourArea(contour) > areaCutoff:
            x, y, w, h = cv2.boundingRect(contour)
            mask[y:y+h, x:x+w] = 255

    filteredImage = cv2.bitwise_and(image, image, mask=mask)
    if fliplr:
        filteredImage = np.fliplr(filteredImage)
    # bitwise_and may return a different shape than the input, so restore it.
    return filteredImage.reshape(image.shape)

In [None]:
# Run preProcessImage over both splits on a thread pool with one worker per
# CPU, keeping each cleaned image as a flat 1-D row.
pool = ThreadPool(multiprocessing.cpu_count())


def _flatPreprocessed(im):
    """Preprocess one image with the default settings and flatten the result."""
    return preProcessImage(im).flatten()


# list(array) yields the images along the first axis, one per element.
xTrainUnflipped = np.array(pool.map(_flatPreprocessed, list(xTrainRaw)))
xValidUnflipped = np.array(pool.map(_flatPreprocessed, list(xValidRaw)))

pool.close()
pool.join()

**SIFT feature extraction and constructing BoVW (bag of visual words)**

In [None]:
def features(image, extractor):
    """Detect keypoints in ``image`` and compute their descriptors.

    Parameters
    ----------
    image : numpy.ndarray
        Image handed to ``extractor.detectAndCompute`` (no mask is used).
    extractor : object
        Feature extractor exposing ``detectAndCompute(image, mask)``.

    Returns
    -------
    tuple
        ``(keypoints, descriptors)`` exactly as returned by the extractor.
        Callers in this notebook treat ``descriptors is None`` as "no features".
    """
    return extractor.detectAndCompute(image, None)


# SIFT lives in the main cv2 module in newer OpenCV releases and in the
# contrib-only cv2.xfeatures2d module in older ones; use whichever exists.
if hasattr(cv2, 'SIFT_create'):
    extractor = cv2.SIFT_create()
else:
    extractor = cv2.xfeatures2d.SIFT_create()

# Number of visual words (k-means clusters) in the bag-of-visual-words vocabulary.
dict_size = 8000

# !pip install opencv-python==3.4.2.16    #for installation on google colab
# !pip install opencv-contrib-python==3.4.2.16

In [None]:
# Gather the SIFT descriptors of every preprocessed training image.
# desc_src_img[j] is the index of the image that produced descriptor row j.
desc_chunks = []
desc_src_img = []

for i in range(len(xTrainUnflipped)):
    im1 = xTrainUnflipped[i].reshape(100, 100)
    kp, dp = features(im1, extractor)
    if dp is not None:
        desc_chunks.append(dp)
        desc_src_img.extend([i] * len(dp))

# Stack once at the end: calling np.vstack inside the loop re-copies every
# descriptor collected so far on each iteration (quadratic in the total count).
if desc_chunks:
    descriptor_list = np.float32(np.vstack(desc_chunks))
else:
    descriptor_list = np.float32(np.array([]))


In [None]:
# One bag-of-visual-words histogram per training image: a (dict_size, 1) column
# of zeros whose k-th entry will count the image's descriptors assigned to
# cluster k. The number of histograms follows the size of the training split
# instead of a hard-coded 8500.
imgs_data = [np.zeros((dict_size, 1)) for _ in range(len(xTrainUnflipped))]

In [None]:
# Cluster the training descriptors into dict_size visual words.
# Termination: stop after 10 iterations or once the epsilon of 1.0 is reached.
criteria = (cv2.TERM_CRITERIA_EPS | cv2.TERM_CRITERIA_MAX_ITER, 10, 1.0)
flags = cv2.KMEANS_RANDOM_CENTERS

# NOTE: this rebinds `labels`, which until now held the DataFrame read from
# train_labels.csv; from here on it is the per-descriptor cluster index.
compactness, labels, centers = cv2.kmeans(
    data=descriptor_list,
    K=dict_size,
    bestLabels=None,
    criteria=criteria,
    attempts=1,
    flags=flags,
)

In [None]:
# Build the histograms: for every descriptor, bump the bin of its cluster
# in the histogram of the image the descriptor came from.
for img_id, word in zip(desc_src_img, labels.ravel()):
    imgs_data[img_id][word] += 1


In [None]:
# Stack the per-image (dict_size, 1) histograms into an
# (n_images, dict_size) feature matrix. The row count is taken from
# `imgs_data` rather than a hard-coded 8500.
# NOTE: this rebinds `xTrain`, which previously held the training DataFrame.
xTrain = np.asarray(imgs_data).reshape(len(imgs_data), dict_size)

In [None]:
# Gather the SIFT descriptors of every preprocessed validation image.
# desc_src_img[j] is the index of the image that produced descriptor row j.
# NOTE: `descriptor_list` and `desc_src_img` are reused from the training pass.
desc_chunks = []
desc_src_img = []

for i in range(len(xValidUnflipped)):
    im1 = xValidUnflipped[i].reshape(100, 100)
    kp, dp = features(im1, extractor)
    if dp is not None:
        desc_chunks.append(dp)
        desc_src_img.extend([i] * len(dp))

# Stack once at the end: calling np.vstack inside the loop re-copies every
# descriptor collected so far on each iteration (quadratic in the total count).
if desc_chunks:
    descriptor_list = np.float32(np.vstack(desc_chunks))
else:
    descriptor_list = np.float32(np.array([]))


In [None]:
# One zeroed (dict_size, 1) histogram per validation image. The count follows
# the size of the validation split instead of a hard-coded 1500.
imgs_data = [np.zeros((dict_size, 1)) for _ in range(len(xValidUnflipped))]

In [None]:
def _assignToVocabulary(descriptors, vocabulary, chunk=1024):
    """Assign each descriptor to its nearest vocabulary centre (Euclidean).

    Parameters
    ----------
    descriptors : numpy.ndarray
        (n, d) array of descriptors; an empty array yields empty labels.
    vocabulary : numpy.ndarray
        (k, d) array of cluster centres.
    chunk : int
        Descriptors are processed this many at a time so that the
        (chunk, k) distance matrix stays small.

    Returns
    -------
    tuple
        ``(compactness, labels)`` where ``labels`` is an (n, 1) int32 array of
        centre indices (same layout as the labels returned by cv2.kmeans) and
        ``compactness`` is the sum of squared distances to the assigned centres.
    """
    n_desc = len(descriptors)
    assigned = np.empty((n_desc, 1), dtype=np.int32)
    total_sq_dist = 0.0
    centre_sq_norms = np.einsum('ij,ij->i', vocabulary, vocabulary)
    for start in range(0, n_desc, chunk):
        block = descriptors[start:start + chunk]
        # ||x - c||^2 = ||x||^2 - 2 x.c + ||c||^2; the ||x||^2 term is constant
        # per row and therefore irrelevant for the argmin.
        scores = centre_sq_norms[None, :] - 2.0 * block.dot(vocabulary.T)
        nearest = scores.argmin(axis=1)
        assigned[start:start + len(block), 0] = nearest
        total_sq_dist += float(np.sum((block - vocabulary[nearest]) ** 2))
    return total_sq_dist, assigned


# Quantise the validation descriptors with the vocabulary learned on the
# TRAINING descriptors (`centers`, from the cv2.kmeans call in the training
# section). Running a second, independent k-means here would produce an
# unrelated set of clusters, so bin k of a validation histogram would describe
# a different visual word than bin k of a training histogram and a classifier
# fitted on xTrain could not be applied meaningfully to xValid.
compactness, labels = _assignToVocabulary(descriptor_list, centers)

In [None]:
# Build the histograms: for every validation descriptor, bump the bin of its
# cluster in the histogram of the image the descriptor came from.
for img_id, word in zip(desc_src_img, labels.ravel()):
    imgs_data[img_id][word] += 1

# Stack the per-image (dict_size, 1) histograms into an
# (n_images, dict_size) feature matrix. The row count is taken from
# `imgs_data` rather than a hard-coded 1500.
# NOTE: this rebinds `xValid`, which previously held the validation DataFrame.
xValid = np.asarray(imgs_data).reshape(len(imgs_data), dict_size)

**Simple Classifier**

In [None]:
# Baseline: a single SVC (gamma=0.001) fitted on the training histograms and
# scored on the validation histograms.
clf = SVC(gamma=.001).fit(xTrain, yTrainFinal)
y_pred = clf.predict(xValid)

validation_accuracy = accuracy_score(yValidFinal, y_pred)
print("Accuracy: "+str(validation_accuracy))
print('\n')
print(classification_report(yValidFinal, y_pred))

**Adaptive Boosting**

In [None]:
# AdaBoost ensemble of 150 SVC estimators, using the SAMME algorithm.
base_svc = SVC(gamma=.01, decision_function_shape='ovo')
bdt = AdaBoostClassifier(base_svc, algorithm="SAMME", n_estimators=150)

In [None]:
# Fit the boosted ensemble on the training histograms and encoded labels.
bdt.fit(X=xTrain, y=yTrainFinal)

In [None]:
# Score the boosted ensemble on the validation histograms.
y_pred = bdt.predict(xValid)

boosted_accuracy = accuracy_score(yValidFinal, y_pred)
print("Accuracy: "+str(boosted_accuracy))
print('\n')
print(classification_report(yValidFinal, y_pred))