In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt

import multiprocessing
from multiprocessing.pool import ThreadPool

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn import datasets, svm, metrics
from sklearn import decomposition

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.ensemble import AdaBoostClassifier

from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors.nearest_centroid import NearestCentroid

from sklearn import preprocessing

from cv2 import ml

import cv2

import os
import io


In [None]:
## Set local data path
dataPath = 'all'
localPath = ''
print(os.listdir(dataPath))

# train_images.npy appears to hold an object array (each record carries its own
# pixel array), which NumPy can only restore by unpickling. NumPy >= 1.16.3
# refuses to do that unless allow_pickle=True is passed explicitly.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
images = np.load(os.path.join(dataPath, 'train_images.npy'), encoding="bytes", allow_pickle=True)
labels = pd.read_csv(os.path.join(dataPath, 'train_labels.csv'))
images.shape


In [None]:
# Display the raw array loaded from train_images.npy.
images


In [None]:
# Display the label table loaded from train_labels.csv.
labels

In [None]:
def preProcessImageObsolete(image, cutoff=127, maxContours=5):
    """Binarise an image and erase everything except its largest contours.

    Earlier variant of the pre-processing step; the rest of the notebook
    calls preProcessImage instead.

    Parameters
    ----------
    image : array-like
        Pixel data; cast to uint8 before processing.
    cutoff : int
        Binary threshold: pixels above it become 255, the rest 0.
    maxContours : int
        Number of largest-area contours to keep; smaller ones are erased.

    Returns
    -------
    numpy.ndarray
        The thresholded (0/255) image with the discarded contours blanked out.
    """
    im = np.uint8(image)
    _, thresh = cv2.threshold(im, cutoff, 255, cv2.THRESH_BINARY)
    # findContours returns (image, contours, hierarchy) in OpenCV 3.x but
    # (contours, hierarchy) in 2.x/4.x -- the contour list is always
    # second-to-last. A copy is passed because OpenCV < 3.2 modifies the input
    # image in place and `thresh` is still needed below.
    contours = cv2.findContours(thresh.copy(), cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)[-2]
    mask = np.uint8(np.ones(im.shape))
    contours_by_area = sorted(contours, key=cv2.contourArea)  # smallest first

    # Everything except the `maxContours` largest contours is painted with 0
    # (filled, thickness=-1) so bitwise_and removes it.
    num_to_erase = max(len(contours_by_area) - maxContours, 0)
    for contour in contours_by_area[:num_to_erase]:
        mask = cv2.drawContours(mask, [contour], -1, 0, -1)

    filteredImage = cv2.bitwise_and(thresh, thresh, mask=mask)
    return filteredImage
    

In [None]:
def preProcessImage(image, cutoff=127, areaCutoff=14, maxContours=4, fliplr=False):
    """Keep only the regions of an image around its largest contours.

    The image is thresholded, contours are extracted, and the bounding
    rectangles of the largest ones form a mask that is applied to the
    original (non-thresholded) pixels.

    Parameters
    ----------
    image : array-like
        Pixel data; cast to uint8 before processing.
        NOTE(review): presumably single-channel -- the notebook passes
        100x100 and 100x100x1 arrays.
    cutoff : int
        Binary threshold used only for contour detection.
    areaCutoff : number
        Contours whose area is not strictly greater than this are ignored.
    maxContours : int
        At most this many contours (largest area first) are considered.
    fliplr : bool
        When True the result is mirrored left-to-right.

    Returns
    -------
    numpy.ndarray
        uint8 array with the same shape as the input; pixels outside the
        selected bounding rectangles are zero.
    """
    image = np.uint8(image)
    _, thresh = cv2.threshold(image, cutoff, 255, cv2.THRESH_BINARY)
    # findContours returns (image, contours, hierarchy) in OpenCV 3.x but
    # (contours, hierarchy) in 2.x/4.x -- the contour list is always
    # second-to-last, so this indexing works on every version.
    contours = cv2.findContours(thresh, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)[-2]
    mask = np.zeros(image.shape, np.uint8)
    largest_contours = sorted(contours, key=cv2.contourArea, reverse=True)

    for contour in largest_contours[:maxContours]:
        if cv2.contourArea(contour) > areaCutoff:
            # Keep the whole bounding box, not just the contour interior.
            x, y, w, h = cv2.boundingRect(contour)
            mask[y:y+h, x:x+w] = 255

    filteredImage = cv2.bitwise_and(image, image, mask=mask)
    if fliplr:
        filteredImage = np.fliplr(filteredImage)
    # bitwise_and may drop a trailing singleton channel axis; restore the input shape.
    return filteredImage.reshape(image.shape)

In [None]:
# Number of training examples per category, rarest category first.
(
    labels
    .groupby(['Category'])
    .size()
    .sort_values()
)

In [None]:
# Visual sanity check on one training example: the pre-processed, mirrored
# image is drawn first, then the untouched original in a second figure.
num = 90
image = images[num][1].reshape(100, 100)  # element 1 of a record holds the flattened 100x100 pixels
betterImage = preProcessImage(np.fliplr(image), maxContours=1)  # only the single largest contour is considered
plt.imshow(betterImage)
plt.figure()  # new figure so the second imshow does not draw over the first
plt.imshow(image)

In [None]:
# One row per image (pixel arrays taken from column 1 of `images`) plus a
# 'label' column holding the image's category.
allData = pd.DataFrame(np.array(list(images[:,1]))).assign(label=labels['Category'])
num_classes = len(labels['Category'].unique())
# One-hot encoder over the category names.
lb = LabelBinarizer()
lb.fit(labels['Category'].unique())
valueCounts = labels['Category'].value_counts()

# Stratified split holding out 5% of the rows for validation.
# NOTE(review): xTrain/xValid/trainInds/validInds/xTrainRaw/xValidRaw and
# yTrain/yValid are all reassigned by later cells (15% split, string labels),
# so this split only survives through yTrainString/yValidString.
xTrain, xValid = train_test_split(allData, stratify=labels['Category'], test_size=0.05)
trainInds = xTrain.index
validInds = xValid.index
# Drop the label column and restore each row to a 100x100x1 image.
xTrainRaw = xTrain.drop('label', axis=1).values.reshape((xTrain.shape[0], 100, 100, 1))
xValidRaw = xValid.drop('label', axis=1).values.reshape((xValid.shape[0], 100, 100, 1))

# Index values are used positionally here; that lines up because allData was
# built with the default 0..N-1 index.
# NOTE(review): column 1 of `labels` is presumably 'Category' -- confirm.
yTrainString = labels.iloc[trainInds, 1].values
yValidString = labels.iloc[validInds, 1].values

# One-hot encoded targets.
yTrain = lb.transform(yTrainString)
yValid = lb.transform(yValidString)

In [None]:
# NOTE(review): neither constant is referenced by any other visible cell.
epochs = 500
batch_size = 128

In [None]:
# Number of distinct categories.
# NOTE(review): both values below are also computed, identically, in the earlier split cell.
num_classes = len(labels['Category'].unique())

# Examples per category, most frequent first.
valueCounts = labels['Category'].value_counts()

In [None]:
# Stratified 85/15 train/validation split. A fixed random_state keeps the
# split -- and every score computed from it -- reproducible across kernel
# restarts (42 matches the seed of the cross-validation splitter used later).
xTrain, xValid = train_test_split(allData, stratify=labels['Category'], test_size=0.15, random_state=42)
trainInds = xTrain.index
validInds = xValid.index
# Drop the label column and restore each row to a 100x100x1 image.
xTrainRaw = xTrain.drop('label', axis=1).values.reshape((xTrain.shape[0], 100, 100, 1))
xValidRaw = xValid.drop('label', axis=1).values.reshape((xValid.shape[0], 100, 100, 1))

In [None]:
# Category labels for the rows of the current split.
# trainInds/validInds hold DataFrame index values and are used positionally
# here, which lines up because allData was built with the default 0..N-1 index.
# NOTE(review): column 1 of `labels` is presumably 'Category' -- confirm.
yTrain = labels.iloc[trainInds, 1].values
yValid = labels.iloc[validInds, 1].values

In [None]:
def _preprocessAll(pool, rawImages, fliplr=False):
    """Run preProcessImage on every image in rawImages; returns one flattened row per image."""
    return np.array(pool.map(lambda im: preProcessImage(im, fliplr=fliplr).flatten(), list(rawImages)))


pool = ThreadPool(multiprocessing.cpu_count())
try:
    xTrainUnflipped = _preprocessAll(pool, xTrainRaw)
    xTrainFlipped = _preprocessAll(pool, xTrainRaw, fliplr=True)

    xValidUnflipped = _preprocessAll(pool, xValidRaw)
    xValidFlipped = _preprocessAll(pool, xValidRaw, fliplr=True)
finally:
    # Release the worker threads even if preprocessing raised.
    pool.close()
    pool.join()

# Augmentation: every image appears twice, as-is and mirrored left-to-right.
xTrain = np.concatenate((xTrainUnflipped, xTrainFlipped), axis=0)
xValid = np.concatenate((xValidUnflipped, xValidFlipped), axis=0)

# Labels are rebuilt from the split indices instead of doubling yTrain/yValid
# in place, so re-running this cell cannot make them grow out of step with
# xTrain/xValid.
yTrainOnce = labels.iloc[trainInds, 1].values
yValidOnce = labels.iloc[validInds, 1].values
yTrain = np.concatenate((yTrainOnce, yTrainOnce), axis=0)
yValid = np.concatenate((yValidOnce, yValidOnce), axis=0)

In [None]:
"""#ONE EX TRAIN
num = 1011
plt.imshow(xTrain[num])
plt.figure()
plt.imshow(xTrainRaw[num])
yTrainString[num]
"""

In [None]:
"""
def fd_hu_moments(image):
    feature = cv2.HuMoments(cv2.moments(image)).flatten()
    return feature
clf = SVC(gamma=0.001, verbose=True)
print("Fitting")

clf.fit(xTrain, yTrain)
yPred = clf.predict(xValid)
"""

In [None]:
# Display the label table again before encoding it.
labels

In [None]:
# Map category names to integer class ids; the classifiers below are trained
# and evaluated on these integer targets.
le = preprocessing.LabelEncoder()
le.fit(labels['Category'].unique())
yTrainInt = le.transform(yTrain)
yValidInt = le.transform(yValid)
print(yTrainInt)

In [None]:
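# OVERNIGHT: the hyper-parameter search in the next cell is long-running (presumably meant to be left running overnight).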

In [None]:
bestModels = {}  # PCA dimensionality -> best SVC hyper-parameters found by the search
bestAccu = {}    # PCA dimensionality -> accuracy on the held-out validation set

# Candidate values shared by `C` and `gamma` (kept exactly as originally listed).
paramValues = [1e-5, 5*1e-4, 1e-4, 5*1e-3, 1e-3, 5*1e-2, 0.01 ,0.05 , 0.1 , 0.3, 0.5, 0.7, 0.9, 1.1, 1.3, 1.5,
               2.0, 2.5, 3.0 , 3.5, 4.0, 6.0, 7.5, 9.0, 11.0, 14.0, 17.0, 25.0, 30.0, 40.0, 50.0, 75.0,
               100.0, 150.0, 200.0, 250.0, 330.0, 400.0, 500.0, 600.0, 700.0, 800.0, 1000.0]
# Full grid (RBF and linear kernels); only referenced by the commented-out GridSearchCV below.
tuned_parameters = [{'kernel': ['rbf'], 'gamma': list(paramValues), 'C': list(paramValues)},
                    {'kernel': ['linear'], 'C': list(paramValues)}]
# Search space actually sampled by RandomizedSearchCV (RBF kernel only).
tuned_params = {'kernel': ['rbf'], 'gamma': list(paramValues), 'C': list(paramValues)}
my_cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)

for numOfDim in [30, 50, 100, 250, 500, 1000]:
  print(numOfDim)
  # Reduce the flattened images to numOfDim principal components.
  pca = decomposition.PCA(n_components=numOfDim)
  pca.fit(xTrain)
  xPCA = pca.transform(xTrain)
  clf = RandomizedSearchCV(SVC(), param_distributions=tuned_params, n_iter=100, cv=my_cv, verbose=2)
  #clf = GridSearchCV(SVC(), param_grid=tuned_parameters, cv=my_cv, verbose=2)
  clf.fit(xPCA, yTrainInt)
  bp = clf.best_params_
  bestModels[numOfDim] = bp

  print(clf.best_score_)

  xValidPCA = pca.transform(xValid)
  y_true, y_pred = yValidInt, clf.predict(xValidPCA)

  # y_pred holds hard class labels. log_loss expects class probabilities and
  # rejects a 1-D prediction vector when there are more than two classes, so
  # the validation accuracy is recorded instead (matching the dict's name).
  bestAccu[numOfDim] = metrics.accuracy_score(y_true, y_pred)

  # How often each class was predicted -- exposes a collapsed classifier.
  unique, counts = np.unique(y_pred, return_counts=True)
  print(dict(zip(unique, counts)))

  print("Classification report for classifier %s:\n%s\n"
      % ("svm", metrics.classification_report(y_true, y_pred)))

In [None]:
numOfDim=30
pca = decomposition.PCA(n_components=numOfDim)
pca.fit(xTrain)
xPCA = pca.transform(xTrain)
# bestModels stores the winning hyper-parameter dict, not an estimator, so the
# SVC has to be rebuilt from it before it can serve as the boosted base model.
# probability=True gives SVC a predict_proba method, which the SAMME.R boosting
# variant (the default in older scikit-learn releases) requires.
bdt = AdaBoostClassifier(SVC(probability=True, **bestModels[numOfDim]), n_estimators = 20)

In [None]:
# Validation score recorded by the search loop for the current numOfDim.
print(bestAccu[numOfDim])

In [None]:
# Train on the PCA-reduced features: validation data is projected with the
# same `pca` before predict, so fitting on the raw pixels would give the model
# a different feature count than the one it is later asked to predict on.
bdt.fit(xPCA, yTrainInt)

In [None]:
xValidPCA = pca.transform(xValid)
y_true = yValidInt
y_pred = bdt.predict(xValidPCA)
# log_loss scores class probabilities, not hard labels; passing the classes
# explicitly keeps the columns aligned even if a class is missing from y_true.
y_proba = bdt.predict_proba(xValidPCA)
print(log_loss(y_true, y_proba, labels=bdt.classes_))

# How often each class was predicted.
unique, counts = np.unique(y_pred, return_counts=True)
print(dict(zip(unique, counts)))

print("Classification report for classifier %s:\n%s\n"
    % ("svm", metrics.classification_report(y_true, y_pred)))

In [None]:

# Re-fit PCA keeping 1000 components and project the training set onto them.
pca = decomposition.PCA(n_components=1000).fit(xTrain)
xPCA = pca.transform(xTrain)

In [None]:
# Inspect the PCA-projected training features.
print(xPCA)

In [None]:
class StatModel(object):
    '''Base class for model wrappers; subclasses are expected to set self.model.'''
    def load(self, fn):
        '''Delegate loading from file name fn to the wrapped model.'''
        # NOTE(review): cv2.ml models are usually restored through a static
        # loader (e.g. cv2.ml.SVM_load) -- confirm the wrapped model exposes
        # an instance-level load().
        self.model.load(fn)
    def save(self, fn):
        '''Delegate saving to file name fn to the wrapped model.'''
        self.model.save(fn)
        
class SVM(StatModel):
    '''Wrapper for the OpenCV (cv2.ml) Support Vector Machine classifier.

    The model is configured as a C-SVC with an RBF kernel. C and gamma
    default to the values that were previously hard-coded in train().
    '''
    def __init__(self, C=80, gamma=53.83):
        self.model = cv2.ml.SVM_create()
        # cv2.ml models are configured through setters; the OpenCV 2.4 style
        # `train(..., params=dict(...))` call is not accepted by this API.
        self.model.setKernel(cv2.ml.SVM_RBF)
        self.model.setType(cv2.ml.SVM_C_SVC)
        self.model.setC(C)
        self.model.setGamma(gamma)

    def train(self, samples, responses):
        '''Fit the SVM on row-wise samples with one class label per row.'''
        # cv2.ml expects float32 features and, for classification, integer
        # (categorical) responses.
        self.model.train(np.float32(samples), cv2.ml.ROW_SAMPLE, np.int32(responses))

    def predict(self, samples):
        '''Return the predicted class of every row in samples as a 1-D float32 array.'''
        # predict() scores the whole matrix at once and returns (retval, results);
        # results has one row per sample.
        _, results = self.model.predict(np.float32(samples))
        return np.float32(results).ravel()

In [None]:
# Create a classifier: an OpenCV support vector machine with its default settings.
#classifier = svm.SVC(gamma=0.001)
# NOTE(review): this rebinds `svm`, which the import cell bound to the
# sklearn.svm module; the module is unreachable under that name from here on.
svm = cv2.ml.SVM_create()
# Fit to the training data: float32 features, one sample per row, labels as a column vector.
svm.train(np.float32(xPCA),cv2.ml.ROW_SAMPLE,yTrainInt[:,np.newaxis])

In [None]:
# Project the validation set with the same PCA used for training.
xValidPCA = pca.transform(xValid)

# The following cells read element [1] of the returned value as the
# per-sample predictions.
predVal = svm.predict(np.float32(xValidPCA))

In [None]:
# How often each class was predicted by the OpenCV SVM.
unique, counts = np.unique(predVal[1], return_counts=True)
dict(zip(unique, counts))

In [None]:
# Per-class precision/recall/F1 of the OpenCV SVM on the validation set.
print("Classification report for classifier %s:\n%s\n"
      % ("svm", metrics.classification_report(yValidInt, predVal[1])))

In [None]:
# Raw prediction array returned by the OpenCV SVM.
predVal[1]

In [None]:
# (number of training rows, number of features per row)
xTrain.shape

In [None]:
# Column vector with the values 0..9, each repeated 250 times, as float32.
# NOTE(review): `responses` is not used by any other visible cell.
responses = np.float32(np.repeat(np.arange(10),250)[:,np.newaxis])
# Training labels reshaped to a column vector (the layout passed to svm.train above).
yTrainInt[:,np.newaxis]

In [None]:
# True integer class ids of the validation set.
print(yValidInt)

In [None]:
# OpenCV SVM predictions as a plain list, for eyeballing against the true ids.
print(list(predVal[1]))

In [None]:
# Nearest-centroid baseline on an 800-component PCA projection.
pcaNN = decomposition.PCA(n_components=800)
pcaNN.fit(xTrain)
xPCAnn = pcaNN.transform(xTrain)
xValidPCAnn = pcaNN.transform(xValid)
#myNN = NearestNeighbors(n_neighbors=5, algorithm='ball_tree').fit(xPCAnn)
myNN = NearestCentroid()
# scikit-learn expects a 1-D target vector; a column vector is only accepted
# after being flattened with a DataConversionWarning.
myNN.fit(xPCAnn, yTrainInt)
predValNN = myNN.predict(np.float32(xValidPCAnn))

In [None]:
# Per-class precision/recall/F1 of the nearest-centroid model on the validation
# set. The label names the model actually being reported (it used to say "svm").
print("Classification report for classifier %s:\n%s\n"
      % ("nearest centroid", metrics.classification_report(yValidInt, predValNN)))

In [None]:
# How often each class was predicted by the nearest-centroid model.
unique, counts = np.unique(predValNN, return_counts=True)
dict(zip(unique, counts))