In [1]:
import numpy as np 
import sklearn as sk 
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from numpy import genfromtxt
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA, KernelPCA
from skimage.feature import hog

def readData(filename, size):
    my_data = genfromtxt(filename, delimiter=',')
    dim = my_data.shape[1]
    X, Y = my_data[:, 1:901], my_data[:, 0]
    
    # Feature selection: remove features with low variance (Boolean variables --> Bernoulli variables)
    sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
    X = sel.fit_transform(X)
    
    # Feature extraction 
    # PCA
    #pca = PCA(n_components=X.shape[1])
    #pca.fit(X)

    # Kernel PCA
    kpca = KernelPCA(n_components=X.shape[1])
    kpca.fit(X)
    
    X_train, X_test, y_train, y_test = train_test_split(X, Y, train_size=size, random_state=0)
    return X_train, X_test, y_train, y_test, dim    

In [2]:
X_train, X_test, y_train, y_test, dim = readData('data/processed_nist_data.csv', 0.8)
for algorithm in ["ball_tree", "kd_tree", "brute"]:
    neigh = KNeighborsClassifier(n_neighbors=3, algorithm=algorithm)
    neigh.fit(X_train, y_train) 
    y_pred = neigh.predict(X_test)
    # Accuracy
    print(accuracy_score(y_test, y_pred, normalize=True, sample_weight=None)*100)

96.05
96.0
96.1


In [3]:
# Results
# raw pixels: ball tree 96.0%, kd tree 96.0%, brute 96.05%
# feature selection: ball tree 96.05%, kd tree 96.0%, brute 96.1%

In [5]:
X_train.shape


(8000, 545)