In [27]:
import mnist
import scipy.misc
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
import numpy as np
import datetime
np.set_printoptions(suppress=True)
from os import listdir
from os.path import isfile, join
from math import*
from decimal import Decimal
from collections import defaultdict
from sklearn.datasets import fetch_20newsgroups
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
#scipy.misc.toimage(scipy.misc.imresize(images[0,:,:] * -1 + 256, 10.))

train_images = mnist.train_images()
train_labels = mnist.train_labels()

test_images = mnist.test_images()
test_labels = mnist.test_labels()

## Shift-and-scale normalization:

In [3]:
scaler              = StandardScaler()

scaled_train_images = scaler.fit_transform(train_images.reshape\
                                     ((train_images.shape[0], train_images.shape[1] * train_images.shape[2])))
scaled_test_images  = scaler.fit_transform(test_images.reshape\
                                     ((test_images.shape[0], test_images.shape[1] * test_images.shape[2])))

print ('Max: ' + str(scaled_test_images.max()))
print ('Min: ' + str(scaled_test_images.min()))
print ('Mean: %f' % (scaled_test_images.mean()))
print ('Variance: %f' % (scaled_test_images.var()))



Max: 99.994999875
Min: -1.31006468485
Mean: 0.000000
Variance: 0.852041


## Zero mean, unit variance:

In [4]:
print ('Original Max: ' + str(train_images.max()))
print ('Original Min: ' + str(train_images.min()))

scaler  = MinMaxScaler(feature_range=(0, 1))
scaled_train_images = scaler.fit_transform(train_images.reshape\
                                          ((train_images.shape[0], train_images.shape[1] * train_images.shape[2])))

print ('Max: ' + str(scaled_train_images.max()))
print ('Min: ' + str(scaled_train_images.min()))
print ('Mean: %f' % (scaled_train_images.mean()))
print ('Variance: %f' % (scaled_train_images.var()))

Original Max: 255
Original Min: 0




Max: 1.0
Min: 0.0
Mean: 0.130663
Variance: 0.094932


##  20 NG KNN

In [20]:
news_train = fetch_20newsgroups(data_home='../data/20newsgroups/', 
                                subset='train', 
                                remove=('headers', 'footers', 'quotes'))
news_test = fetch_20newsgroups(data_home='../data/20newsgroups/', 
                                subset='test', 
                                remove=('headers', 'footers', 'quotes'))

In [21]:
vectorizer                = CountVectorizer(binary=True, stop_words='english')
counts_train              = vectorizer.fit_transform(news_train.data)
counts_test               = vectorizer.transform(news_test.data)
cos_similarity_train      = cosine_similarity(counts_train)
cos_similarity_train_test = cosine_similarity(X=counts_test, Y=counts_train)

In [23]:
news_train_len = len(news_train.data)
news_test_len  = len(news_test.data)
k              = 5

def knn(index, k, similarity_matrix):
    return news_train.target[max(similarity_matrix[index].argsort()[-(k + 1):-1])]

train_predict = []

for doc_index in range(news_train_len):
    y = news_train.target[doc_index]
    yhat = knn(doc_index, k, cos_similarity_train)
    train_predict.append(y == yhat)
    
print ('The training accuracy is: ' + str(np.mean(train_predict)))
    
test_predict = []

for doc_index in range(news_test_len):
    y = news_test.target[doc_index]
    yhat = knn(doc_index, k, cos_similarity_train_test)
    test_predict.append(y == yhat)
    
print ('The testing accuracy is: ' + str(np.mean(test_predict)))

The training accuracy is: 0.423192504861
The testing accuracy is: 0.312267657993


In [24]:
choice_maker               = np.random.choice([True, False], len(scaled_train_images), p = [0.4, 0.6])
scaled_train_images_sample = scaled_train_images[choice_maker]
train_label_sample         = train_labels[choice_maker]

print(scaled_train_images_sample.shape)
print(train_label_sample.shape)

print(scaled_test_images.shape)
print(test_labels.shape)

(24022, 784)
(24022,)
(10000, 784)
(10000,)


In [28]:
euc_similarity_matrix_train  = euclidean_distances(X=scaled_train_images_sample, Y=scaled_train_images_sample)
cos_similarity_matrix_train  = cosine_similarity(X=scaled_train_images_sample, Y=scaled_train_images_sample)

In [30]:
euc_similarity_matrix_test  = euclidean_distances(X=scaled_test_images, Y=scaled_train_images_sample)
cos_similarity_matrix_test  = cosine_similarity(X=scaled_test_images, Y=scaled_train_images_sample)

In [20]:
#np.save('../data/similarity_matrix_coz', cos_similarity_matrix)
#cos_similarity_matrix = np.load('../data/similarity_matrix_coz.npy')



In [8]:
scaled_train_images_sample_array = np.array(scaled_train_images_sample)
scaled_test_images_array = np.array(scaled_test_images)
train_label_sample_array = np.array(train_label_sample)
test_labels_array = np.array(test_labels)

In [9]:
def mnist_knn(index, k, similarity_matrix):
    return np.array(similarity_matrix[index,:]).argsort()[-(k+1):-1]

## Cosine

In [62]:
predict = []
for index in range(len(scaled_train_images_sample)):
    
    #print (index)    
    y_select = train_label_sample_array[index]
    yhat = max(train_label_sample_array[mnist_knn(index, 5, cos_similarity_matrix_train)])
    predict.append(y_select == yhat)

In [65]:
np.mean(predict)

0.92794860027535564

In [16]:
test_predict = []
for index in range(len(scaled_test_images_array)):
    #print (mnist_knn(index, 5, cos_similarity_matrix_test))
    y_select = test_labels[index]
    yhat = max(train_label_sample_array[mnist_knn(index, 5, cos_similarity_matrix_test)])
    test_predict.append(y_select == yhat)

In [17]:
np.mean(test_predict)

0.91279999999999994

## Euclidean

In [35]:
test_predict = []
for index in range(len(scaled_test_images_array)):
    #print (mnist_knn(index, 5, cos_similarity_matrix_test))
    y_select = test_labels[index]
    yhat = min(train_label_sample_array[mnist_knn(index, 5, cos_similarity_matrix_test)])
    test_predict.append(y_select == yhat)

In [36]:
np.mean(test_predict)

0.84730000000000005