In [7]:
import mnist
import scipy.misc
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
import numpy as np
import datetime
np.set_printoptions(suppress=True)
from os import listdir
from os.path import isfile, join


In [8]:
#scipy.misc.toimage(scipy.misc.imresize(images[0,:,:] * -1 + 256, 10.))

train_images = mnist.train_images()
train_labels = mnist.train_labels()

test_images = mnist.test_images()
test_labels = mnist.test_labels()

In [9]:
train_images.shape

(60000, 28, 28)

In [10]:
train_images.reshape((train_images.shape[0], train_images.shape[1] * train_images.shape[2]))

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)

## Shift-and-scale normalization:

In [13]:
print ('Original Max: ' + str(train_images.max()))
print ('Original Min: ' + str(train_images.min()))

scaler  = MinMaxScaler(feature_range=(0, 1))
scaled_train_images = scaler.fit_transform(train_images.reshape\
                                          ((train_images.shape[0], train_images.shape[1] * train_images.shape[2])))

print ('Max: ' + str(scaled_train_images.max()))
print ('Min: ' + str(scaled_train_images.min()))
print ('Mean: %f' % (scaled_train_images.mean()))
print ('Variance: %f' % (scaled_train_images.var()))

Original Max: 255
Original Min: 0




Max: 1.0
Min: 0.0
Mean: 0.130663
Variance: 0.094932


## Zero mean, unit variance:

In [14]:
scaler              = StandardScaler()

scaled_train_images = scaler.fit_transform(train_images.reshape\
                                     ((train_images.shape[0], train_images.shape[1] * train_images.shape[2])))
scaled_test_images  = scaler.fit_transform(test_images.reshape\
                                     ((test_images.shape[0], test_images.shape[1] * test_images.shape[2])))

print ('Max: ' + str(scaled_test_images.max()))
print ('Min: ' + str(scaled_test_images.min()))
print ('Mean: %f' % (scaled_test_images.mean()))
print ('Variance: %f' % (scaled_test_images.var()))



Max: 99.994999875
Min: -1.31006468485
Mean: 0.000000
Variance: 0.852041


## Distances

In [19]:
from math import*
from decimal import Decimal
 
class Similarity():
  
    def euclidean_distance(self,x,y):
  
        return sqrt(sum(pow(a-b,2) for a, b in zip(x, y)))
 
    def manhattan_distance(self,x,y):
  
        return sum(abs(a-b) for a,b in zip(x,y))
 
    def minkowski_distance(self,x,y,p_value):
  
        return self.nth_root(sum(pow(abs(a-b),p_value) for a,b in zip(x, y)),
           p_value)
 
    def nth_root(self,value, n_root):
  
        root_value = 1/float(n_root)
        return round (Decimal(value) ** Decimal(root_value),3)
 
    def cosine_similarity(self,x,y):
  
        numerator = sum(a*b for a,b in zip(x,y))
        denominator = self.square_rooted(x)*self.square_rooted(y)
        return round(numerator/float(denominator),3)
 
    def square_rooted(self,x):
  
        return round(sqrt(sum([a*a for a in x])),3)
 
    def jaccard_similarity(self,x,y):
 
        intersection_cardinality = len(set.intersection(*[set(x), set(y)]))
        union_cardinality = len(set.union(*[set(x), set(y)]))
        return intersection_cardinality/float(union_cardinality)

In [64]:
measures = Similarity()
 
print (measures.euclidean_distance([0,3,4,5],[7,6,3,-1]))
print (measures.jaccard_similarity([0,1,2,5,6],[0,2,3,5,7,9]))

9.746794344808963
0.375


In [65]:
def create_pairwise_distance_matrix(twod_matrix):
    
    similarity_matrix = np.zeros((twod_matrix.shape[0],twod_matrix.shape[0]))
    measures          = Similarity()
    
    i_count           = 0  
    
    for i in twod_matrix:
        

        j_count = 0
        
        for j in twod_matrix:
            
            if i_count == j_count: break
            if (i_count/twod_matrix.shape[0])% .1 == 0: print ('I count' + str(i_count/twod_matrix.shape[1]))
            similarity_matrix[i_count,j_count] = measures.euclidean_distance(i,j)
            j_count += 1    
        
        i_count += 1
        
    return similarity_matrix, i_count, j_count
    

In [15]:
choice_maker = np.random.choice([True, False], len(scaled_train_images), p = [0.2, 0.8])

In [16]:
scaled_train_images_sample = scaled_train_images[choice_maker]
scaled_train_label_sample = train_labels[choice_maker]

print(scaled_train_images_sample.shape)
print(scaled_train_label_sample.shape)

(12040, 784)
(12040,)


In [28]:
pre = datetime.datetime.now()

similarity_matrix_euc = create_pairwise_distance_matrix(scaled_train_images_sample)

post = datetime.datetime.now()
print ('Time taken: ' + str(post-pre))

I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0
I count1.0

I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0

I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0
I count2.0

I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0
I count3.0

I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0

I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0
I count4.0

Time taken: 0:28:43.458446


In [31]:
#np.save('../data/similarity_matrix_euc', similarity_matrix_euc)

In [28]:
similarity_matrix_euc = np.load('../../Assignments/data/similarity_matrix_euc.npy')

In [23]:
import matplotlib.pyplot as plt

In [157]:
clf.centroids

{0: array([ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        , -0.00441808, -0.00575482, -0.00408252,
        -0.00408252,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        , -0.00408252, -0.00470969, -0.00879935,
        -0.01159056, -0.01475898,  0.01542927,  0.13269762,  0.19674262,
         0.12217846,  0.011966  , -0.02247227,  0.06882604,  0.07424075,
        -0.01417358, -0.02311919, -0.01916663, -0.0167723 , -0.01099636,
        -0.00832486, -0.00438069,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        , -0.00408252, -0.00539535,
        -0.00852241, -0.01198504,  0.23295809,  0.21990248,  0.20136728,
         0.14306985,  0.21874669,  0.28937609,  

In [78]:
scaled_test_images.shape

(10000, 784)

In [79]:
test_labels.shape

(10000,)

#### KNN