# Q-Means and the MNIST dataset

In this notebook we study the performance of the q-means algorithm on the MNIST dataset. Specifically, since q-means reduces to the classical algorithm $\delta$-k-means, we only test the performance of $\delta$-k-means.

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()  # for plot styling
import numpy as np
import itertools
import random
import sklearn
from sklearn.metrics import accuracy_score
from sklearn.metrics import pairwise_distances_argmin
from sklearn.metrics import pairwise_distances
import sklearn as sk
from mnist import MNIST
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from scipy.stats import mode
import random

from sklearn import datasets

# Small scikit-learn digits set. It is not used by the MNIST pipeline in this
# cell; the name is kept defined because other cells may still refer to it.
digits = datasets.load_digits()

# Directory holding the python-mnist data files. This is a machine-specific
# absolute path: change it here (and only here) to match your local checkout.
MNIST_DATA_DIR = '/home/scinawa/workspace/hackedkit/python-mnist/data'

# Build the loader once (it used to be constructed twice on identical lines)
# and read the training and test splits.
mndata = MNIST(MNIST_DATA_DIR)
X_origin, y = mndata.load_training()
X_test_origin, y_test = mndata.load_testing()

Let's load the data in memory and perform LDA.

In [2]:
# Turn the containers returned by the MNIST loader into NumPy arrays
# (images first, then labels).
X_origin, X_test_origin = np.array(X_origin), np.array(X_test_origin)
y, y_test = np.array(y), np.array(y_test)

# Dimensionality reduction with LDA: fitted on the training images together
# with their labels, then applied unchanged to the test images.
# Alternatives that were left disabled in this cell: the scikit-learn digits
# data, sk.preprocessing.normalize(X) and PCA(n_components=35).
dimred = LinearDiscriminantAnalysis()
X = dimred.fit_transform(X_origin, y)
X_test = dimred.transform(X_test_origin)




### Testing that k-means indeed gives good values for the predictions

In [None]:
def find_labels(predicted, correct):
    """Relabel every cluster with the most frequent true label of its members.

    Parameters
    ----------
    predicted : array-like of shape (n_samples,)
        Cluster id assigned to each sample (arbitrary values are accepted).
    correct : array-like of shape (n_samples,)
        Ground-truth label of each sample.

    Returns
    -------
    np.ndarray
        Array with the shape and dtype of ``correct`` where each sample carries
        the majority true label of the cluster it was assigned to. Ties are
        resolved in favour of the smallest label.
    """
    predicted = np.asarray(predicted)
    correct = np.asarray(correct)
    labels_regular = np.zeros_like(correct)
    # Iterate over the cluster ids that actually occur in `predicted`: looping
    # over the true labels instead silently skips clusters whose id is not a
    # label value and produces empty masks for labels that are not cluster ids.
    for cluster_id in np.unique(predicted):
        mask = (predicted == cluster_id)
        # np.unique returns sorted values, so argmax picks the smallest mode.
        values, counts = np.unique(correct[mask], return_counts=True)
        labels_regular[mask] = values[np.argmax(counts)]
    return labels_regular

# Fit scikit-learn's k-means on the reduced training set and assign every
# test point to one of the 10 fitted clusters.
kmeans = KMeans(n_clusters=10, random_state=0, max_iter=10).fit(X)
predicted_ = kmeans.predict(X_test)
import pdb

# Cluster ids are arbitrary, so map each cluster to its majority digit before
# scoring.
labels = find_labels(predicted_, y_test)

# Score the relabelled predictions (`labels`); the name `predicted` used here
# before was never defined. `:.2%` renders the [0, 1] accuracy as a percentage.
print("The accuracy score of k-means after LDA is {:.2%}".format(accuracy_score(y_test, labels)))


### Let's measure the value of Z on this dataset

In [None]:
# Z is the largest distance between any training point and any of the
# centroids fitted by scikit-learn's k-means above.
# The matrix is deliberately not called `pd`, which would shadow the
# conventional pandas alias.
distances_to_centers = pairwise_distances(X, kmeans.cluster_centers_)
Z = distances_to_centers.max()
print(Z)

### Helper function

In [None]:
def label_with_delta(X,centers,delta): #give X (points) and centers: 2 numpy arrays
    """Assign each point to a center chosen at random among the delta-close ones.

    For every point the distance to its nearest center is computed; every
    center whose distance exceeds that minimum by less than ``delta`` is a
    candidate, and one candidate is drawn uniformly with ``random.choice``.

    Parameters
    ----------
    X : array of points, one per row.
    centers : array of centers, one per row.
    delta : float
        Width of the tolerance band above the minimum distance. With
        ``delta <= 0`` no center satisfies the strict inequality, so the
        nearest center is used (plain k-means labelling) instead of failing.

    Returns
    -------
    (np.ndarray, int)
        The chosen center index for every point, and the number of points
        that had more than one candidate center.
    """
    labels = []
    count = 0  # points for which the choice was among several candidates
    for dist_array in pairwise_distances(X, centers):
        nearest = int(np.argmin(dist_array))
        # Distances are never below the minimum, so no abs() is needed.
        candidates = np.flatnonzero(dist_array - dist_array[nearest] < delta).tolist()
        if not candidates:
            # Strict "<" leaves nothing when delta <= 0: fall back to argmin.
            candidates = [nearest]
        labels.append(random.choice(candidates))
        if len(candidates) > 1:
            count += 1
    return np.array(labels), count

def label_regular(X,centers):
    """Return, for each row of X, the index of its closest row in centers."""
    nearest_center = pairwise_distances_argmin(X, centers)
    return nearest_center

def lossfunction(X,labels,centers):
    """Sum of point-to-assigned-center Euclidean distances, divided by sqrt(N).

    Note that this is not the usual k-means inertia: distances are not squared
    and the normalisation is ``1/sqrt(N)`` with ``N = len(X)``.

    Parameters
    ----------
    X : array of N points, one per row.
    labels : integer array of length N; ``labels[i]`` indexes the center of
        point ``i``.
    centers : array of centers, one per row.

    Returns
    -------
    float
        The loss value; ``0.0`` when ``X`` is empty (avoids dividing by zero).
    """
    X = np.asarray(X)
    N = len(X)
    if N == 0:
        return 0.0
    # Fancy indexing pairs each point with its assigned center in one shot,
    # replacing a Python-level loop over all points.
    assigned = np.asarray(centers)[np.asarray(labels)]
    distances = np.linalg.norm((X - assigned).reshape(N, -1), axis=1)
    return distances.sum() / np.sqrt(N)

### These are the two implementations of k-means

In [None]:
def find_clusters_regular(X, n_clusters, threshold, iterations=None, seed=None):
    """Plain k-means (Lloyd iterations) with randomly picked initial centers.

    Parameters
    ----------
    X : np.ndarray, one point per row.
    n_clusters : int
        Number of centers; the initial ones are distinct rows of ``X``.
    threshold : float
        Only used when ``iterations`` is None: stop once the loss changes by
        less than this between two consecutive steps.
    iterations : int or None
        When given, run exactly this many assignment steps (at least one) and
        ignore ``threshold``.
    seed : int or None
        Seed for the initial permutation. When None, the notebook-level global
        ``rseed`` is used, which is how the experiment cell shares one
        initialisation between the regular and the delta variant.

    Returns
    -------
    (centers, labels)
        ``labels`` is the last assignment and ``centers`` are the centers that
        assignment was computed against.

    Raises
    ------
    ValueError
        If ``n_clusters`` exceeds the number of points.
    """
    if n_clusters > X.shape[0]:
        raise ValueError("n_clusters cannot exceed the number of points")
    if seed is None:
        seed = rseed  # notebook-level global set by the experiment cell

    # 1. Randomly choose clusters
    rng = np.random.RandomState(seed)
    centers = X[rng.permutation(X.shape[0])[:n_clusters]]
    loss = []
    step = 0

    while True:
        step += 1

        # 2a. Assign labels based on closest center
        labels = pairwise_distances_argmin(X, centers)

        # 2b. Find new centers from means of points. A cluster that received
        # no point keeps its previous center: the mean of an empty selection
        # is NaN and would poison every later distance computation.
        new_centers = np.array([
            X[labels == k].mean(0) if np.any(labels == k) else centers[k]
            for k in range(n_clusters)
        ])
        loss.append(lossfunction(X, labels, new_centers))

        # 2c. Check for convergence. The iteration budget is tested on every
        # step (including the first); otherwise iterations <= 1 never stops.
        if iterations is None:
            if step > 1 and abs(loss[-1] - loss[-2]) < threshold:
                break
        elif step >= iterations:
            break

        centers = new_centers
    return centers, labels


def find_clusters_delta(X, n_clusters, delta, threshold, iterations=None, seed=None):
    """Delta-k-means: k-means whose assignment step uses ``label_with_delta``.

    Parameters
    ----------
    X : np.ndarray, one point per row.
    n_clusters : int
        Number of centers; the initial ones are distinct rows of ``X``.
    delta : float
        Tolerance forwarded to ``label_with_delta`` at every step.
    threshold : float
        Only used when ``iterations`` is None: stop once the loss changes by
        less than this between two consecutive steps.
    iterations : int or None
        When given, run exactly this many assignment steps (at least one) and
        ignore ``threshold``.
    seed : int or None
        Seed for the initial permutation. When None, the notebook-level global
        ``rseed`` is used, which is how the experiment cell shares one
        initialisation between the regular and the delta variant.

    Returns
    -------
    (centers, labels, random_choices)
        ``labels`` is the last assignment, ``centers`` are the centers that
        assignment was computed against, and ``random_choices`` holds, per
        step, the count reported by ``label_with_delta``.

    Raises
    ------
    ValueError
        If ``n_clusters`` exceeds the number of points.
    """
    if n_clusters > X.shape[0]:
        raise ValueError("n_clusters cannot exceed the number of points")
    if seed is None:
        seed = rseed  # notebook-level global set by the experiment cell

    # 1. Randomly choose clusters
    rng = np.random.RandomState(seed)
    centers = X[rng.permutation(X.shape[0])[:n_clusters]]
    random_choices = []
    loss = []
    step = 0

    while True:
        step += 1

        # 2a. Assign labels, picking at random among delta-close centers
        labels, count = label_with_delta(X, centers, delta)
        random_choices.append(count)

        # 2b. Find new centers from means of points. A cluster that received
        # no point keeps its previous center: the mean of an empty selection
        # is NaN and would poison every later distance computation.
        new_centers = np.array([
            X[labels == k].mean(0) if np.any(labels == k) else centers[k]
            for k in range(n_clusters)
        ])
        loss.append(lossfunction(X, labels, new_centers))

        # 2c. Check for convergence. The iteration budget is tested on every
        # step (including the first); otherwise iterations <= 1 never stops.
        if iterations is None:
            if step > 1 and abs(loss[-1] - loss[-2]) < threshold:
                break
        elif step >= iterations:
            break

        centers = new_centers

    return centers, labels, random_choices

## The experiment: measuring accuracy for k-means and $\delta$-k-means
For several numbers of iterations, we measure the accuracy of our classifier, removing the outliers.

In [None]:
# --- Experiment configuration ------------------------------------------------
# NOTE(review): truncating linspace(3, 30, 27) to int appears to give 3..28 and
# 30 (29 is skipped); confirm whether np.arange(3, 31) was intended.
iterations_set = np.linspace(3, 30, 27, dtype=int)
results_kmeans = []
results_dmeans = []
results_dmeans_2 = []
results_kmeans_original = []
results_dmeans_original = []


threshold = 0.000001  # not consulted below: every run gets a fixed iteration budget
delta = 0.3
delta_2 = 0.4
delta_original = 0.1

n_clusters = 10
n_samples = 4  # independent random initialisations averaged per iteration budget


def clustering_accuracy(cluster_labels, true_labels):
    """Accuracy after relabelling each cluster via ``find_labels``."""
    return accuracy_score(true_labels, find_labels(cluster_labels, true_labels))


for iterations in iterations_set:
    print("Iteration %d "%iterations)

    accuracy_set_kmeans = []
    accuracy_set_dmeans = []
    accuracy_set_dmeans_2 = []
    accuracy_set_kmeans_original = []
    accuracy_set_dmeans_original = []

    for sample in range(n_samples):
        # The clustering functions read this global, so all five runs of one
        # sample start from the same random choice of initial rows.
        rseed = random.randint(10, 50)

        # KMEANS (LDA features)
        centroids_regular, labels_regular_ = find_clusters_regular(
            X, n_clusters, threshold, iterations=iterations)
        acc_kmeans = clustering_accuracy(labels_regular_, y)
        accuracy_set_kmeans.append(acc_kmeans)

        # DMEANS (LDA features, delta)
        centroids_delta, labels_delta_, _ = find_clusters_delta(
            X, n_clusters, delta, threshold, iterations=iterations)
        acc_dmeans = clustering_accuracy(labels_delta_, y)
        accuracy_set_dmeans.append(acc_dmeans)

        # DMEANS 2 (LDA features, delta_2)
        centroids_delta_2, labels_delta_2_, _ = find_clusters_delta(
            X, n_clusters, delta_2, threshold, iterations=iterations)
        acc_dmeans_2 = clustering_accuracy(labels_delta_2_, y)
        accuracy_set_dmeans_2.append(acc_dmeans_2)

        # KMEANS (No DR): raw pixels; the regular variant takes no delta and
        # returns two values.
        centroids_regular_original, labels_regular_original_ = find_clusters_regular(
            X_origin, n_clusters, threshold, iterations=iterations)
        acc_kmeans_original = clustering_accuracy(labels_regular_original_, y)
        accuracy_set_kmeans_original.append(acc_kmeans_original)

        # DMEANS (No DR): raw pixels with delta_original
        centroids_delta_original, labels_delta_original_, _ = find_clusters_delta(
            X_origin, n_clusters, delta_original, threshold, iterations=iterations)
        acc_dmeans_original = clustering_accuracy(labels_delta_original_, y)
        accuracy_set_dmeans_original.append(acc_dmeans_original)

        print("Execution {} | (LDA) k-means = {:.2%} - q-means = {:.2%} q-means_2 = {:.2%} - (no-dr) k-means {:.2%}  - (no-dr) d-means {:.2%} ".format(
            sample, acc_kmeans, acc_dmeans, acc_dmeans_2,
            acc_kmeans_original, acc_dmeans_original))

    results_kmeans.append(np.average(accuracy_set_kmeans))
    results_dmeans.append(np.average(accuracy_set_dmeans))
    results_dmeans_2.append(np.average(accuracy_set_dmeans_2))
    results_kmeans_original.append(np.average(accuracy_set_kmeans_original))
    results_dmeans_original.append(np.average(accuracy_set_dmeans_original))

# We test delta KMEANS on the NON DR dataset.

## Now we plot the accuracy for all the experiments on MNIST

In [None]:
import matplotlib.lines as mlines

# Prepend a (0 iterations, accuracy 0) origin point to every curve. The
# extended data lives in new names so that re-running this cell does not keep
# prepending zeros to `iterations_set` and the result lists.
plot_iterations = np.insert(iterations_set, 0, 0)

# (accuracies, line style, legend label). results_dmeans was computed with
# `delta` and results_dmeans_2 with `delta_2`; raw strings keep the LaTeX
# backslash intact and $...$ lets matplotlib render the delta.
curves = [
    (results_kmeans, 'b', "(LDA) k-means"),
    (results_dmeans, 'r', r"(LDA) q-means ($\delta$ = {})".format(delta)),
    (results_dmeans_2, 'y', r"(LDA) q-means ($\delta$ = {})".format(delta_2)),
    (results_kmeans_original, 'g', "k-means"),
]
# The no-DR delta curve is drawn only when it has one value per iteration
# budget, so the cell still runs if that series was never filled in.
if len(results_dmeans_original) == len(iterations_set):
    curves.append((results_dmeans_original, 'm',
                   r"$\delta$-k-means ($\delta$ = {})".format(delta_original)))

print(len(plot_iterations))
print(plot_iterations)
for accuracies, _, _ in curves:
    print([0] + list(accuracies))

fig, ax = plt.subplots()
for accuracies, line_style, label in curves:
    ax.plot(plot_iterations, [0] + list(accuracies), line_style, label=label)

ax.set_title('Clustering accuracy on MNIST')
ax.set_xlabel('Iterations ')
ax.set_ylabel('Accuracy')  # values are fractions in [0, 1], not percentages

ax.set_xlim(0, 37)
ax.set_ylim(0, 1)
ax.grid(True)

ax.legend(loc='lower right', ncol=1, shadow=True, fancybox=True);