In [44]:
import numpy as np
import numpy.linalg as LA
from sklearn.cluster import MiniBatchKMeans
from sklearn.externals import six
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
import timeit
from memory_profiler import memory_usage

class Vlad:
    def __init__(self, num_clusters = 8):
        self.num_clusters = num_clusters
        return

    def my_vlad(self, loc_desc, centroids, clusters):
        #print(centroids.shape, local_descriptors.shape)
        V = np.zeros([centroids.shape[0],loc_desc.shape[1]])
        local_descriptors = self.scaler.transform(loc_desc)
        #print(V.shape, centroids.shape, local_descriptors.shape)
        #distances = pairwise_distances(local_descriptors, centroids, metric='euclidean')
        #clusters = np.argmin(distances,axis=1)
        for iter, center in enumerate(centroids):
            points_belonging_to_cluster = local_descriptors[clusters == iter]
            V[iter] = np.sum(points_belonging_to_cluster - center, axis=0)
        V = V.reshape(1, V.shape[0]*V.shape[1])
        #print ('Vlad shape is ', V.shape)
        return V/LA.norm(V)

    def get_params(self, deep=True):
        return dict(num_clusters = self.num_clusters)

    def set_params(self, **params):
        if not params:
            return self
        for key, value in six.iteritems(params):
            split = key.split('__', 1)
            if len(split) > 1:
                print("length is greter than one ", split, value)
            else:
                print("length is one ", split, value)
                setattr(self, key, value)

    def fit(self, X, y=None):
        #print("in fit method", X.shape, y.shape, self.num_clusters)
        print "in fit method: X-"
        tmp = X.swapaxes(1,2)
        print tmp
        tmp = tmp.reshape(tmp.shape[0]*tmp.shape[1], tmp.shape[2])

        self.scaler = StandardScaler()
        self.scaler.fit(tmp)
        tmp = self.scaler.transform(tmp)

        kmeans = MiniBatchKMeans(init='k-means++', n_clusters=self.num_clusters, batch_size=1000)
        kmeans.fit(tmp)

        self.centers = kmeans.cluster_centers_
        self.clusters = kmeans.labels_
        print("shape of centers is ",self.centers.shape)
        return self

    def transform(self, X):
        print("in transform method", X.shape, self.num_clusters)
        X = X.swapaxes(1,2)
        tot_range = X.shape[0]
        print("X.shape is ", X.shape)
        out = np.empty((tot_range, self.centers.shape[0]*X.shape[2]))
        print("starting for loop")
        start_ind = 0
        for i in range(tot_range):
            out[i] = self.my_vlad(X[i], self.centers, self.clusters[start_ind:start_ind + 500])
            start_ind = start_ind + 500

        out = np.insert(out, 0, 1, axis=1)
        print("out.shape is ", out.shape)
        return out

In [45]:
def profile_memory_and_time(function, *args, **kwargs):
    start_time = timeit.default_timer()
    memory, return_val = memory_usage((function, (args), kwargs), max_usage=True, retval=True)
    elapsed = timeit.default_timer() - start_time
    return memory[0], elapsed,return_val


In [46]:
#minibatch=MiniBatchKMeans(n_clusters=1024,max_iter=100,batch_size=3072)
inputSizesToGenerate =[[2**11,2**2]] 
#[[2**8, 32],[2**10, 32],[2**12, 32],[2**14, 32],[2**16, 32],[2**18, 32],[2**20, 32],[2**22, 32],[2**24, 32]]
#,[2**26, 32],[2**28, 32],[2**30, 32],[2**32, 32]]

scaler = StandardScaler()
plt.ion()
times=[]
memoris=[]
numSamples=[]
    #centroid_plot.savefig('plus_centroid_plot.png')
    #centroid_plot.savefig('plus_centroid_plot.png')
num_dimension=32

for num_samples, num_cluster in inputSizesToGenerate:
    myVlad = Vlad(num_cluster)
    print "Running for {0} samples of dimension {1}".format(num_samples, num_dimension)
    X,y = make_blobs(n_samples=num_samples, n_features=num_dimension, centers=num_cluster)
    scaler.fit(X)
    X_scaled = scaler.transform(X)
    print X
    memory, time, rval = profile_memory_and_time(myVlad.fit,X)
    times.append(time)
    memoris.append(memory)
    numSamples.append(num_samples)
plt.scatter(numSamples, memoris,color="red",label="sklearn Kmeans")
plt.title("MEMORY")
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.show()
plt.savefig('memory.png')
plt.clf()
plt.scatter(numSamples, times,color="red",label="sklearn Kmeans")
plt.title("TIME")
plt.show()
plt.savefig('time.png')
print "Script ended. Results:"
print "numSamples: ",numSamples
print "vladMemory: ",memoris
print "vladTimes: ",times    

Running for 2048 samples of dimension 32
[[ -0.04891706  -6.62566446  -6.32454664 ...,  -2.41444381   0.26897432
    8.42518975]
 [  1.91794734   7.06581158  -1.54732982 ...,   0.88850454  -5.44064572
   -6.93485815]
 [  0.02953702  -4.84713559  -0.84230305 ...,  -7.42158033  -9.04460737
   -9.7263831 ]
 ..., 
 [  1.6438743   -4.92613846  -5.80285997 ...,  -1.70196687   1.53627599
    7.40410061]
 [  1.5616806   -4.8991518   -0.50734962 ..., -11.52878728 -10.93025235
   -7.57199572]
 [  0.77515772  -4.49888614  -2.61949948 ...,  -9.18554461  -9.33794819
  -10.26036523]]
in fit method: X.shape (2048, 32)
[[ -0.04891706   1.91794734   0.02953702 ...,   1.6438743    1.5616806
    0.77515772]
 [ -6.62566446   7.06581158  -4.84713559 ...,  -4.92613846  -4.8991518
   -4.49888614]
 [ -6.32454664  -1.54732982  -0.84230305 ...,  -5.80285997  -0.50734962
   -2.61949948]
 ..., 
 [ -2.41444381   0.88850454  -7.42158033 ...,  -1.70196687 -11.52878728
   -9.18554461]
 [  0.26897432  -5.44064572  -9.

ValueError: Number of samples smaller than number of clusters.