In [5]:
import numpy as np
import numpy.linalg as LA
from sklearn.cluster import MiniBatchKMeans
from sklearn.externals import six
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import pairwise_distances
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
import timeit
from memory_profiler import memory_usage
import math

class Bofw:
    def __init__(self, num_clusters = 8):
        self.num_clusters = num_clusters
        return

    def my_bofw(self, loc_desc, centroids, clusters):
        B = np.zeros(centroids.shape[0])
        local_descriptors = self.scaler.transform(loc_desc)
        for iter, center in enumerate(centroids):
            points_belonging_to_cluster = local_descriptors[clusters == iter]
            B[iter] = points_belonging_to_cluster.shape[0]
        return B

    def get_params(self, deep=True):
        return dict(num_clusters = self.num_clusters)

    def set_params(self, **params):
        if not params:
            return self
        for key, value in six.iteritems(params):
            split = key.split('__', 1)
            if len(split) > 1:
                print("length is greter than one ", split, value)
            else:
                print("length is one ", split, value)
                setattr(self, key, value)

    def fit(self, X, y=None):
        tmp = X.swapaxes(1,2)
        tmp = tmp.reshape(tmp.shape[0]*tmp.shape[1], tmp.shape[2])

        self.scaler = StandardScaler()
        self.scaler.fit(tmp)
        tmp = self.scaler.transform(tmp)
        kmeans = MiniBatchKMeans(init='k-means++', n_clusters=self.num_clusters, batch_size=1000)
        kmeans.fit(tmp)
        self.centers = kmeans.cluster_centers_
        self.clusters = kmeans.labels_
        print("shape of centers is ",self.centers.shape)
        return self

    def transform(self, X):
        print("in transform method", X.shape, self.num_clusters)
        X = X.swapaxes(1,2)
        tot_range = X.shape[0]
        print("X.shape is ", X.shape)
        out = np.empty((tot_range, self.centers.shape[0]))
        print("starting for loop")
        print tot_range
        start_ind = 0
        for i in range(tot_range):
            out[i] = self.my_bofw(X[i], self.centers, self.clusters[start_ind:start_ind + 500])
            start_ind = start_ind + 500
        out = np.insert(out, 0, 1, axis=1)
        return out

In [6]:
def profile_memory_and_time(function, *args, **kwargs):
    start_time = timeit.default_timer()
    memory, return_val = memory_usage((function, (args), kwargs), max_usage=True, retval=True)
    elapsed = timeit.default_timer() - start_time
    return memory[0], elapsed,return_val

In [7]:
inputSizesToGenerate =[[2**8,2**2],[2**9,2**2],[2**10,2**2],[2**11,2**2]] 
scaler = StandardScaler()
times=[]
memoris=[]
numSamples=[]
num_dimension=32

N_COMPONENT = 2
DATA_DIR = "../data/processed"
subjects = range(1, 5)

X =  np.concatenate([np.load("{0}/{1}/subj{2}_train_data.npy".format(DATA_DIR, N_COMPONENT, subject)) for subject in subjects])
print X.shape

(4608, 32, 500)


In [10]:

for num_samples, num_cluster in inputSizesToGenerate:
    X=X[:num_samples]
    myBofw = Bofw(num_cluster)
    print "Running for {0} samples of dimension {1}".format(num_samples, num_dimension)
    memory, time, rval = profile_memory_and_time(myBofw.fit,X)
    times.append(time)
    memoris.append(memory)
    numSamples.append(math.log(num_samples,10))
    
plt.scatter(numSamples, memoris,color="red")
plt.title("MEMORY")
#plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.savefig('memory.png')
plt.clf()
plt.scatter(numSamples, times,color="red")
plt.title("TIME")
plt.savefig('time.png')
print "Script ended. Results:"
print "numSamples: ",numSamples
print "bofwMemory: ",memoris
print "bofwTimes: ",times    

Running for 256 samples of dimension 32
('shape of centers is ', (4, 32))
Running for 512 samples of dimension 32
('shape of centers is ', (4, 32))
Running for 1024 samples of dimension 32
('shape of centers is ', (4, 32))
Running for 2048 samples of dimension 32
('shape of centers is ', (4, 32))
Script ended. Results:
numSamples:  [2.408239965311849, 2.7092699609758304, 3.0102999566398116, 3.311329952303793, 2.408239965311849, 2.7092699609758304, 3.0102999566398116, 3.311329952303793]
bofwMemory:  [652.87890625, 685.76953125, 686.609375, 686.609375, 701.08984375, 701.0, 700.91015625, 700.91015625]
bofwTimes:  [0.494920015335083, 0.4036719799041748, 0.42351484298706055, 0.5894830226898193, 0.4510960578918457, 0.44356608390808105, 0.43247199058532715, 0.32931995391845703]
