In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist
from sklearn.metrics import rand_score
import time
import scipy.io

In [14]:
def projection(A, mat):
	u, s, v = np.linalg.svd(mat)
	if u.shape[1] < mat.shape[1]:
		u_padded = np.zeros((u.shape[0], mat.shape[1])) 
		u_padded[:, :u.shape[1]] = u
		u = u_padded
	u = u[:, :mat.shape[1]]
	component = (np.dot(u.T, np.dot(u, A.T))).T
	ans = A - component
	return ans

def volume_samp(A, t, s):
	E = A
	N = E.shape[0]
	S = {-1}
	for _ in range(t):
		probabilities = np.sum(E**2, axis=1) / np.sum(E**2)
		T_index = np.random.choice(N, int(s), p=probabilities)
		S.update(T_index)
		if -1 in S:
			S.remove(-1)
		S_list = list(S)
		E = projection(A, A[S_list])
	probabilities = np.sum(E**2, axis=1) / np.sum(E**2)
	probabilities = 1 / probabilities
	probabilities = probabilities[list(S)]
	probabilities = probabilities / np.sum(probabilities)
	return list(S), probabilities

In [15]:
def kmeans_cost(data, centers, labels):
    cost = 0.0
    for i in range(len(data)):
        distance = np.linalg.norm(data[i] - centers[labels[i]]) ** 2
        cost += distance
    return cost

In [17]:
def kmediod(data, weights, k, max_iterations=500):
    data = np.asarray(data)
    
    mins = data.min(axis=0)
    maxs = data.max(axis=0)
    centroids = np.random.rand(k, data.shape[1]) * (maxs - mins) + mins
    
    for _ in range(max_iterations):
        dist = cdist(data, centroids, 'cityblock')
        weighted_dist = dist * weights[:, np.newaxis]
        labels = np.argmin(weighted_dist, axis=1)
        
        for j in range(k):
            cluster = labels == j
            if weights[cluster].sum() > 0:
                centroids[j] = np.average(data[cluster], axis=0, weights=weights[cluster])
            else:
                centroids[j] = np.random.rand(1, data.shape[1]) * (maxs - mins) + mins
    
    return centroids

def predict(data, centroids):
    dist = cdist(data, centroids, 'cityblock')
    labels = np.argmin(dist, axis=1)
    return labels

def kmedoids_cost(data, centroids, labels):
    dist = cdist(data, centroids, 'cityblock')
    total_cost = np.sum(dist[np.arange(len(data)), labels])
    return total_cost

In [16]:
from tqdm import tqdm

def get_results(coreset_size, t_vals, n_clusters, X, optimal_labels, cost, data_name):
    results = []
    for ssize in coreset_size:
        for t in tqdm(t_vals, desc="Processing t values"):
            s = ssize//t
            avg_cost = 0
            start_time = time.time()
            rand_index = 0.0
            for _ in range(5):
                indices, weights = volume_samp(X, t, s)
                X_sample = X[indices]
                kmeans = KMeans(n_clusters=n_clusters, init='k-means++').fit(X_sample, sample_weight=weights)
                labels = kmeans.predict(X)
                centers = kmeans.cluster_centers_
                avg_cost += kmeans_cost(X, centers, labels)
                rand_index += rand_score(optimal_labels, labels)
            rand_index /= 5
            end_time = time.time()
            avg_cost /= 5
            reduction = ((X.shape[0] - X_sample.shape[0])/X.shape[0])*100
            error = (abs(avg_cost - cost)/cost)*100
            results.append({'Sampling Type': 'Volume Sampling (Weighted)',
                                'Coreset Size': X_sample.shape[0],
                                'Average Cost': avg_cost,
                                'Reduction in Data Size': reduction,
                                'Error': error,
                                'Avg Rand Index': rand_index,
                                'Data': data_name,
                                'Optimal Cost': cost,
                                'Avg Time': (end_time - start_time)/5,
                                'Num Iterations Volume Sampling': t})
    return results


In [18]:
from tqdm import tqdm

def get_results(coreset_size, t_vals, n_clusters, X, data_name):
    centers = kmediod(X, np.ones(X.shape[0]), n_clusters)
    optimal_labels = predict(X, centers)
    cost = kmedoids_cost(X, centers, optimal_labels)
    results = []
    for ssize in tqdm(coreset_size):
        for t in t_vals:
            s = ssize//t
            avg_cost = 0
            start_time = time.time()
            rand_index = 0.0
            for _ in range(5):
                indices, weights = volume_samp(X, t, s)
                X_sample = X[indices]
                centers = kmediod(X_sample, weights, n_clusters)
                labels = predict(X, centers)
                avg_cost += kmedoids_cost(X, centers, labels)
                rand_index += rand_score(optimal_labels, labels)
            rand_index /= 5
            end_time = time.time()
            avg_cost /= 5
            reduction = ((X.shape[0] - X_sample.shape[0])/X.shape[0])*100
            error = (abs(avg_cost - cost)/cost)*100
            results.append({'Sampling Type': 'Volume Sampling (Weighted)',
                                'Coreset Size': X_sample.shape[0],
                                'Average Cost': avg_cost,
                                'Reduction in Data Size': reduction,
                                'Error': error,
                                'Avg Rand Index': rand_index,
                                'Data': data_name,
                                'Optimal Cost': cost,
                                'Avg Time': (end_time - start_time)/5,
                                'Num Iterations Volume Sampling': t,
                                'Clustering Algorithm': 'KMedoids'})
    return results


In [28]:
mat_data = scipy.io.loadmat('olivettifaces.mat')
traindata = mat_data['faces'].T
traindata = pd.DataFrame(traindata)
traindata.dropna()
traindata.drop_duplicates()
traindata.shape
results = get_results([10, 20, 50, 70, 100], [1, 2, 5], 10, traindata.values, 'Face')

  0%|          | 0/5 [00:00<?, ?it/s]



 20%|██        | 1/5 [00:20<01:20, 20.13s/it]



 40%|████      | 2/5 [00:45<01:08, 22.96s/it]











































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































 60%|██████    | 3/5 [01:31<01:07, 33.59s/it]











































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































 80%|████████  | 4/5 [02:30<00:43, 43.87s/it]











































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































100%|██████████| 5/5 [03:53<00:00, 46.60s/it]


In [29]:
df = pd.read_csv('results.csv')
df = pd.concat([df, pd.DataFrame(results)], ignore_index=True)
df.to_csv('results.csv', index=False)