In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import numpy.linalg as LA

In [2]:
df_train=pd.read_csv('fashionmnist/fashion-mnist_train.csv')
df_test=pd.read_csv('fashionmnist/fashion-mnist_test.csv')

In [3]:
# concat the above dfs and adjust the indices accordingly. They should be from 0 to 69999
df=pd.concat([df_train,df_test],ignore_index=True)

In [4]:
# Identify the columns with string values
string_cols = df.select_dtypes(include=['object']).columns
# Drop the columns with string values
df_numeric = df.drop(string_cols, axis=1)

In [5]:
df_numeric

Unnamed: 0,label,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783,pixel784
0,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,6,0,0,0,0,0,0,0,5,0,...,0,0,0,30,43,0,0,0,0,0
3,0,0,0,0,1,2,0,0,0,0,...,3,0,0,0,0,1,0,0,0,0
4,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,0,0,0,0,0,0,0,0,0,0,...,32,23,14,20,0,0,1,0,0,0
69996,6,0,0,0,0,0,0,0,0,0,...,0,0,0,2,52,23,28,0,0,0
69997,8,0,0,0,0,0,0,0,0,0,...,175,172,172,182,199,222,42,0,1,0
69998,8,0,1,3,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [6]:
def nearest(pt, Q):
    min_dist_sq = 10**18
    closest_center = 0
    pt = np.array(pt, dtype=np.float64)  # Convert pt to a NumPy array of float64
    for c in Q:
        c = np.array(c, dtype=np.float64)  # Convert c to a NumPy array of float64
        dist_sq = (LA.norm(c - pt))**2
        if dist_sq < min_dist_sq:
            min_dist_sq = dist_sq
            closest_center = c
    return closest_center, min_dist_sq

In [7]:
def kmeans_cost(Q, dataset, wt):
    cost = 0
    dic = {}
    for c in Q:
        dic[tuple(c)] = []
    for p in dataset.values:  # Use the numeric dataset
        c, dp = nearest(p, Q)
        cost += dp
        dic[tuple(c)].append(p)
    return cost, dic

In [8]:
def leverage_sampling(data, red_size):
	print("svd started")
	u, s, v = np.linalg.svd(data)
	print("svd done")
	u = u[:, :64]
	norms = []
	N = data.shape[0]
	for j in range(N):
		norms.append((tuple(data[j]), np.linalg.norm(u[j,:])**2))
	norms_sorted = sorted(norms, key=lambda x: x[1], reverse=True)
	reduced_set = []
	for j in range(red_size):
		reduced_set.append(list(norms_sorted[j][0]))
	return reduced_set

In [9]:
wt = {}
for pt in df:
	wt[tuple(pt)] = 1

In [10]:
from sklearn.cluster import KMeans
cluster_model = KMeans(n_clusters=25, init='k-means++', random_state=0).fit(df)
centers = cluster_model.cluster_centers_
mod_centers = []
for j in centers:
	mod_centers.append(tuple(j))

  super()._check_params_vs_input(X, default_n_init=10)


In [11]:
optimal_cost,dic = kmeans_cost(mod_centers,df_numeric,wt)

In [12]:
optimal_cost

116096773716.37888

In [13]:
# Assuming df_numeric is your original dataset
# Choose a subset size that fits in your available memory
subset_size = 10000

# Select a random subset of the dataset
subset_indices = np.random.choice(df_numeric.shape[0], size=subset_size, replace=False)
df_subset = df_numeric.iloc[subset_indices]

# Convert the DataFrame to a NumPy array
data_subset = df_subset.values

In [14]:
coreset_size = [400, 350, 300, 250, 200, 150]
coreset = leverage_sampling(data_subset, 500)
coreset = np.array(coreset)

svd started
svd done


In [15]:
for ssize in coreset_size:
    condensed_set = coreset[:ssize, :]
    print(condensed_set.shape)
    avg_cost = 0
    for itr in range(5):
        cluster_model = KMeans(n_clusters=25, init='k-means++', random_state=0).fit(condensed_set)
        centers = cluster_model.cluster_centers_
        mod_centers = []
        for j in centers:
            mod_centers.append(tuple(j))
        cost2, dic = kmeans_cost(mod_centers, df_subset, wt)
        avg_cost += cost2
    avg_cost = avg_cost / 5
    
    # Calculate optimal cost, reduction, and error
    reduction = ((df_subset.shape[0] - ssize) / df_subset.shape[0]) * 100
    error = (abs(avg_cost - optimal_cost) / optimal_cost) * 100
    
    print("optimal cost is --> ", optimal_cost)
    print("length of coreset --> ", ssize)
    print("sampling cost is --> ", avg_cost)
    print("reduction in dataset is --> ", reduction)
    print("error in clustering cost --> ", error)

(400, 785)


  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


optimal cost is -->  116096773716.37888
length of coreset -->  400
sampling cost is -->  27676388646.899097
reduction in dataset is -->  96.0
error in clustering cost -->  76.16093215948297
(350, 785)


  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


optimal cost is -->  116096773716.37888
length of coreset -->  350
sampling cost is -->  28386371835.8562
reduction in dataset is -->  96.5
error in clustering cost -->  75.54938787084359
(300, 785)


  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


optimal cost is -->  116096773716.37888
length of coreset -->  300
sampling cost is -->  29559533804.08214
reduction in dataset is -->  97.0
error in clustering cost -->  74.53888436530094
(250, 785)


  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


optimal cost is -->  116096773716.37888
length of coreset -->  250
sampling cost is -->  31477172745.348137
reduction in dataset is -->  97.5
error in clustering cost -->  72.88712533713814
(200, 785)


  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


optimal cost is -->  116096773716.37888
length of coreset -->  200
sampling cost is -->  32873154731.564796
reduction in dataset is -->  98.0
error in clustering cost -->  71.68469572472964
(150, 785)


  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


optimal cost is -->  116096773716.37888
length of coreset -->  150
sampling cost is -->  36490206750.31694
reduction in dataset is -->  98.5
error in clustering cost -->  68.56914659880086


In [17]:
# apply kmeans on this sampled points
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=10, random_state=0).fit(condensed_set)

  super()._check_params_vs_input(X, default_n_init=10)


In [22]:
# give me an accuracy score
from sklearn.metrics import accuracy_score
accuracy_score(df['label'], kmeans.predict(df))

# also give an F1 score
from sklearn.metrics import f1_score
f1_score(df['label'], kmeans.predict(df), average='weighted')



0.07942557604244514