In [4]:
import os
import time

import numpy as np
from scipy.io import loadmat
from sklearn.cluster import KMeans
from sklearn.metrics import normalized_mutual_info_score as nmi
from sklearn.metrics.cluster import adjusted_rand_score as acc

In [5]:
# Location of USPS_sub.mat. Set the USPS_SUB_PATH environment variable to point
# at your own copy; when it is unset, the original author's path is used.
data_path = os.environ.get(
    'USPS_SUB_PATH',
    '/Users/bharath/Documents/ADM/Assignment5/Code&Data/USPS_sub.mat',
)
digits = loadmat(data_path)

# The feature matrix and the labels are read from the 'data' and 'label' keys
X = digits['data']
y = digits['label']

# Flatten the labels to a 1D array
y = np.ravel(y)

# Fail early, with a readable message, if the file does not hold exactly one
# label per row of X; the clustering metrics below compare y against one
# predicted label per row.
assert X.shape[0] == y.shape[0], (
    f"expected one label per sample, got X with {X.shape[0]} rows "
    f"and {y.shape[0]} labels"
)




In [6]:
# Number of K-means runs to average over; the trial loop iterates
# range(n_trials) and uses each trial index as that run's random_state.
n_trials = 5

In [7]:
# Per-trial K-means results: one entry is appended to each list per trial
accs = []
nmis = []
running_times = []

for trial in range(n_trials):
    # The trial index is used as the seed, so each run is individually
    # reproducible while the runs differ from one another.
    # NOTE(review): n_init is left at the library default, which is not the
    # same across scikit-learn releases - pin it explicitly if the numbers
    # must be comparable between environments.
    kmeans = KMeans(n_clusters=10, random_state=trial)

    # Time only the fit. perf_counter() is a monotonic, high-resolution clock
    # meant for measuring intervals; a difference of time.time() readings can
    # be distorted by system clock adjustments.
    start_time = time.perf_counter()
    kmeans.fit(X)
    end_time = time.perf_counter()
    running_time = end_time - start_time

    # `acc` is the import alias of adjusted_rand_score, so acc_value is an
    # adjusted Rand index, not a label-matching accuracy.
    acc_value = acc(y, kmeans.labels_)
    nmi_value = nmi(y, kmeans.labels_)

    # Append results
    accs.append(acc_value)
    nmis.append(nmi_value)
    running_times.append(running_time)



In [8]:
# Mean and standard deviation (np.std with its default arguments) of each
# metric collected in accs, nmis and running_times.
avg_acc, std_acc = np.mean(accs), np.std(accs)
avg_nmi, std_nmi = np.mean(nmis), np.std(nmis)
avg_running_time, std_running_time = (
    np.mean(running_times),
    np.std(running_times),
)

In [9]:
# Report mean and standard deviation for each K-means metric.
# NOTE(review): the "Acc" figures come from `acc`, which is the import alias
# of adjusted_rand_score - the label is kept as-is here.
print("K-means Clustering Results:")
for label, mean_value, std_value in (
    ("Acc", avg_acc, std_acc),
    ("NMI", avg_nmi, std_nmi),
    ("Running Time", avg_running_time, std_running_time),
):
    print(f"Average {label}: {mean_value}, Std {label}: {std_value}")


K-means Clustering Results:
Average Acc: 0.31728887181175935, Std Acc: 0.02397677401492886
Average NMI: 0.4785060090771195, Std NMI: 0.02636470160554388
Average Running Time: 0.14132413864135743, Std Running Time: 0.02686845716306815


In [10]:
from scipy.cluster.hierarchy import linkage, fcluster

# Linkage methods to compare; each one is passed to linkage() as `method`
criteria = ['single', 'complete', 'weighted', 'ward']

for criterion in criteria:
    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # intervals; a difference of time.time() readings can be distorted by
    # system clock adjustments.
    start_time = time.perf_counter()

    # Build the hierarchy on Euclidean distances with the current method
    Z = linkage(X, method=criterion, metric='euclidean')

    # Cut the dendrogram to get 10 clusters
    cluster_labels = fcluster(Z, t=10, criterion='maxclust')

    # The timed interval covers both building the linkage and cutting it
    end_time = time.perf_counter()
    running_time = end_time - start_time

    # `acc` is the import alias of adjusted_rand_score, so acc_value is an
    # adjusted Rand index, not a label-matching accuracy.
    acc_value = acc(y, cluster_labels)
    nmi_value = nmi(y, cluster_labels)

    # Print results
    print(f"\nHierarchical Clustering ({criterion.capitalize()}):")
    print(f"Acc: {acc_value}")
    print(f"NMI: {nmi_value}")
    print(f"Running Time: {running_time}")



Hierarchical Clustering (Single):
Acc: 1.5502612709029693e-05
NMI: 0.01752436468371601
Running Time: 0.07271289825439453

Hierarchical Clustering (Complete):
Acc: 0.21759559613437726
NMI: 0.42646243629848035
Running Time: 0.049822092056274414

Hierarchical Clustering (Weighted):
Acc: 0.19564417810688814
NMI: 0.40202703094583164
Running Time: 0.04734921455383301

Hierarchical Clustering (Ward):
Acc: 0.3780245398844174
NMI: 0.5531500593557892
Running Time: 0.04751276969909668
