TA's code

In [None]:
# Imported at cell level (not inside the function) because the `np.ndarray`
# annotations below are evaluated when the `def` statement runs.
import numpy as np
from scipy.stats import entropy


def evaluate(true_labels: np.ndarray, pred_labels: np.ndarray) -> tuple:
    """Entropy-based evaluation of a label assignment.

    Builds the confusion matrix between the ground-truth classes (rows) and the
    predicted labels (columns), computes the base-2 entropy of every row and
    every column, and averages those entropies weighted by row/column size.
    The per-class and per-cluster entropies are printed, not returned.

    Parameters:
      true_labels: the ground-truth class labels on the input data.
      pred_labels: the predicted class labels on the input data. Every
        predicted value must also occur in `true_labels`.

    Returns:
      a tuple (CM, (w_class_entropy, w_cluster_entropy, w_entropies)) containing
      the confusion matrix `CM` (rows: true classes, columns: predicted labels,
      both ordered like the sorted unique true labels), the weighted average
      class entropy, the weighted average cluster entropy, and the array
      `[w_class_entropy, w_cluster_entropy]`.

    Raises:
      AssertionError: if the inputs differ in length, are empty, or if a
        predicted label never occurs among the true labels.
    """
    assert len(true_labels) == len(pred_labels), "Label predictions don't match"
    assert len(true_labels) > 0, "Cannot evaluate an empty label assignment"

    t_classes, t_idx = np.unique(true_labels, return_inverse=True)
    p_classes, p_inverse = np.unique(pred_labels, return_inverse=True)
    assert np.all(np.isin(p_classes, t_classes)), "Predicted class outside of labels given"

    # Column index = position of the predicted value inside `t_classes`.
    # Using the inverse indices of `pred_labels` directly would shift columns
    # whenever the predictions use only a subset of the true classes.
    p_idx = np.searchsorted(t_classes, p_classes)[p_inverse]

    n_classes = len(t_classes)
    CM = np.zeros(shape=(n_classes, n_classes), dtype=np.uint32)
    # Unbuffered accumulation so repeated (row, column) pairs are all counted.
    np.add.at(CM, (t_idx, p_idx), 1)

    def _entropy_bits(counts):
        """Base-2 entropy of a count vector; an all-zero vector scores 0."""
        total = counts.sum()
        # A label that was never predicted has no distribution; scoring it 0
        # (instead of NaN) keeps the weighted averages finite.
        return entropy(counts / total, base=2) if total > 0 else 0.0

    cluster_entropy = np.apply_along_axis(_entropy_bits, 0, CM)  # one per column
    class_entropy = np.apply_along_axis(_entropy_bits, 1, CM)    # one per row

    # Weight every entropy by the number of samples in its column / row.
    N = len(true_labels)
    w_cluster_entropy = np.sum(cluster_entropy * CM.sum(axis=0)) / N
    w_class_entropy = np.sum(class_entropy * CM.sum(axis=1)) / N
    w_entropies = np.array([w_class_entropy, w_cluster_entropy])

    with np.printoptions(precision=3):
        print(f"Class Entropies: {class_entropy}")
        print(f"Cluster Entropies: {cluster_entropy}")
        print(f"Weighted average entropies: {w_entropies}, (avg: {np.mean(w_entropies):.3f})")

    return CM, (w_class_entropy, w_cluster_entropy, w_entropies)


Gaussian Mixture (soft k-means)

In [None]:
import numpy as np
from scipy.ndimage import gaussian_filter
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture
from scipy.stats import entropy

# Load the CSV: the first column holds the class label, the remaining columns
# are reshaped into 28x28 float32 images.
data = np.loadtxt('pb1data_XW_8358.txt', delimiter=',')
labels = data[:, 0].astype(int)
pixels = data[:, 1:].astype(np.float32).reshape(-1, 28, 28)

# Blur each image with a Gaussian filter (sigma=1), then flatten back to rows.
denoised = np.array([gaussian_filter(image, sigma=1) for image in pixels])
X_blur = denoised.reshape(-1, 28 * 28)

# Standardise the features before the PCA projection.
scaler = StandardScaler()
X_scaled = scaler.fit(X_blur).transform(X_blur)

# n_components=0.4 asks PCA for a variance fraction rather than a fixed count.
pca = PCA(whiten=True, n_components=0.4, random_state=42)
X_pca = pca.fit_transform(X_scaled)
print(f"PCA dims = {X_pca.shape[1]} components")

# Fit a 10-component full-covariance mixture and take the hard assignment.
gmm = GaussianMixture(n_components=10, covariance_type='full',
                      n_init=10, max_iter=500, random_state=42)
pred = gmm.fit_predict(X_pca)

# `evaluate` returns the weighted class / cluster entropies and their array.
CM, (cs_e, cr_e, w_entropies) = evaluate(labels, pred)
print("Weighted entropies: class={:.3f}, cluster={:.3f}, avg={:.3f}".format(
    w_entropies[0], w_entropies[1], np.mean(w_entropies)))

PCA dims = 16 components
Confusion Matrix:
 [[  0  26   0   0   0  12   3   6  57 568]
 [679  31   6 843   3   7   5   1   5   0]
 [  1  30   7   1   4 195  24   6 219   3]
 [  2 562  21  10  18 121   0   4 278   2]
 [  2   5 452   1 413  32  29  11  14   2]
 [  0 374   9   4  14 108   6 626  42  30]
 [  2  23   1   8   0  31 618   9  15  21]
 [  4  11 125   7 338   9   1   1  19   0]
 [  6 187  12   8  43  61   4  44 126   4]
 [  1  22 266   8 360  19   0   2   7   1]]
Label Entropies:   [0.888 1.26  1.812 1.7   1.601 1.818 1.001 1.464 2.438 1.515]
Cluster Entropies: [0.238 2.116 1.804 0.442 1.956 2.666 0.705 0.791 2.422 0.659]
Weighted Entropies (class,cluster): [1.517 1.454] (avg: 1.485)

Weighted entropies: class=1.517, cluster=1.454, avg=1.485


K-means

In [None]:
import numpy as np
from scipy.ndimage import gaussian_filter
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

# Load the CSV: the first column holds the class label, the remaining columns
# are reshaped into 28x28 float32 images.
data = np.loadtxt('pb1data_XW_8358.txt', delimiter=',')
labels = data[:, 0].astype(int)
pixels = data[:, 1:].astype(np.float32).reshape(-1, 28, 28)

# Blur each image with a Gaussian filter (sigma=1), then flatten back to rows.
denoised_imgs = np.array([gaussian_filter(image, sigma=1) for image in pixels])
X_blur = denoised_imgs.reshape(-1, 28 * 28)

# Standardise the features before the PCA projection.
scaler = StandardScaler()
X_scaled = scaler.fit(X_blur).transform(X_blur)

# n_components=0.90 asks PCA for a variance fraction rather than a fixed count.
pca = PCA(whiten=True, n_components=0.90, random_state=42)
X_pca = pca.fit_transform(X_scaled)
print(f"PCA dims = {X_pca.shape[1]} components")

# Cluster into 10 groups, keeping the best of 100 initialisations.
kmeans = KMeans(n_clusters=10, n_init=100, max_iter=500, random_state=42)
labels_km = kmeans.fit_predict(X_pca)

# `evaluate` returns the weighted class / cluster entropies and their array.
CM_km, (cs_e, cr_e, w_entropies) = evaluate(labels, labels_km)
print("KMeans avg entropy:", np.mean(w_entropies))

PCA dims = 113 components
Confusion Matrix:
 [[  1   6  14   0 357   3  30  50 210   1]
 [  6   2   9 786   0   3  10   1   5 758]
 [ 11  10 326   4   4   7   8  33  67  20]
 [ 15  12  32  15   0  13  16 302 608   5]
 [283 209  32   6   4 369  24   4   3  27]
 [ 20  14  19  17   4  37 465 310 319   8]
 [  2   0 598   5  40   1   1   7  63  11]
 [ 39 240   5  12   1 185  10   0   7  16]
 [ 14  51   8  13   2  44  63  95 171  34]
 [155 181   3  12   3 305   2   3  17   5]]
Label Entropies:   [1.728 1.191 1.769 1.588 2.107 2.121 1.023 1.885 2.691 1.925]
Cluster Entropies: [1.963 2.188 1.655 0.733 0.815 2.092 1.49  2.002 2.322 0.989]
Weighted Entropies (class,cluster): [1.734 1.702] (avg: 1.718)

KMeans avg entropy: 1.7181191048421192


Answer: 1.4 (average weighted entropy) was the best I could get.