In [1]:
import tensorflow as tf
from tensorflow import keras
import math
import numpy as np
from sklearn.cluster import KMeans


## Import data

In [2]:
# Load the MNIST train/test splits (images and digit labels).
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data(path="mnist.npz")
# Rescale the pixel values of both image arrays by 1/255.
x_train, x_test = (pixels / 255. for pixels in (x_train, x_test))

## Build Autoencoder

In [3]:
from tensorflow.keras.layers import *

In [4]:
class ClusteringLayer(keras.layers.Layer):
    """
    Clustering layer converts input sample (feature) to soft label.

    The soft label of a sample is its vector of cluster-membership probabilities,
    computed with a Student's t-distribution kernel centred on each cluster.

    # Example
    ```
        model.add(ClusteringLayer(n_clusters=10))
    ```
    # Arguments
        n_clusters: number of clusters.
        weights: list of Numpy array with shape `(n_clusters, n_features)` which represents the initial cluster centers.
        alpha: degrees of freedom parameter in Student's t-distribution. Default to 1.0.
    # Input shape
        2D tensor with shape: `(n_samples, n_features)`.
    # Output shape
        2D tensor with shape: `(n_samples, n_clusters)`.
    """

    def __init__(self, n_clusters, weights=None, alpha=1.0, **kwargs):
        # Translate the `input_dim` keyword into the equivalent `input_shape`.
        if 'input_shape' not in kwargs and 'input_dim' in kwargs:
            kwargs['input_shape'] = (kwargs.pop('input_dim'),)
        super(ClusteringLayer, self).__init__(**kwargs)
        self.n_clusters = n_clusters
        self.alpha = alpha
        self.initial_weights = weights
        self.input_spec = keras.layers.InputSpec(ndim=2)

    def build(self, input_shape):
        """Create the trainable `(n_clusters, n_features)` matrix of cluster centers."""
        assert len(input_shape) == 2
        input_dim = input_shape[1]
        self.input_spec = keras.layers.InputSpec(dtype=keras.backend.floatx(), shape=(None, input_dim))
        self.clusters = self.add_weight(shape=(self.n_clusters, input_dim), initializer='glorot_uniform', name='clusters')
        if self.initial_weights is not None:
            self.set_weights(self.initial_weights)
            # Release the initial arrays; keep the attribute so a repeated build() does not fail.
            self.initial_weights = None
        self.built = True

    def call(self, inputs, **kwargs):
        """ student t-distribution, as same as used in t-SNE algorithm.
                 q_ij = 1/(1+dist(x_i, µ_j)^2), then normalize it.
                 q_ij can be interpreted as the probability of assigning sample i to cluster j.
                 (i.e., a soft assignment)
        Arguments:
            inputs: the variable containing data, shape=(n_samples, n_features)
        Return:
            q: student's t-distribution, or soft labels for each sample. shape=(n_samples, n_clusters)
        """
        # Broadcast (n_samples, 1, n_features) against (n_clusters, n_features) and
        # reduce over the feature axis to get squared distances, shape (n_samples, n_clusters).
        squared_dist = tf.reduce_sum(tf.square(tf.expand_dims(inputs, axis=1) - self.clusters), axis=2)
        q = 1.0 / (1.0 + squared_dist / self.alpha)
        q **= (self.alpha + 1.0) / 2.0
        # Normalize each row so a sample's n_clusters values add up to 1.
        q = q / tf.reduce_sum(q, axis=1, keepdims=True)
        return q

    def compute_output_shape(self, input_shape):
        assert input_shape and len(input_shape) == 2
        return input_shape[0], self.n_clusters

    def get_config(self):
        """Return the layer config, including `alpha`, so a rebuilt layer keeps its kernel shape.

        The initial `weights` passed to the constructor are not part of the config.
        """
        config = {'n_clusters': self.n_clusters, 'alpha': self.alpha}
        base_config = super(ClusteringLayer, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

In [5]:
from tensorflow.keras.models import Model
import tensorflow.keras.backend as K
def encoders(input_shape=(28,28,1),n_classes=10):
    """Build a convolutional autoencoder and the encoder sub-model that shares its layers.

    Arguments:
        input_shape: shape of one input sample, passed to the `Input` layer.
        n_classes: number of units in the dense bottleneck layer that ends the encoder.
    Return:
        A tuple `(autoencoder, encoder)`: the model named 'AE' maps the input to the
        sigmoid reconstruction, the model named 'encoder' maps the same input to the
        bottleneck output.
    """
    def conv3x3(filters, name, activation='relu'):
        # Every convolution in this network uses the same kernel size, padding and data format.
        return Conv2D(filters, 3, padding='same', data_format='channels_last',
                      activation=activation, name=name)

    # Encoder: two convolutions, one pooling step, one more convolution, dense bottleneck.
    enc_input = Input(input_shape, name='input_layer')
    features = conv3x3(16, 'enc_1')(enc_input)
    features = conv3x3(32, 'enc_2')(features)
    features = MaxPool2D(padding='same')(features)
    features = conv3x3(32, 'enc_3')(features)
    # Per-sample shape of the last feature map; the decoder restores it before upsampling.
    feature_map_shape = K.int_shape(features)[1:]
    flattened = Flatten()(features)
    enc_output = Dense(n_classes, activation='relu')(flattened)

    # Decoder: expand the bottleneck back to the feature map, upsample, convolve to one channel.
    restored = Dense(np.prod(feature_map_shape), activation='relu', name='dec_input')(enc_output)
    restored = Reshape(feature_map_shape, name='dec_1')(restored)
    restored = UpSampling2D(data_format='channels_last', name='dec_2')(restored)
    restored = conv3x3(32, 'dec_3')(restored)
    restored = conv3x3(16, 'dec_4')(restored)
    dec_output = conv3x3(1, 'dec_output', activation='sigmoid')(restored)

    autoencoder = Model(enc_input, dec_output, name='AE')
    encoder_only = Model(enc_input, enc_output, name='encoder')
    return autoencoder, encoder_only

In [6]:
# Build the autoencoder and the encoder that shares its layers (arguments spelled out; same as the defaults).
AE, encoder = encoders(input_shape=(28, 28, 1), n_classes=10)

In the above line we created both the autoencoder, which extracts features from the images, and the encoder, which ends at our 10-unit dense layer so that the clustering layer can be appended to it later.

In [35]:
# Train the autoencoder to reconstruct its own input, then save the learned weights.
AE.compile(loss='mse', optimizer='adam')
AE.fit(x=x_train, y=x_train, epochs=64, batch_size=128)
AE.save_weights(filepath='weights')

Epoch 1/64
Epoch 2/64
Epoch 3/64
Epoch 4/64
Epoch 5/64
Epoch 6/64
Epoch 7/64
Epoch 8/64
Epoch 9/64
Epoch 10/64
Epoch 11/64
Epoch 12/64
Epoch 13/64
Epoch 14/64
Epoch 15/64
Epoch 16/64
Epoch 17/64
Epoch 18/64
Epoch 19/64
Epoch 20/64
Epoch 21/64
Epoch 22/64
Epoch 23/64
Epoch 24/64
Epoch 25/64
Epoch 26/64
Epoch 27/64
Epoch 28/64
Epoch 29/64
Epoch 30/64
Epoch 31/64
Epoch 32/64
Epoch 33/64
Epoch 34/64
Epoch 35/64
Epoch 36/64
Epoch 37/64
Epoch 38/64
Epoch 39/64
Epoch 40/64
Epoch 41/64
Epoch 42/64
Epoch 43/64
Epoch 44/64
Epoch 45/64
Epoch 46/64
Epoch 47/64
Epoch 48/64
Epoch 49/64
Epoch 50/64
Epoch 51/64
Epoch 52/64
Epoch 53/64
Epoch 54/64
Epoch 55/64
Epoch 56/64
Epoch 57/64
Epoch 58/64
Epoch 59/64
Epoch 60/64
Epoch 61/64
Epoch 62/64
Epoch 63/64
Epoch 64/64


<tensorflow.python.keras.callbacks.History at 0x1e75a5e2280>

In [7]:
# Compile the autoencoder and restore the weights stored under 'weights'.
AE.compile(loss='mse', optimizer='adam')
AE.load_weights(filepath='weights')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x21a9e69aaf0>

In [8]:
# This is the cluster model: the encoder with a ClusteringLayer stacked on its output.
# The cluster layer is still a bit of a black box at this point.
n_clusters = 10
cluster_layer = ClusteringLayer(n_clusters=n_clusters, name='clustering')(encoder.output)
clusterer = Model(encoder.input, cluster_layer)

In [9]:
# Print the layer-by-layer architecture of the cluster model.
clusterer.summary()

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_layer (InputLayer)     [(None, 28, 28, 1)]       0         
_________________________________________________________________
enc_1 (Conv2D)               (None, 28, 28, 16)        160       
_________________________________________________________________
enc_2 (Conv2D)               (None, 28, 28, 32)        4640      
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 14, 14, 32)        0         
_________________________________________________________________
enc_3 (Conv2D)               (None, 14, 14, 32)        9248      
_________________________________________________________________
flatten (Flatten)            (None, 6272)              0         
_________________________________________________________________
dense (Dense)                (None, 10)               

In [12]:
# Cluster the encoder's outputs for the training images with k-means.
# A fixed random_state makes the centroid initialisation, and therefore y_pred,
# repeatable across runs for the same encoder weights.
kmeans = KMeans(n_clusters=n_clusters, n_init=20, random_state=0)
y_pred = kmeans.fit_predict(encoder.predict(x_train))

In [13]:
from sklearn.metrics import normalized_mutual_info_score

# Score the k-means cluster assignments (y_pred) against the MNIST digit labels (y_train).
normalized_mutual_info_score(y_pred, y_train)

0.7146772228320138

The above metric is the normalized mutual information (NMI) score. It does not depend on which value is used to name each class: the label sets [0,0,1,1] and [1,1,0,0] score 1.0, because both describe the same grouping of the samples and differ only in the values assigned to each class.