In [None]:
import copy
import numpy as np
import sklearn
from sklearn.metrics import classification_report, multilabel_confusion_matrix
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import keras
import tensorflow as tf
from keras.utils import to_categorical
from keras.layers import Dense,Conv2D,Flatten,MaxPooling2D,Input,Reshape,UpSampling2D
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.metrics import confusion_matrix
import seaborn as sns
from sklearn.metrics.cluster import normalized_mutual_info_score
from keras.callbacks import History
from sklearn.linear_model import LogisticRegression
from keras.models import Model,Sequential
from sklearn.metrics import accuracy_score


# Fetch the Fashion MNIST train/test splits through the Keras dataset helper
fashion_mnist = keras.datasets.fashion_mnist
(X_train, Y_train), (X_test, Y_test) = fashion_mnist.load_data()

# Divide the pixel values by 255 and store the result as 32-bit floats
X_train = (X_train / 255.0).astype(np.float32)
X_test = (X_test / 255.0).astype(np.float32)

# Rescale each split by its own maximum value
X_train = X_train / X_train.max()
X_test = X_test / X_test.max()

In [None]:
#K-means clustering - Task 1
# Reload Fashion MNIST so this cell starts from the arrays returned by Keras
# rather than from whatever an earlier cell left in X_train / X_test.
(X_train, Y_train), (X_test,Y_test) = fashion_mnist.load_data()
# Number of distinct labels in the training split
clusters = len(np.unique(Y_train))

# Flatten every image into a single feature row (one row per sample)
nsamples_train, nx_train, ny_train = X_train.shape
X_train = X_train.reshape((nsamples_train, nx_train * ny_train))
nsamples_test, nx_test, ny_test = X_test.shape
X_test = X_test.reshape((nsamples_test, nx_test * ny_test))

#normalising the dataset
X_train = (X_train / 255.0)
X_test = (X_test / 255.0)

# Print the sum of squared errors (inertia) for 1..7 clusters.
# `n_jobs` is intentionally not passed: scikit-learn deprecated it for KMeans
# in 0.23 and removed it in 1.0, where passing it raises a TypeError.
for n in range(1,8):
  kmeans = KMeans(n_clusters=n, random_state=42, n_init=20, max_iter=600)
  kmeans.fit(X_train)
  sse = kmeans.inertia_
  print('SSE for #cluster = ', n, 'is',sse)

#Validating the model (Getting cluster labels)
# The SSE loop leaves `kmeans` at the last cluster count it tried, and calling
# fit_predict on the test data would re-fit the model on the test set.
# Fit a fresh model with one cluster per class on the training data and only
# assign the test samples to the learned centroids.
kmeans = KMeans(n_clusters=clusters, random_state=42, n_init=20, max_iter=600)
kmeans.fit(X_train)
labels = kmeans.predict(X_test)

# Centroid values
centroids = kmeans.cluster_centers_

# Observe and compare clustering result with actual label using confusion matrix
# (rows: true class, columns: raw cluster id)
y_test = copy.deepcopy(Y_test)
cm = confusion_matrix(Y_test, labels)

#visualize data
plt.figure(figsize=(5, 5))
ax = sns.heatmap(cm, annot=True, fmt="d")
# NOTE(review): presumably a workaround for the heatmap row clipping seen with
# matplotlib 3.1.1 - confirm it is still needed for the pinned version.
bottom, top = ax.get_ylim()
ax.set_ylim(bottom + 0.5, top - 0.5)
plt.title("Confusion matrix for simple K-means", fontsize=25)
plt.ylabel('True label', fontsize=20)
plt.xlabel('Clustering label', fontsize=20)
plt.show()

# NOTE: the value printed as "TESTING ACCURACY" is the normalized mutual
# information score, which does not depend on how cluster ids are numbered.
test_score = normalized_mutual_info_score(y_test,labels)
print('TESTING ACCURACY : '+str(test_score))

# Cluster ids are arbitrary, so comparing them directly with class ids is
# meaningless. Map every cluster to the majority training label among the
# training samples assigned to it before computing per-class metrics.
cluster_to_class = np.array([
    np.bincount(Y_train[kmeans.labels_ == cluster_id], minlength=clusters).argmax()
    for cluster_id in range(clusters)
])
print("CLASSIFICATION REPORT")
print("----------------------------------------------------")
print(classification_report(Y_test, cluster_to_class[labels]))

In [None]:
#building an Auto-encoder - Task 2
#reshape to (samples, height, width, channels) as expected by Conv2D
X_train = X_train.reshape((len(X_train), 28, 28, 1))
X_test = X_test.reshape((len(X_test), 28, 28, 1))

#input dimension = 784
# After the 4-D reshape, shape[1] alone is 28; multiply all per-sample
# dimensions to get the number of input values per image.
input_dim = int(np.prod(X_train.shape[1:]))
# Size of the flattened bottleneck of the autoencoder built below, which is
# reshaped back to (4, 4, 8) at the start of the decoder.
encoding_dim = 4 * 4 * 8

compression_factor = float(input_dim) / encoding_dim
print("Compression factor: %s" % compression_factor)

# Convolutional autoencoder: the encoder reduces a 28x28x1 image to a
# 4x4x8 feature map, the decoder upsamples it back to 28x28x1.
autoencoder = Sequential()
# Encoder Layers
autoencoder.add(Conv2D(16, (3, 3), activation='relu', padding='same', input_shape=X_train.shape[1:]))
autoencoder.add(MaxPooling2D((2, 2), padding='same'))   # 28x28 -> 14x14
autoencoder.add(Conv2D(8, (3, 3), activation='relu', padding='same'))
autoencoder.add(MaxPooling2D((2, 2), padding='same'))   # 14x14 -> 7x7
autoencoder.add(Conv2D(8, (3, 3), strides=(2,2), activation='relu', padding='same'))  # 7x7 -> 4x4

# Flatten encoding for visualization
# The layer is named explicitly because the encoder is later extracted with
# get_layer('flatten_1'); auto-generated layer names differ between Keras
# versions and change every time this cell is re-run in the same kernel.
autoencoder.add(Flatten(name='flatten_1'))
autoencoder.add(Reshape((4, 4, 8)))

# Decoder Layers
autoencoder.add(Conv2D(8, (3, 3), activation='relu', padding='same'))
autoencoder.add(UpSampling2D((2, 2)))                    # 4x4 -> 8x8
autoencoder.add(Conv2D(8, (3, 3), activation='relu', padding='same'))
autoencoder.add(UpSampling2D((2, 2)))                    # 8x8 -> 16x16
# No padding here on purpose: 16x16 shrinks to 14x14 so that the final
# upsampling lands exactly on 28x28.
autoencoder.add(Conv2D(16, (3, 3), activation='relu'))
autoencoder.add(UpSampling2D((2, 2)))                    # 14x14 -> 28x28
autoencoder.add(Conv2D(1, (3, 3), activation='sigmoid', padding='same'))

autoencoder.summary()


In [None]:
#extracting the encoder layer
# Find the bottleneck by layer type instead of by the auto-generated name
# 'flatten_1': that name depends on the Keras version and on how many Flatten
# layers have already been created in the running kernel.
bottleneck_layer = next(layer for layer in autoencoder.layers if isinstance(layer, Flatten))
# The encoder shares its layers (and therefore its weights) with the autoencoder
encoder = Model(inputs=autoencoder.input, outputs=bottleneck_layer.output)
encoder.summary()



# Fit the autoencoder to reproduce its own input; the test images serve as
# the validation set reported after every epoch.
autoencoder.compile(loss='binary_crossentropy', optimizer='adam')
autoencoder_train = autoencoder.fit(
    x=X_train,
    y=X_train,
    batch_size=128,
    epochs=50,
    validation_data=(X_test, X_test),
)

#plotting Training loss and Validation loss
loss = autoencoder_train.history['loss']
val_loss = autoencoder_train.history['val_loss']
# Derive the x axis from the recorded history so the plot stays correct if
# the number of training epochs changes.
epochs = range(len(loss))
plt.figure()
plt.plot(epochs, loss, 'b', label='Training loss')
# Use a single colour specification: combining the 'b' format string with
# color='red' makes matplotlib warn about a redundantly defined colour.
plt.plot(epochs, val_loss, 'r', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
#K-Means clustering using autoencoder
#getting the encoded version of train data
encoded_train = encoder.predict(X_train)

# Cluster the training set (fixed seed so the run is reproducible)
n_clusters = 10
kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(encoded_train)

#getting the encoded version of test data
encoded_imgs = encoder.predict(X_test)
# Assign test samples to the centroids learned on the training encodings.
# fit_predict would re-fit the model on the test set and throw that fit away.
clustered_set = kmeans.predict(encoded_imgs)
# Centroid values
centroids = kmeans.cluster_centers_

# Observe and compare clustering result with actual label using confusion matrix
# (rows: true class, columns: raw cluster id)
y_test = copy.deepcopy(Y_test)
cm = confusion_matrix(Y_test, clustered_set)

#visualize data
plt.figure(figsize=(5, 5))
ax = sns.heatmap(cm, annot=True, fmt="d")
# NOTE(review): presumably a workaround for the heatmap row clipping seen with
# matplotlib 3.1.1 - confirm it is still needed for the pinned version.
bottom, top = ax.get_ylim()
ax.set_ylim(bottom + 0.5, top - 0.5)
plt.title("Confusion matrix for K-means model", fontsize=25)
plt.ylabel('True label', fontsize=20)
plt.xlabel('Clustering label', fontsize=20)
plt.show()

# Y_pred keeps the raw cluster ids.
# NOTE: the value printed as "TESTING ACCURACY" is the normalized mutual
# information score, which does not depend on how cluster ids are numbered.
Y_pred = copy.deepcopy(clustered_set)
test_accuracy = normalized_mutual_info_score(y_test,Y_pred)
print('TESTING ACCURACY: '+str(test_accuracy))

# Cluster ids are arbitrary, so map every cluster to the majority training
# label among the training samples assigned to it before computing
# per-class precision / recall.
cluster_to_class = np.array([
    np.bincount(Y_train[kmeans.labels_ == cluster_id], minlength=n_clusters).argmax()
    for cluster_id in range(n_clusters)
])
print("CLASSIFICATION REPORT")
print("----------------------------------------------------")
print(classification_report(Y_test, cluster_to_class[Y_pred]))


In [None]:
#Gaussian Mixture Model using Autoencoder
#getting the encoded version of train data
encoded_imgs_train = encoder.predict(X_train)
# Fit the mixture on the training encodings (fixed seed for reproducibility)
n_components = 10
gmm = GaussianMixture(n_components=n_components, random_state=42).fit(encoded_imgs_train)

#getting the encoded version of test data
encoded_imgs_test = encoder.predict(X_test)
# Score the test encodings with the mixture learned on the training set.
# fit_predict would re-fit the model on the test data, so both the hard
# assignments and the probabilities below would come from a test-set fit.
y_pred_gmm = gmm.predict(encoded_imgs_test)
proba_lists = gmm.predict_proba(encoded_imgs_test)
print(y_pred_gmm.shape)
# Confusion matrix (rows: true class, columns: raw mixture component id)
y_test = copy.deepcopy(Y_test)
cm = confusion_matrix(Y_test, y_pred_gmm)

#visualize the data
plt.figure(figsize=(5, 5))
ax = sns.heatmap(cm, annot=True, fmt="d")
# NOTE(review): presumably a workaround for the heatmap row clipping seen with
# matplotlib 3.1.1 - confirm it is still needed for the pinned version.
bottom, top = ax.get_ylim()
ax.set_ylim(bottom + 0.5, top - 0.5)
plt.title("Confusion matrix for GMM model", fontsize=25)
plt.ylabel('True label', fontsize=20)
plt.xlabel('Clustering label', fontsize=20)
plt.show()

# Y_pred keeps the raw component ids.
# NOTE: the value printed as "TESTING ACCURACY" is the normalized mutual
# information score, which does not depend on how component ids are numbered.
Y_pred = copy.deepcopy(y_pred_gmm)
test_accuracy = normalized_mutual_info_score(y_test,y_pred_gmm)
print('TESTING ACCURACY: '+str(test_accuracy))

# Component ids are arbitrary, so map every component to the majority training
# label among the training samples assigned to it before computing per-class
# metrics. A component that receives no training sample falls back to class 0.
train_components = gmm.predict(encoded_imgs_train)
component_to_class = np.array([
    np.bincount(Y_train[train_components == component_id], minlength=n_components).argmax()
    for component_id in range(n_components)
])
print("CLASSIFICATION REPORT")
print("----------------------------------------------------")
print(classification_report(Y_test, component_to_class[Y_pred]))