# Image classification with MNIST data, solutions to exercises


## Loading and visualizing the data

First, let's load the dataset using keras helpers and visualize some images using pyplot

In [None]:
#The pylab inline below is something you may need to make images and plots visible in Jupyter, depending on your Anaconda setup
%pylab inline  
import numpy as np
import matplotlib.pyplot as pp
import os
os.environ["CUDA_VISIBLE_DEVICES"]="-1" #disable Tensorflow GPU usage, a simple example like this runs faster on CPU
import tensorflow as tf
from tensorflow import keras 

#Load the MNIST images and labels, already split into training and test parts
mnist = keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()

#Map the pixel intensities from 0...255 to 0...1.
#A plain division is enough here, so we don't need a StandardScaler
x_train = x_train / 255.0
x_test = x_test / 255.0

#Sanity check: x_train should be a 3D tensor,
#holding 60000 instances of 2D tensors of 28x28 pixels
print("shape: ",x_train.shape)

#The keras layers used later need the pixel channel count spelled out as the fourth dimension,
#so we append a trailing dimension of length 1 to the existing shape
x_train = x_train.reshape(x_train.shape + (1,))
print("new shape: ",x_train.shape)

#The test images get the same treatment
x_test = x_test.reshape(x_test.shape + (1,))


#Show the first eight training images side by side
pp.figure(1)
#imshow expects 2D tensors, so keep full width and height but pick the first and only color channel
firstDigits = x_train[:8, :, :, 0]
for column, digit in enumerate(firstDigits, start=1):
    #subplot positions are counted from 1, hence the start=1 above
    pp.subplot(1, 8, column)
    pp.imshow(digit)


# Exercise 1: Adding more layers to the fully connected network
The first exercise was to simply add more layers. We only need one line of code per layer

In [None]:
#Let's import the layer types we need
from tensorflow.keras.layers import Dense   #fully connected layer
from tensorflow.keras.layers import Flatten #converts images to vectors of numbers

#As before, the architecture is a simple sequential (multilayer) one.
#Here the layers are handed to Sequential as a list, in order from input to output
model = keras.models.Sequential([
    #Fully connected layers expect a batch of 1D data, whereas our images are multidimensional.
    #Flatten does the conversion, e.g., by stacking the rows of an image after each other.
    #With non-image data this step would not be needed.
    Flatten(),
    #The extra hidden layer of this exercise. Try other neuron and layer counts and
    #see how the neuron weights and the classification accuracy change
    Dense(64, activation='relu'),
    #Fully connected output layer with one neuron for each of the 10 classes.
    #Softmax is the activation to use for classification:
    #each output neuron can then be thought of as the probability of a class.
    Dense(10, activation='softmax'),
])

#Compile the model. The labels are class indices rather than one-hot vectors,
#which is why the loss is sparse_categorical_crossentropy and not categorical_crossentropy
model.compile(optimizer=keras.optimizers.Adam(),
              loss=keras.losses.sparse_categorical_crossentropy,
              metrics=['accuracy'])

#Train the network, using the test set as validation data
model.fit(x_train, y_train,
          validation_data=(x_test, y_test),
          epochs=5,
          batch_size=32,
          verbose=1)

We can visualize the first layer weights similarly to before. Note that the second layer has as many inputs as the previous layer has neurons, so its inputs (and therefore its weights) can no longer be interpreted as images. You could, however, try investigating those weights by synthesizing (optimizing) an input image that maximally excites a neuron. This is one of the exercises in the [Adversarial MNIST](AdversarialMNIST.ipynb) tutorial.

In [None]:
#Visualize some of the first layer neuron weights
#First, query the weights. We use index 1 because index 0 is the flatten layer.
#get_weights() returns a list, of which we use the first element as the weight tensor
weights=model.layers[1].get_weights()[0]
#Show at most 10 neurons, but never more than the layer actually has.
#Without the min(), a layer with fewer than 10 neurons would cause an IndexError in the loop below
nNeuronsToVisualize=min(10,weights.shape[1])
#Create a figure with appropriate size: 2 units of width per neuron
pp.figure(1,figsize=[nNeuronsToVisualize*2,2])
#Loop over the neurons
for i in range(nNeuronsToVisualize):
    #Weights is a 2D tensor where the first dimension indexes over data variables, second over neurons,
    #so column i holds the weights of neuron i.
    #We must reshape the column back to the 28x28 pixel layout to show it as an image
    image=np.reshape(weights[:,i],[28,28])
    #Now we can display
    pp.subplot(1,nNeuronsToVisualize,1+i)
    pp.title("Neuron {}".format(i))
    pp.imshow(image)

## A convolutional neural network 

For the rest of the exercises, we again train a convolutional neural network, which gives better classification accuracy.

In [None]:
#Let's import the layer types we need
from tensorflow.keras.layers import Dense   #fully connected layer
from tensorflow.keras.layers import Conv2D  #convolutional layer with 2D filters (for audio you would use 1D)
from tensorflow.keras.layers import Dropout #this mitigates overfitting
#Flatten is imported here too, so that this cell does not depend on the fully connected network cell having been run
from tensorflow.keras.layers import Flatten #converts images to vectors of numbers

#As before, we use a simple sequential, i.e., multilayer architecture
model = keras.models.Sequential()

#Instead of using fully connected layers like before, we use convolutional ones.
#We use 5x5 pixel features, and use strides of 2x2 to roughly halve the resolution in each layer.
#No padding argument is given, so Conv2D uses its default 'valid' padding (no padding), and
#the output size is floor((input size - kernel size) / stride) + 1.
#First layer: 28x28 -> 12x12 pixels
model.add(Conv2D(16, kernel_size=(5, 5), strides=[2,2],
                 activation='relu',
                 input_shape=(28,28,1,)))
#Second layer: 12x12 -> 4x4 pixels
model.add(Conv2D(32, (5, 5), activation='relu', strides=[2,2]))
#After the previous two layers, we are at 4x4 pixel resolution instead of the original 28x28 pixels.
#Thus, 5x5 filters are no longer possible, as they would be larger than the whole image.
#We use 3x3 filters instead: 4x4 -> 1x1 pixels
model.add(Conv2D(32, (3, 3), activation='relu', strides=[2,2]))

#Now, we are at 1x1 pixel resolution (with 32 feature channels) and there's nothing left to convolve.
#Instead, we'll just add a small fully connected layer just like above
#Again, we first need to flatten from a batch of images to a batch of 1D tensors
model.add(Flatten())
#Some regularization
model.add(Dropout(0.5))
#One fully connected
model.add(Dense(32, activation='relu'))
#More regularization
model.add(Dropout(0.5))
#Last fully connected layer, with softmax activation, which is what one needs for classification.
#Softmax means that each output neuron can be thought of as the probability of a class.
#We use 10 neurons because MNIST has 10 classes.
model.add(Dense(10, activation='softmax'))

#Compile the model. We use sparse_categorical_crossentropy loss instead of categorical_crossentropy,
#because the label data contains indices instead of one-hot vectors
model.compile(loss=keras.losses.sparse_categorical_crossentropy,
              optimizer=keras.optimizers.Adam(),
              metrics=['accuracy'])

#Train the network, using the test set as validation data
model.fit(x_train, y_train,
          batch_size=32,
          epochs=5,
          verbose=1,
          validation_data=(x_test, y_test))

Let's test the classifier with an image.

In [None]:
#Index of the test image we classify
testIdx=0
print("Testing with image:")
#Pick the image first, then its first and only color channel for display
testImage=x_test[testIdx]
pp.imshow(testImage[:,:,0])
pp.show()
#The network expects a batch of images instead of just one image,
#so we add a leading batch dimension of length 1
singleImageBatch=testImage[np.newaxis]
classProbabilities=model.predict(singleImageBatch)
print("Predicted class probabilities: ",classProbabilities)
#np.argmax gives the index of the largest value in a Numpy tensor, np.max the largest value itself
mostProbableClass=np.argmax(classProbabilities)
highestProbability=np.max(classProbabilities)
print("Most probable class is {}, with probability {}".format(mostProbableClass,highestProbability))


# Exercise 2: Test the classifier with synthetic images
The second exercise was to create an artificial image, to practice a bit more Numpy tensor manipulation. You could also consider using a Python library for drawing lines etc. into Numpy arrays, e.g., skimage

Here, we simply use np.zeros() to create images with all pixels zero, and then use tensor indexing to set some pixels to 1.

In [None]:
#Start from an all-zero image of the MNIST size, 28x28 pixels
image1=np.zeros((28,28))
#Draw a vertical bar by setting rows 5...21 of columns 14...15 to 1.
#Note: The usual convention expected by Numpy and Tensorflow is that in 2D tensors
#representing images, the first dimension denotes vertical position and second denotes horizontal
image1[5:22,14:16]=1
#Visualize
pp.imshow(image1)
#Test classification.
#Note: the single image must be passed as a batch. (Try predicting with image1 directly to see what error you get!)
#Here we add a leading batch dimension and a trailing channel dimension, both of length 1.
#This does not change the tensor contents, only the way they are indexed
imageBatch=image1[np.newaxis,:,:,np.newaxis]
classProbabilities=model.predict(imageBatch)
print("Predicted class probabilities: ",classProbabilities)
#np.argmax gives the index of the largest value in a Numpy tensor, np.max the largest value itself
mostProbableClass=np.argmax(classProbabilities)
highestProbability=np.max(classProbabilities)
print("Most probable class is {}, with probability {}".format(mostProbableClass,highestProbability))


**Try changing the position of the number vertically or horizontally and running the code again!** Does it affect the classification? 

Theory says that a fully connected network is more sensitive to the position, but a convolutional neural network should care less about it.

You can change what network you use by re-running one of the network building and training cells above.

Also, **try drawing different numbers or patterns.** Can you fool the network with something that doesn't look like a number? This is called an adversarial image, and neural networks are quite prone to them, if trained with too little data that does not contain all the possible types of images and variations the network will be tested with. See also the [Adversarial MNIST tutorial](AdversarialMNIST.ipynb) for an example of how to optimize images to fool the network.  

# Exercise 3: Visualize the images with lowest correct class probabilities
This is a good way to gain insights into both 1) the quality of your dataset, and 2) what the network learns.

There are a few ways to do this. First, we show how to extract the correct class probabilities from the probability distributions predicted by the network. 

An alternative is to query the loss function values from the network. The cross-entropy loss measures how much the distribution output by the network differs from the real class probability distribution. When the real classes are fully known, the real probability distributions are one-hot, i.e., the probability of the correct class is 1 and the other probabilities are zero. This means that finding the image with the largest loss is equivalent to finding the image with the lowest correct class probability (for the mathematically inclined, can you figure out why this is the case?). However, getting the loss values out from the network requires either [overriding some Keras callbacks](https://stackoverflow.com/questions/48118111/get-loss-values-for-each-training-instance-keras) or building the network a bit differently, as shown at the bottom of this notebook.

In [None]:
#We begin by compiling a 2D array that pairs each image index with the correct class of that image
#There is one label per test image, and np.arange gives us the 1D tensor of indices 0...nImages-1
nImages = len(y_test)
indices = np.arange(nImages)
#Put the two 1D tensors next to each other as columns:
#column 0 holds the image indices, column 1 the correct classes
classesAndIndices = np.column_stack((indices, y_test))
print("Classes and indices",classesAndIndices)

#Next, pass the whole test data through the network
#The result is a 2D tensor with one row of class probabilities per image
predictedProbabilities = model.predict(x_test)
print("Predicted probabilities: ",predictedProbabilities)

#Finally, index the probabilities with (image index, correct class) pairs.
#Transposing lets us unpack the two columns into separate 1D index tensors
imageRows, correctClasses = classesAndIndices.T
correctClassProbabilities = predictedProbabilities[imageRows, correctClasses]
print("Correct class probabilities: ",correctClassProbabilities)


We can now find the minimum and display the corresponding image.

In [None]:
#Index of the test image for which the network gave the lowest probability to the correct class
index=correctClassProbabilities.argmin()
lowestLabel,lowestProbability=y_test[index],correctClassProbabilities[index]
print("Correct class {}, probability {}".format(lowestLabel,lowestProbability))
#Show the image, using its first and only color channel
pp.imshow(x_test[index][:,:,0])

Now, if we want to show multiple images, we can use np.argsort() to pick the indices

In [None]:
#argsort orders the image indices by ascending correct class probability,
#so the first entries point to the images with the lowest probabilities
sortedIndices=np.argsort(correctClassProbabilities)
nImages=8
pp.figure(1,figsize=[nImages*3,3])
#Walk through the nImages first entries; subplot positions are counted from 1
for plotPosition,index in enumerate(sortedIndices[:nImages],start=1):
    pp.subplot(1,nImages,plotPosition)
    pp.imshow(x_test[index,:,:,0])
    #The title shows the correct class and the probability the network gave to it
    pp.title("{}, prob. {:1.5f}".format(y_test[index],correctClassProbabilities[index]))

Just out of curiosity, let's also show the images with high correct class probabilities

In [None]:
#argsort orders the image indices by ascending correct class probability,
#so the last entries point to the images with the highest probabilities
sortedIndices=np.argsort(correctClassProbabilities)
nImages=8
pp.figure(1,figsize=[nImages*3,3])
#Reverse the order so that the highest probabilities come first, then take the nImages first entries
highestFirst=sortedIndices[::-1][:nImages]
for plotPosition,index in enumerate(highestFirst,start=1):
    pp.subplot(1,nImages,plotPosition)
    pp.imshow(x_test[index,:,:,0])
    #The title shows the correct class and the probability the network gave to it
    pp.title("{}, prob. {:1.5f}".format(y_test[index],correctClassProbabilities[index]))