**Author:** Raoul Malm  

**Abstract:** CNN classifier for handwritten digits of the MNIST dataset. The dataset consists of 42000 images of size 28x28 = 784 pixels (one grayscale value per pixel) together with the corresponding labels 0,...,9. The basic architecture of the NN is given by

- Layer: input = [.,28,28,1] (28x28 = 784 pixels, one channel)
- Layer: Conv1 -> ReLu -> MaxPool: [.,14,14,36] 
- Layer: Conv2 -> ReLu -> MaxPool: [.,7,7,36]
- Layer: Conv3 -> ReLu -> MaxPool: [.,4,4,36]
- Layer: FC -> ReLu: [.,576]
- Layer: Dropout -> FC: [.,10] (output logits, no ReLu)

but can be easily modified in the code below.  

**Outline:**
1. Libraries and settings
2. Analyze and manipulate data
3. Build CNN graph in tensorflow
4. Train and validate CNN graph
5. Predict and submit test results

**Results:** 
- Using a 95%/5% split of the labeled data, this implementation, trained on 40.000 original and 240.000 augmented training images for 3 epochs with suitable hyperparameters, achieves 99.60% accuracy on the validation set of 2000 images. On the Kaggle hardware this takes roughly 30 minutes.
- Using all labeled data, this implementation, trained on 42.000 original and 252.000 augmented training images for 3 epochs, achieves roughly 99.43% accuracy on the public test set. On the Kaggle hardware this takes roughly 30 minutes.

**Reference:** [TensorFlow deep NN by Kirill Kliavin](https://www.kaggle.com/kakauandme/tensorflow-deep-nn?scriptVersionId=164725)


## 1. Libraries and settings
- import libraries
- set validation set size
- set CNN architecture: number and size of filters, number of neurons

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm # cm = colormap
import tensorflow as tf
%matplotlib inline
import os;
import itertools
from datetime import datetime
import cv2 

# Directory in which trained models would be saved.
dir_logs = os.getcwd() + '/logs'

# Number of labeled images held out for validation (default = 2000).
val_set_size = 0

# Filter size and number of features of the three convolution layers
# (defaults: filter size 3, 36 features each).
s_f_conv1, n_f_conv1 = 3, 36
s_f_conv2, n_f_conv2 = 3, 36
s_f_conv3, n_f_conv3 = 3, 36

# Number of neurons of the first fully connected layer (default = 576).
n_n_fc1 = 576

#tf.set_random_seed(10) # for reproducible results

# Display the contents of the parent directory and of the working directory.
for shown_dir in (os.path.dirname(os.getcwd()), os.getcwd()):
    print(shown_dir+':', os.listdir(shown_dir))

## 2. Analyze and manipulate data
- load and check data
- normalize data and split into training and validation sets
- augment training data

In [None]:
## load and check data

# Look for train.csv at the Kaggle location first, then at the local one.
for train_csv_path in ('../input/train.csv', 'data/train.csv'):
    if os.path.isfile(train_csv_path):
        data = pd.read_csv(train_csv_path)
        print('train.csv loaded: data({0[0]},{0[1]})'.format(data.shape))
        break
else:
    # Stop here with a clear message; continuing would only fail later with a
    # NameError because `data` was never defined.
    raise FileNotFoundError('Error: train.csv not found')

# basic info about data
print('')
print(data.info())

# summary of the per-column "has missing values" flags
print('')
print(data.isnull().any().describe())

# distinct label values
print('')
print('distinct labels ', data['label'].unique())

# number of samples per label (class balance)
print('')
print(data['label'].value_counts())

In [None]:
## normalize data and split into training and validation sets

# extract and normalize images
# column 0 of `data` holds the label, the remaining columns hold the pixels
images = data.iloc[:,1:].values.reshape(-1,28,28,1) # (n_images,28,28,1) array
# np.float64 is what the former np.float alias resolved to; the alias itself
# has been removed from NumPy and raises an AttributeError there
images = images.astype(np.float64) # convert integer pixel values to float64
images = np.multiply(images, 1.0 / 255.0) # convert from [0:255] to [0.0:1.0]
image_size = 784
image_width = image_height = 28
#image_size = images.shape[1] # = 784
#image_width = image_height = np.ceil(np.sqrt(image_size)).astype(np.uint8) # = 28

# extract image labels
labels_flat = data.iloc[:,0].values # 1-d array with one label per image
labels_count = np.unique(labels_flat).shape[0] # number of different labels

# plot the first ten images together with their labels
plt.figure(figsize=(15,2))
for i in range(0,10):
    plt.subplot(2,10,1+i)
    plt.title(labels_flat[i])
    plt.imshow(images[i].reshape(28,28),cmap=cm.binary)
    
# convert class labels from scalars to one-hot vectors e.g. 1 => [0 1 0 0 0 0 0 0 0 0]
def dense_to_one_hot(labels_dense, num_classes):
    """Return the one-hot encoding of integer class labels.

    Args:
        labels_dense: array of integer class indices; its first dimension
            is the number of labels.
        num_classes: number of columns of the one-hot matrix.

    Returns:
        Float array of shape (num_labels, num_classes) holding a 1 at the
        position selected by each label and 0 elsewhere.
    """
    num_labels = labels_dense.shape[0]
    one_hot = np.zeros((num_labels, num_classes))
    # Flat position of the first entry of every row; adding the label gives
    # the flat position of the entry to switch on.
    row_starts = num_classes * np.arange(num_labels)
    one_hot.flat[row_starts + labels_dense.ravel()] = 1
    return one_hot

# one-hot encoded labels, stored as uint8
labels = dense_to_one_hot(labels_flat, labels_count).astype(np.uint8)
#labels = labels.astype(np.uint8)

# Random split: the first val_set_size shuffled indices form the validation
# set, all remaining indices form the training set.
perm_array = np.arange(images.shape[0])
np.random.shuffle(perm_array)
val_indices, train_indices = perm_array[:val_set_size], perm_array[val_set_size:]
train_images, train_labels = images[train_indices], labels[train_indices]
val_images, val_labels = images[val_indices], labels[val_indices]

# report the resulting array shapes
print('images{0},'.format(images.shape))
print('labels_flat{0}'.format(labels_flat.shape))
print('train_images{0}'.format(train_images.shape))
print('train_labels{0}'.format(train_labels.shape))
print('val_images{0}'.format(val_images.shape))
print('val_labels{0}'.format(val_labels.shape))
print ('image_size = {0}, image_width = {1}, image_height = {2}, labels_count = {3}'.format(
    image_size,image_width,image_height,labels_count))


In [None]:
## augment training data

# function to generate new images by rotation, translation, zooming 
def generate_image(img, op = -1):
    """Return a distorted version of a 28x28 image.

    Args:
        img: array holding 28*28 pixel values, e.g. of shape (28,28) or
            (28,28,1).
        op: distortion to apply. -1 (default) picks one of 0..5 at random.
            0: rotation by +10 or -10 degrees around (14,14)
            1: horizontal shift by +2 or -2 pixels
            2: vertical shift by +2 or -2 pixels
            3: diagonal shift, same sign in both directions
            4: diagonal shift, opposite signs in the two directions
            5: zoom in (resize to 32x32, crop the central 28x28) or
               zoom out (resize to 24x24, zero-pad to 28x28)
            Any other value leaves the image unchanged.

    Returns:
        float64 array of shape (28,28,1).
    """
    img_new = img
    # randomize operation
    if op == -1:
        op = np.random.randint(6)

    if op == 0:
        # rotation
        angle = 10 - 20*np.random.randint(2) # +- 10 degrees
        M_rotate = cv2.getRotationMatrix2D((14,14),angle,1)
        img_new = cv2.warpAffine(img,M_rotate,(28,28))
    elif op == 1:
        # translation horizontal
        shift_horizontal = 2 - 4*np.random.randint(2) # +- 2 pixels
        M_translate = np.float32([[1,0,shift_horizontal],[0,1,0]])
        img_new = cv2.warpAffine(img,M_translate,(28,28))
    elif op == 2:
        # translation vertical
        shift_vertical = 2 - 4*np.random.randint(2) # +- 2 pixels
        M_translate = np.float32([[1,0,0],[0,1,shift_vertical]])
        img_new = cv2.warpAffine(img,M_translate,(28,28))
    elif op == 3:
        # translation diagonal positive
        shift_diagonal = 2 - 4*np.random.randint(2) # +- 2 pixels
        M_translate = np.float32([[1,0,shift_diagonal],[0,1,shift_diagonal]])
        img_new = cv2.warpAffine(img,M_translate,(28,28))
    elif op == 4:
        # translation diagonal negative
        shift_diagonal = 2 - 4*np.random.randint(2) # +- 2 pixels
        M_translate = np.float32([[1,0,shift_diagonal],[0,1,-shift_diagonal]])
        img_new = cv2.warpAffine(img,M_translate,(28,28))
    elif op == 5:
        # zoom
        if np.random.randint(2) == 0:
            # zoom in: enlarge by 2 pixels on every side, then crop back to 28x28
            img_new = cv2.resize(img, (32,32))
            img_new = img_new[2:-2,2:-2]
        else:
            # zoom out: shrink by 2 pixels on every side, then zero-pad back to 28x28
            img_new = cv2.resize(img, (24,24))
            img_new = cv2.copyMakeBorder(img_new,2,2,2,2,cv2.BORDER_CONSTANT,value=0.0)

    # np.float64 is what the removed np.float alias used to resolve to
    return img_new.reshape(28,28,1).astype(np.float64)

def augment_data(x_data, y_data):
    """Extend a data set by six distorted copies of every image.

    For every image in x_data one copy per operation 0..5 of generate_image
    is created (rotation, horizontal / vertical / two diagonal translations,
    zoom); each copy gets the label of its source image.

    Args:
        x_data: array of images, indexed along the first axis.
        y_data: array of the corresponding labels.

    Returns:
        Tuple (x_data_augmented, y_data_augmented): the original samples
        followed by the generated ones, six consecutive entries per original
        sample.
    """
    num_samples = len(x_data)
    if num_samples == 0:
        # Nothing to distort; concatenating with an empty 1-d array below
        # would fail because of the differing number of dimensions.
        return np.array(x_data), np.array(y_data)

    x_data_generated = [] # list for generated images
    y_data_generated = [] # list for labels of generated images

    # Iterate over the data that was passed in, not over the global
    # train_images array.
    for i in range(num_samples):
        for op in range(6):
            x_data_generated.append(generate_image(x_data[i], op))
            y_data_generated.append(y_data[i])

    x_data_augmented = np.concatenate((x_data, np.array(x_data_generated)),axis=0)
    y_data_augmented = np.concatenate((y_data, np.array(y_data_generated)),axis=0)

    return x_data_augmented, y_data_augmented

# Replace the training set by its augmented version.
train_images, train_labels = augment_data(x_data=train_images, y_data=train_labels)

print('Data augmentation:')
for caption, augmented_array in (('train_images.shape', train_images),
                                 ('train_labels.shape', train_labels)):
    print(caption, augmented_array.shape)


In [None]:
## check augmented data
# The augmented training set consists of the original images followed by six
# generated images per original, so one seventh of it are originals. Deriving
# the offset from the array size keeps this check valid for every
# val_set_size (a fixed offset of 40000 only fits val_set_size = 2000).
n_original = train_images.shape[0] // 7
sample_no = 11
first_generated = n_original + 6*sample_no # first generated copy of that image

plt.figure(figsize=(15,2))
plt.subplot(2,10,1)
plt.title(train_labels[sample_no])
plt.imshow(train_images[sample_no].reshape(28,28),cmap=cm.binary)
plt.subplot(2,10,2)
plt.title(train_labels[first_generated])
plt.imshow(train_images[first_generated].reshape(28,28),cmap=cm.binary)


In [None]:
## check data augmentation
# Show one image next to nine randomly distorted versions of it.
img_no = 6
img = images[img_no].reshape(28,28)
plt.figure(figsize=(15,2))
for i in range(0,10):
    plt.subplot(2,10,1+i)
    # every panel shows (a variant of) image img_no, so all titles use its label
    plt.title(labels_flat[img_no])
    if i == 0:
        plt.imshow(img,cmap=cm.binary)
    else:
        plt.imshow(generate_image(img).reshape(28,28),cmap=cm.binary)

## 3. Build TensorFlow graph
- create the neural net architecture with convolutional and fully connected layers

In [None]:
## Build the TensorFlow Graph

#tf.set_random_seed(1)
#np.random.seed(1)

# weight and bias initialization
def weight_variable(shape):
    """Create a weight variable of the given shape.

    The initial values are drawn from a truncated normal distribution with
    standard deviation 0.1.
    """
    return tf.Variable(tf.truncated_normal(shape, stddev=0.1))

def bias_variable(shape):
    """Create a bias variable of the given shape, initialized to 0.1."""
    positive_bias = tf.constant(0.1, shape=shape)
    return tf.Variable(positive_bias)

# 2D convolution
def conv2d(x, W):
    """Convolve x with the filters W using stride 1 and 'SAME' padding."""
    unit_strides = [1, 1, 1, 1]
    return tf.nn.conv2d(x, W, strides=unit_strides, padding='SAME')

# max pooling
def max_pool_2x2(x):
    """Apply max pooling with a 2x2 window, stride 2 and 'SAME' padding."""
    window = [1, 2, 2, 1]
    return tf.nn.max_pool(x, ksize=window, strides=[1, 2, 2, 1], padding='SAME')

# variables for input and output
# x: batch of images, y_: batch of one-hot labels (batch dimension left open)
x = tf.placeholder('float', shape=[None, image_height, image_width, 1])
y_ = tf.placeholder('float', shape=[None, labels_count])

# NOTE: the shapes in the comments below hold for the default settings
# (28x28 input, 3x3 filters, 36 features per convolution layer, 576 neurons)

# 1. layer: convolution + max pooling
#image = tf.reshape(x, [-1,28,28,1]) # (.,784) => (.,28,28,1)
W_conv1 = weight_variable([s_f_conv1, s_f_conv1, 1, n_f_conv1]) # (3,3,1,36)
b_conv1 = bias_variable([n_f_conv1]) # (36)
h_conv1 = tf.nn.relu(conv2d(x, W_conv1) + b_conv1) # => (.,28,28,36)
h_pool1 = max_pool_2x2(h_conv1) # => (.,14,14,36)

# 2. layer: convolution + max pooling
W_conv2 = weight_variable([s_f_conv2, s_f_conv2, n_f_conv1, n_f_conv2])
b_conv2 = bias_variable([n_f_conv2])
h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2) # => (.,14,14,36)
h_pool2 = max_pool_2x2(h_conv2) # => (.,7,7,36)

# 3. layer: convolution + max pooling
W_conv3 = weight_variable([s_f_conv3, s_f_conv3, n_f_conv2, n_f_conv3])
b_conv3 = bias_variable([n_f_conv3])
h_conv3 = tf.nn.relu(conv2d(h_pool2, W_conv3) + b_conv3) # => (.,7,7,36)
h_pool3 = max_pool_2x2(h_conv3) # => (.,4,4,36)

# 4. layer: fully connected
# the factor 4*4 is the spatial size of h_pool3 for 28x28 input images
W_fc1 = weight_variable([4*4*n_f_conv3,n_n_fc1]) # (4*4*36, 576)
b_fc1 = bias_variable([n_n_fc1]) # (576)
h_pool3_flat = tf.reshape(h_pool3, [-1,4*4*n_f_conv3]) # (.,4,4,36) => (.,576)
h_fc1 = tf.nn.relu(tf.matmul(h_pool3_flat, W_fc1) + b_fc1) # => (.,576)

# dropout; the keep probability is fed at run time
tf_keep_prob = tf.placeholder('float')
h_fc1_drop = tf.nn.dropout(h_fc1, tf_keep_prob)

# 5. layer: fully connected output layer (logits, no activation)
W_fc2 = weight_variable([n_n_fc1, labels_count])
b_fc2 = bias_variable([labels_count])
y = tf.matmul(h_fc1_drop, W_fc2) + b_fc2 # => (.,10)

# cost function: mean softmax cross entropy between labels y_ and logits y
cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y))

# optimisation function; the learning rate is fed at run time
# NOTE(review): global_step is not passed to minimize() and is not referenced
# anywhere else in this file
global_step = tf.Variable(0, trainable=False)
tf_learn_rate = tf.placeholder(dtype='float', name="tf_learn_rate")
train_step = tf.train.AdamOptimizer(tf_learn_rate).minimize(cross_entropy)

# evaluation: fraction of samples whose predicted class equals the labeled class
correct_prediction = tf.equal(tf.argmax(tf.nn.softmax(y),1), tf.argmax(y_,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, 'float'))

# prediction function: index of the largest softmax output
predict = tf.argmax(tf.nn.softmax(y),1) # [0.1, 0.9, 0.2, 0.1, 0.1 0.3, 0.5, 0.1, 0.2, 0.3] => 1

# function: to get the next mini batch
def next_batch(batch_size):
    """Return the next mini batch of training images and labels.

    Reads the module-level train_images, train_labels, perm_array and
    train_set_size and advances the module-level index_in_epoch. When the
    requested batch would run past the end of the training set, perm_array
    is reshuffled in place and the batch is taken from its beginning.

    Args:
        batch_size: number of samples in the batch; must not exceed
            train_set_size.

    Returns:
        Tuple (images, labels) selected through perm_array.
    """
    global index_in_epoch
    assert batch_size <= train_set_size

    first = index_in_epoch
    last = first + batch_size
    if last > train_set_size:
        # not enough unused samples left: reshuffle and start the next epoch
        np.random.shuffle(perm_array)
        first, last = 0, batch_size
    index_in_epoch = last

    batch_indices = perm_array[first:last]
    return train_images[batch_indices], train_labels[batch_indices]

# Number of trainable parameters of the network defined above. The output
# layer contributes n_n_fc1*labels_count weights and labels_count biases.
print('# weights = ', s_f_conv1**2*n_f_conv1 + s_f_conv2**2*n_f_conv1*n_f_conv2 +
      s_f_conv3**2*n_f_conv2*n_f_conv3 + 4*4*n_f_conv3*n_n_fc1 + n_n_fc1*labels_count)
print('# biases = ', n_f_conv1 + n_f_conv2 + n_f_conv3 + n_n_fc1 + labels_count)

## 4. Training and Validation
- train the model
- visualize the results, the weights, the activations
- tune the hyperparameters

In [None]:
## set parameters

# start a TensorFlow session and initialize the global variables
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())

# state used by next_batch
train_set_size = train_images.shape[0]
perm_array = np.arange(train_set_size)
np.random.shuffle(perm_array)
index_in_epoch = 0

# logged training / validation metrics
train_acc = np.array([])
val_acc = np.array([])
train_loss = np.array([])
val_loss = np.array([])

log_step = 100 # log results every log_step mini batches
epoch_no = 3 # no of epochs

# test hyperparameters
mb_size_range = [50] # mini batch size
keep_prob_range = [0.33] # dropout regularization with keeping probability
# learning rates, used one after the other during training
learn_rate_range = [factor*1e-4 for factor in (
    10, 7.5, 5, 2.5, 1, 1, 1, 0.75,
    0.5, 0.25, 0.1, 0.1, 0.075, 0.050,
    0.025, 0.01, 0.0075, 0.0050, 0.0025, 0.001)]
# fraction of an epoch after which the next learning rate is selected
learn_rate_step = 0.3

In [None]:
## train the TensorFlow graph

start = datetime.now()

# NOTE(review): the variables and the logged metrics are not reset between
# hyperparameter combinations, so every further combination continues
# training the same network.
for mb_size,keep_prob in itertools.product(mb_size_range,keep_prob_range):

    mb_no = int(np.floor(epoch_no*train_set_size/mb_size)) # no of mini batches

    # Number of mini batches after which the next learning rate is selected.
    # It is stored under its own name so that learn_rate_step keeps its
    # meaning (fraction of an epoch); overwriting learn_rate_step made the
    # value compound for every further hyperparameter combination and on
    # every re-run of this cell. max(1, ...) avoids a modulo by zero.
    learn_rate_step_mb = max(1, int(np.floor(learn_rate_step*train_set_size/mb_size)))
    print('epoch_no = %.0f, mb_size = %.0f, keep_prob = %.2f'%(epoch_no,mb_size,keep_prob))
    learn_rate_pos = -1

    for i in range(0,mb_no+1):

        # walk through learn_rate_range, staying on its last entry once reached
        if (i%learn_rate_step_mb == 0) and ((learn_rate_pos+1) < len(learn_rate_range)):
            learn_rate_pos += 1
            learn_rate = learn_rate_range[learn_rate_pos]  # adapt learn_rate
            print('set current learn rate to: %.6f'%learn_rate)

        #learn_rate = 0.001*1e-4;

        batch_xs, batch_ys = next_batch(mb_size) # get new batch

        # step 0 performs no update, it only logs the initial state
        if i > 0:
            sess.run(train_step, feed_dict={x: batch_xs, y_: batch_ys,
                                            tf_keep_prob: keep_prob,
                                            tf_learn_rate: learn_rate})

        if i%log_step == 0 or i == mb_no:

            # training metrics are computed on the first val_set_size training
            # samples, or on the first 2000 if there is no validation set
            n_eval = val_set_size if val_set_size > 0 else 2000
            train_feed = {x: train_images[0:n_eval],
                          y_: train_labels[0:n_eval],
                          tf_keep_prob: 1.0}
            train_loss = np.append(train_loss, sess.run(cross_entropy, feed_dict=train_feed))
            train_acc = np.append(train_acc, accuracy.eval(feed_dict=train_feed))

            if val_set_size > 0:
                val_feed = {x: val_images, y_: val_labels, tf_keep_prob: 1.0}
                val_loss = np.append(val_loss, sess.run(cross_entropy, feed_dict=val_feed))
                val_acc = np.append(val_acc, accuracy.eval(feed_dict=val_feed))
            else:
                # no validation set: use placeholders for the log line below
                val_loss = [0]
                val_acc = [0]

            print('%.2f epoch: train/val loss = %.4f/%.4f , train/val acc = %.4f/%.4f'
                  %(i*mb_size/train_set_size,train_loss[-1],val_loss[-1],train_acc[-1],
                    val_acc[-1]))

    # save model
    #if not os.path.exists(dir_logs): # check if directory for logs exists
    #    os.makedirs(dir_logs)
    #np.savez(dir_logs+'/model.npz', 
    #        learn_rate = learn_rate, keep_prob = keep_prob, mb_size = mb_size, log_step = log_step,
    #        W_conv1 = np.asarray(W_conv1.eval()), b_conv1 = np.asarray(b_conv1.eval()), W_conv2 = np.asarray(W_conv2.eval()),
    #        b_conv2 = np.asarray(b_conv2.eval()), W_fc1 = np.asarray(W_fc1.eval()), b_fc1 = np.asarray(b_fc1.eval()),
    #        W_fc2 = np.asarray(W_fc2.eval()), b_fc2 = np.asarray(b_fc2.eval()),
    #        train_loss = train_loss, val_loss = val_loss, train_acc = train_acc,
    #        val_acc = val_acc, val_loss_final = val_loss_final, val_acc_final = val_acc_final);

    #close session
    #sess.close();

print('training time: ', datetime.now()-start)


In [None]:
# Disabled cell: the model-loading code below is wrapped in a string literal
# and therefore not executed.
# NOTE(review): the disabled code restores W_conv1/b_conv1, W_conv2/b_conv2,
# W_fc1/b_fc1 and W_fc2/b_fc2 only; W_conv3/b_conv3 are not handled.
'''
## load model

#print(dir_logs + ': ' + str(os.listdir(dir_logs)))
print('load '+ dir_logs + '/model.npz')
npzFile = np.load(dir_logs+'/model.npz');
#print(npzFile.files);
learn_rate = npzFile['learn_rate'];
keep_prob = npzFile['keep_prob'];
mb_size = npzFile['mb_size'];
log_step = npzFile['log_step'];
train_loss = npzFile['train_loss'];
val_loss = npzFile['val_loss'];
train_acc = npzFile['train_acc'];
val_acc = npzFile['val_acc'];
val_loss_final = npzFile['val_loss_final'];
val_acc_final = npzFile['val_acc_final'];

sess = tf.InteractiveSession() # start TensorFlow session
#sess.run(tf.global_variables_initializer()) # initialiue global variables
W_conv1.load(npzFile['W_conv1'], session=sess)
b_conv1.load(npzFile['b_conv1'], session=sess)
W_conv2.load(npzFile['W_conv2'], session=sess)
b_conv2.load(npzFile['b_conv2'], session=sess)
W_fc1.load(npzFile['W_fc1'], session=sess)
b_fc1.load(npzFile['b_fc1'], session=sess)
W_fc2.load(npzFile['W_fc2'], session=sess)
b_fc2.load(npzFile['b_fc2'], session=sess)
'''

In [None]:
## confusion matrix
# Only available when a validation set has been split off.
if val_set_size > 0:
    # predicted and labeled class index of every validation image
    y_predict = sess.run(tf.argmax(y,1), feed_dict={x: val_images,tf_keep_prob: 1.0})
    y_target = sess.run(tf.argmax(val_labels,1))
    print('confusion matrix:')
    confusion_op = tf.contrib.metrics.confusion_matrix(predictions = y_predict,
                                                       labels = y_target)
    print(sess.run(confusion_op))

In [None]:
## final loss, accuracy

# loss and accuracy of the trained network on the validation set (if any)
if val_set_size > 0:
    val_feed = {x: val_images, y_: val_labels, tf_keep_prob: 1.0}
    val_loss_final = sess.run(cross_entropy, feed_dict=val_feed)
    val_acc_final = accuracy.eval(feed_dict=val_feed)
    print('final: val_loss = %.4f, val_acc = %.4f'%(val_loss_final,val_acc_final))

# Learning curves: accuracy on the left, loss on the right. The logged values
# are log_step mini batches apart; the x axis is converted to epochs.
plt.figure(figsize=(10, 5))
curve_panels = (
    (1, 'accuracy', train_acc, val_acc, 1.1),
    (2, 'loss', train_loss, val_loss, 3.0),
)
for panel_no, y_label, train_curve, val_curve, y_top in curve_panels:
    plt.subplot(1,2,panel_no)
    plt.plot(np.arange(0,len(train_curve))*log_step*mb_size/train_set_size, train_curve,'-b',
             label='Training')
    plt.plot(np.arange(0,len(val_curve))*log_step*mb_size/train_set_size, val_curve,'-g',
             label='Validation')
    plt.legend(loc='lower right', frameon=False)
    plt.ylim(ymax = y_top, ymin = 0.0)
    plt.ylabel(y_label)
    plt.xlabel('epoch')

In [None]:
## visualize weights
# Each weight tensor is printed with its min/max/mean/std and then rearranged
# into one 2-d image in which the individual filters are tiled on a grid.
# NOTE: the reshapes below split feature axes into a 6x6 grid and therefore
# only work for 36 features per convolution layer (the default setting).

W_conv1_vis = W_conv1.eval();
print('W_conv1: min = ' + str(np.min(W_conv1_vis)) + ' max = ' + str(np.max(W_conv1_vis))
      + ' mean = ' + str(np.mean(W_conv1_vis)) + ' std = ' + str(np.std(W_conv1_vis)))
# split the 36 output features into a 6x6 grid and interleave grid and filter axes
W_conv1_vis = np.reshape(W_conv1_vis,(s_f_conv1,s_f_conv1,1,6,6))
W_conv1_vis = np.transpose(W_conv1_vis,(3,0,4,1,2))
W_conv1_vis = np.reshape(W_conv1_vis,(s_f_conv1*6,s_f_conv1*6,1))
# minor grid lines every s_f_conv1 pixels separate the individual filters
plt.gca().set_xticks(np.arange(-0.5, s_f_conv1*6, s_f_conv1), minor = True);
plt.gca().set_yticks(np.arange(-0.5, s_f_conv1*6, s_f_conv1), minor = True);
plt.grid(which = 'minor', color='b', linestyle='-', linewidth=1)
plt.title('W_conv1 ' + str(W_conv1.shape))
plt.colorbar(plt.imshow(W_conv1_vis[:,:,0], cmap=cm.binary));
plt.show();

W_conv2_vis = W_conv2.eval();
print('W_conv2: min = ' + str(np.min(W_conv2_vis)) + ' max = ' + str(np.max(W_conv2_vis))
      + ' mean = ' + str(np.mean(W_conv2_vis)) + ' std = ' + str(np.std(W_conv2_vis)))
# first tile the 36 input channels on a 6x6 grid, then tile the resulting
# images of the 36 output features on a second 6x6 grid
W_conv2_vis = np.reshape(W_conv2_vis,(s_f_conv2,s_f_conv2,6,6,36))
W_conv2_vis = np.transpose(W_conv2_vis,(2,0,3,1,4))
W_conv2_vis = np.reshape(W_conv2_vis,(6*s_f_conv2,6*s_f_conv2,6,6))
W_conv2_vis = np.transpose(W_conv2_vis,(2,0,3,1))
W_conv2_vis = np.reshape(W_conv2_vis,(6*6*s_f_conv2,6*6*s_f_conv2))
plt.figure(figsize=(15,10))
# minor grid lines every 6*s_f_conv2 pixels separate the output features
plt.gca().set_xticks(np.arange(-0.5, 6*6*s_f_conv2, 6*s_f_conv2), minor = True);
plt.gca().set_yticks(np.arange(-0.5, 6*6*s_f_conv2, 6*s_f_conv2), minor = True);
plt.grid(which = 'minor', color='b', linestyle='-', linewidth=1)
plt.title('W_conv2 ' + str(W_conv2.shape))
plt.colorbar(plt.imshow(W_conv2_vis[:,:], cmap=cm.binary));

W_conv3_vis = W_conv3.eval();
print('W_conv3: min = ' + str(np.min(W_conv3_vis)) + ' max = ' + str(np.max(W_conv3_vis))
      + ' mean = ' + str(np.mean(W_conv3_vis)) + ' std = ' + str(np.std(W_conv3_vis)))
# same two-level tiling as for W_conv2
W_conv3_vis = np.reshape(W_conv3_vis,(s_f_conv3,s_f_conv3,6,6,36))
W_conv3_vis = np.transpose(W_conv3_vis,(2,0,3,1,4))
W_conv3_vis = np.reshape(W_conv3_vis,(6*s_f_conv3,6*s_f_conv3,6,6))
W_conv3_vis = np.transpose(W_conv3_vis,(2,0,3,1))
W_conv3_vis = np.reshape(W_conv3_vis,(6*6*s_f_conv3,6*6*s_f_conv3))
plt.figure(figsize=(15,10))
plt.gca().set_xticks(np.arange(-0.5, 6*6*s_f_conv3, 6*s_f_conv3), minor = True);
plt.gca().set_yticks(np.arange(-0.5, 6*6*s_f_conv3, 6*s_f_conv3), minor = True);
plt.grid(which = 'minor', color='b', linestyle='-', linewidth=1)
plt.title('W_conv3 ' + str(W_conv3.shape))
plt.colorbar(plt.imshow(W_conv3_vis[:,:], cmap=cm.binary));

#b_conv1_vis = b_conv1.eval();
#print('b_conv1 = ',b_conv1_vis)
#b_conv2_vis = b_conv2.eval();
#print('b_conv2 = ',b_conv2_vis)

In [None]:
## visualize activations
# Feed one training image through the network and show the activations of
# every layer. The 36 feature maps of a convolution / pooling layer are tiled
# on a 6x6 grid, so the reshapes below require 36 features per layer; the
# 24x24 image of the first fully connected layer requires n_n_fc1 = 576.

IMG_NO = 10
feed_dict = {x: train_images[IMG_NO:IMG_NO+1], tf_keep_prob: 1.0}

# original image
plt.figure(figsize=(15,10))
plt.subplot(3,3,1)
plt.title('prediction: %d'%predict.eval(feed_dict = feed_dict))
plt.imshow(train_images[IMG_NO].reshape(image_height,image_width),cmap=cm.binary)

# 1. convolution
h_conv1_vis = h_conv1.eval(feed_dict = feed_dict)
plt.subplot(3,3,2)
plt.title('h_conv1 ' + str(h_conv1_vis.shape))
h_conv1_vis = np.reshape(h_conv1_vis,(-1,28,28,6,6))
h_conv1_vis = np.transpose(h_conv1_vis,(0,3,1,4,2))
h_conv1_vis = np.reshape(h_conv1_vis,(-1,6*28,6*28))
plt.imshow(h_conv1_vis[0], cmap=cm.binary)

# 1. max pooling
h_pool1_vis = h_pool1.eval(feed_dict = feed_dict)
plt.subplot(3,3,3)
plt.title('h_pool1 ' + str(h_pool1_vis.shape))
h_pool1_vis = np.reshape(h_pool1_vis,(-1,14,14,6,6))
h_pool1_vis = np.transpose(h_pool1_vis,(0,3,1,4,2))
h_pool1_vis = np.reshape(h_pool1_vis,(-1,6*14,6*14))
plt.imshow(h_pool1_vis[0], cmap=cm.binary)

# 2. convolution
h_conv2_vis = h_conv2.eval(feed_dict = feed_dict)
plt.subplot(3,3,4)
plt.title('h_conv2 ' + str(h_conv2_vis.shape))
h_conv2_vis = np.reshape(h_conv2_vis,(-1,14,14,6,6))
h_conv2_vis = np.transpose(h_conv2_vis,(0,3,1,4,2))
h_conv2_vis = np.reshape(h_conv2_vis,(-1,6*14,6*14))
plt.imshow(h_conv2_vis[0], cmap=cm.binary)

# 2. max pooling
h_pool2_vis = h_pool2.eval(feed_dict = feed_dict)
plt.subplot(3,3,5)
plt.title('h_pool2 ' + str(h_pool2_vis.shape))
h_pool2_vis = np.reshape(h_pool2_vis,(-1,7,7,6,6))
h_pool2_vis = np.transpose(h_pool2_vis,(0,3,1,4,2))
h_pool2_vis = np.reshape(h_pool2_vis,(-1,6*7,6*7))
plt.imshow(h_pool2_vis[0], cmap=cm.binary)

# 3. convolution
h_conv3_vis = h_conv3.eval(feed_dict = feed_dict)
plt.subplot(3,3,6)
plt.title('h_conv3 ' + str(h_conv3_vis.shape))
h_conv3_vis = np.reshape(h_conv3_vis,(-1,7,7,6,6))
h_conv3_vis = np.transpose(h_conv3_vis,(0,3,1,4,2))
h_conv3_vis = np.reshape(h_conv3_vis,(-1,6*7,6*7))
plt.imshow(h_conv3_vis[0], cmap=cm.binary)

# 3. max pooling (this panel shows h_pool3 and is titled accordingly)
h_pool3_vis = h_pool3.eval(feed_dict = feed_dict)
plt.subplot(3,3,7)
plt.title('h_pool3 ' + str(h_pool3_vis.shape))
h_pool3_vis = np.reshape(h_pool3_vis,(-1,4,4,6,6))
h_pool3_vis = np.transpose(h_pool3_vis,(0,3,1,4,2))
h_pool3_vis = np.reshape(h_pool3_vis,(-1,6*4,6*4))
plt.imshow(h_pool3_vis[0], cmap=cm.binary)

# first fully connected layer, shown as a 24x24 image
h_fc1_vis = h_fc1.eval(feed_dict = feed_dict)
plt.subplot(3,3,8)
plt.title('h_fc1 ' + str(h_fc1_vis.shape))
h_fc1_vis = np.reshape(h_fc1_vis,(-1,24,24))
plt.imshow(h_fc1_vis[0], cmap=cm.binary)
plt.show()

# output layer: print the logits y
h_fc2_vis = y.eval(feed_dict = feed_dict)
np.set_printoptions(precision=2)
print('h_fc2 = ', h_fc2_vis)

In [None]:
## show misclassified images
# Only available when a validation set has been split off.
if val_set_size > 0:
    # predicted and labeled class index of every validation image
    y_val_predict = sess.run(tf.argmax(y,1), feed_dict={x: val_images, tf_keep_prob: 1.0})
    y_val_target = sess.run(tf.argmax(val_labels,1))

    # positions at which prediction and label disagree
    y_val_false_index = [idx for idx in range(y_val_target.shape[0])
                         if y_val_predict[idx] != y_val_target[idx]]

    print('# false predictions: ', len(y_val_false_index))

    # show at most the first 50 misclassified images, titled 'label/prediction'
    plt.figure(figsize=(10,15))
    for panel_no, false_idx in enumerate(y_val_false_index[:50]):
        plt.subplot(10,10,panel_no+1)
        plt.title('%d/%d'%(y_val_target[false_idx],
                           y_val_predict[false_idx]))
        plt.imshow(val_images[false_idx].reshape(28,28),cmap=cm.binary)

## 5. Testing
- predict and submit the results for the test set

In [None]:
# read test data from CSV file
# Look for test.csv at the Kaggle location first, then at the local one.
for test_csv_path in ('../input/test.csv', 'data/test.csv'):
    if os.path.isfile(test_csv_path):
        test_data = pd.read_csv(test_csv_path)
        print('test.csv loaded: test_data{0}'.format(test_data.shape))
        break
else:
    # Stop here with a clear message; continuing would only fail later with a
    # NameError because `test_data` was never defined.
    raise FileNotFoundError('Error: test.csv not found')

# all columns of the test file are pixels
test_images = test_data.iloc[:,0:].values.reshape(-1,28,28,1) # (n_images,28,28,1) array
# np.float64 is what the removed np.float alias used to resolve to
test_images = test_images.astype(np.float64)
test_images = np.multiply(test_images, 1.0 / 255.0) # convert from [0:255] => [0.0:1.0]
print('read: test_images{0}'.format(test_images.shape))

# using batches is more resource efficient
predicted_labels = np.zeros(test_images.shape[0])
batch_size = 1000
# Step through the images in strides of batch_size; the last batch may be
# smaller, so images beyond a multiple of batch_size are predicted as well.
for batch_start in range(0, test_images.shape[0], batch_size):
    batch_end = batch_start + batch_size
    predicted_labels[batch_start:batch_end] = predict.eval(
        feed_dict={x: test_images[batch_start:batch_end], tf_keep_prob: 1.0})
print('compute predicted_labels({0})'.format(len(predicted_labels)))

# save predictions as two columns: 1-based image id and predicted label
np.savetxt('submission.csv',
           np.c_[range(1,len(test_images)+1),predicted_labels],
           delimiter=',',
           header = 'ImageId,Label',
           comments = '',
           fmt='%d')

print('saved: submission.csv')

In [None]:
# Show the first 50 test images, each titled with its predicted label.
plt.figure(figsize=(10,15))
for img_no in range(50):
    plt.subplot(10,10,img_no+1)
    plt.title('%d'%predicted_labels[img_no])
    plt.imshow(test_images[img_no].reshape(28,28),cmap=cm.binary)


In [None]:
# close the TensorFlow session that was used for training and prediction
sess.close()