# Object Detection with Street View House Numbers

In [25]:
import pandas as pd
import numpy as np
from keras.utils import to_categorical
from sklearn.metrics import confusion_matrix
from keras.models import Sequential, Model
from keras.callbacks import ModelCheckpoint
from keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Input, Dropout, BatchNormalization, Activation

## Best Architecture

[Multi-digit Number Recognition from Street View Imagery using Deep Convolutional Neural Networks](https://arxiv.org/abs/1312.6082), Goodfellow, et al, 2014

- eight convolutional hidden layers, 
- one locally connected hidden layer
- two densely connected hidden layers. 
- the first hidden layer contains maxout units with three filters per unit
- the others contain rectifier units 
- the number of units is [48, 64, 128, 160] for the first four layers 
- 192 for all other locally connected layers
- the fully connected layers contain 3,072 units each. 
- Each convolutional layer includes max pooling and subtractive normalization
- The max pooling window size is 2 × 2. 
- The stride alternates between 2 and 1 at each layer, so that half of the layers don’t reduce the spatial size of the representation
- All convolutions use zero padding on the input to preserve representation size. 
- The subtractive normalization operates on 3x3 windows and preserves representation size. 
- All convolution kernels were of size 5 × 5. 
- We trained with dropout applied to all hidden layers but not the input.

In [29]:
def svhn_layer(model, filters, strides, n, input_shape=None):
    if input_shape is not None:
        model.add(Conv2D(filters, kernel_size=5, padding='same', name='CONV{}'.format(n), input_shape=input_shape))
    else:
        model.add(Conv2D(filters, kernel_size=5, padding='same', activation='relu', name='CONV{}'.format(n)))
    model.add(BatchNormalization(name='NORM{}'.format(n)))
    model.add(MaxPooling2D(pool_size=2, strides=strides, name='POOL{}'.format(n)))
    model.add(Dropout(0.2, name='DROP{}'.format(n)))
    return model

In [30]:
model = Sequential()

svhn_layer(model, 48, 1, n=1, input_shape=(32,32,1))

for i, kernel in enumerate([48, 64, 128, 160] + 3 * [192], 2):
    svhn_layer(model, kernel, strides=2 if i % 2 == 0 else 1, n=i)

model.add(Flatten())
model.add(Dense(3072, name='FC1'))
model.add(Dense(3072, name='FC2'))
y = model.output

n_digits = (Dense(units=6, activation='softmax'))(y)
digit1 = (Dense(units=10, activation='softmax'))(y)
digit2 = (Dense(units=11, activation='softmax'))(y)
digit3 = (Dense(units=11, activation='softmax'))(y)
digit4 = (Dense(units=11, activation='softmax'))(y)
digit5 = (Dense(units=11, activation='softmax'))(y)

svhn_model = Model(inputs=model.input, outputs=[n_digits, digit1, digit2, digit3, digit4, digit5])

In [31]:
svhn_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
CONV1_input (InputLayer)        (None, 32, 32, 1)    0                                            
__________________________________________________________________________________________________
CONV1 (Conv2D)                  (None, 32, 32, 48)   1248        CONV1_input[0][0]                
__________________________________________________________________________________________________
NORM1 (BatchNormalization)      (None, 32, 32, 48)   192         CONV1[0][0]                      
__________________________________________________________________________________________________
POOL1 (MaxPooling2D)            (None, 31, 31, 48)   0           NORM1[0][0]                      
__________________________________________________________________________________________________
DROP1 (Dro

### Get Data 

In [32]:
svhn_model.compile(optimizer='adam',
                   loss='categorical_crossentropy',
                   metrics=["accuracy"])

In [33]:
with pd.HDFStore('images/svhn/data.h5') as store:
    X_train = store['train/data'].values.reshape(-1, 32, 32, 1)
    y_train = store['train/labels']
    X_test = store['test/data'].values.reshape(-1, 32, 32, 1)
    y_test = store['test/labels']   

In [34]:
train_digits = [to_categorical(d) for d in y_train.values.T]
test_digits = [to_categorical(d) for d in y_test.values.T]

In [35]:
svhn_path = 'models/svhn.cnn.weights.best.hdf5'

In [36]:
checkpointer = ModelCheckpoint(filepath=svhn_path, 
                               verbose=1, 
                               save_best_only=True)

In [None]:
nb_epoch = 25
svhn_model.fit(x=X_train,
               y=train_digits,
               batch_size=32,
               nb_epoch=nb_epoch,
               verbose=1,
               validation_data=(X_test, test_digits))

  import sys


Train on 33401 samples, validate on 13068 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
 4096/33401 [==>...........................] - ETA: 8:03 - loss: 7.6028 - dense_7_loss: 0.9176 - dense_8_loss: 2.5801 - dense_9_loss: 2.3857 - dense_10_loss: 1.1713 - dense_11_loss: 0.5436 - dense_12_loss: 0.0044 - dense_7_acc: 0.7244 - dense_8_acc: 0.2800 - dense_9_acc: 0.2139 - dense_10_acc: 0.6858 - dense_11_acc: 0.9497 - dense_12_acc: 0.9995

In [10]:
n_digits, digit1, digit2, digit3, digit4, digit5 = svhn_model.predict(X_test, verbose=1)



In [15]:
(y_test[0] == np.argmax(n_digits, axis=1)).sum()/len(n_digits)

0.7210743801652892

In [21]:
confusion_matrix(y_true=y_test[0], y_pred=np.argmax(n_digits, axis=1))

array([[1082, 1394,    7,    0,    0],
       [ 340, 7755,  261,    0,    0],
       [  19, 1476,  586,    0,    0],
       [   0,   78,   68,    0,    0],
       [   0,    2,    0,    0,    0]])

In [22]:
confusion_matrix(y_true=y_test[1], y_pred=np.argmax(digit1, axis=1))

array([[   0,   18,    1,    0,    0,    0,    0,    0,    0,    0],
       [   0, 3565,   21,    4,    0,    0,    0,  104,    0,    0],
       [   0, 2593,   10,    2,    0,    0,    0,   52,    0,    0],
       [   0, 1581,    6,    0,    0,    0,    0,   38,    0,    0],
       [   0, 1208,    5,    0,    0,    0,    0,   23,    0,    0],
       [   0, 1030,    5,    0,    0,    0,    0,   17,    0,    0],
       [   0,  841,    5,    1,    0,    0,    0,   10,    0,    0],
       [   0,  706,    0,    0,    0,    0,    0,   41,    0,    0],
       [   0,  615,    2,    0,    0,    0,    0,    8,    0,    0],
       [   0,  547,    0,    0,    0,    0,    0,    9,    0,    0]])

In [23]:
pd.Series(np.argmax(digit1, axis=1)).value_counts()

1    12704
7      302
2       55
3        7
dtype: int64

In [18]:
y_test[0].value_counts(normalize=True)

2    0.639425
1    0.190006
3    0.159244
4    0.011172
5    0.000153
Name: 0, dtype: float64