# ResNet for CIFAR 10

In [1]:
import keras

Using TensorFlow backend.


In [2]:
from keras.layers import Dense, Conv2D, BatchNormalization, Activation
from keras.layers import AveragePooling2D, Input, Flatten
from keras.optimizers import Adam
from keras.models import Model
from keras.regularizers import l2

In [3]:
from keras.callbacks import ModelCheckpoint, LearningRateScheduler
from keras.callbacks import ReduceLROnPlateau

In [4]:
from keras.preprocessing.image import ImageDataGenerator
from keras import backend as K

In [5]:
from keras.datasets import cifar10

In [6]:
import numpy as np
import os

### Hyper-parameters

In [7]:
# Training configuration shared by the cells below.
num_classes = 10            # number of label classes used for one-hot encoding
batch_size = 32             # samples per gradient step
epochs = 50                 # full passes over the training set
data_augmentation = False   # True -> train from an ImageDataGenerator stream

### Load CIFAR-10 and prepare the data

In [8]:
(x_train, y_train), (x_test, y_test) = cifar10.load_data()

# Convert to float32 and rescale by 1/255 (the original intent: pixel values
# in the 0-1 range to speed up training).
x_train = x_train.astype('float32') / 255
x_test = x_test.astype('float32') / 255

# Centre both splits on a single scalar: the mean over every training pixel.
# The test split deliberately reuses the training mean.
x_train_mean = x_train.mean()
x_train -= x_train_mean
x_test -= x_train_mean

# Report the array shapes as a sanity check.
print('x_train shape: ', x_train.shape)
print('x_test shpae: ', x_test.shape)

# Integer class labels -> one-hot vectors of length num_classes.
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

x_train shape:  (50000, 32, 32, 3)
x_test shpae:  (10000, 32, 32, 3)


In [9]:
input_shape= x_train.shape[1:] # per-sample shape with the leading sample axis dropped; (32, 32, 3) given the shapes printed above

## Model

### learning rate

In [10]:
def lr_schedule(epoch):
    '''
    Step-wise learning rate as a function of the epoch index.

    Starts at 1e-3 and is scaled down once the epoch passes each of the
    thresholds 80, 120, 160 and 180. The chosen rate is printed and returned.
    '''
    base_lr = 1e-3  # 0.001

    # (epoch threshold, scale factor), checked from the latest threshold down
    # so the first match is the most aggressive reduction that applies.
    decay_steps = (
        (180, 0.5e-3),
        (160, 1e-3),
        (120, 1e-2),
        (80, 1e-1),
    )

    lr = base_lr
    for threshold, factor in decay_steps:
        if epoch > threshold:
            lr = base_lr * factor
            break

    print('Learning rate: ', lr)
    return lr

### ResNet layer

In [11]:
def resnet_layer(inputs,
                 num_filters=16,
                 kernel_size=3,
                 strides=(1,1),
                 activation='relu',
                 batch_normalization = True,
                 conv_first = True):
    '''
    Build one Conv2D / BatchNormalization / Activation unit on top of `inputs`.

    inputs: tensor fed into the unit
    num_filters: number of Conv2D filters
    kernel_size: Conv2D kernel size
    strides: Conv2D strides
    activation: activation name, or None to skip the Activation layer
    batch_normalization: whether to insert a BatchNormalization layer
    conv_first: True  -> conv, BN, activation (ResNet v1 ordering)
                False -> BN, activation, conv (ResNet v2 ordering)

    Returns the output tensor of the unit.
    '''
    # The convolution is always 'same'-padded, He-normal initialised
    # (see the Keras initializers documentation) and L2-regularised (1e-4).
    conv = Conv2D(filters=num_filters,
                  kernel_size=kernel_size,
                  strides=strides,
                  padding='same',
                  kernel_initializer='he_normal',
                  kernel_regularizer=l2(1e-4))

    def _norm_then_activate(tensor):
        # Optional BN followed by optional activation, in that order.
        if batch_normalization:
            tensor = BatchNormalization()(tensor)
        if activation is not None:
            tensor = Activation(activation)(tensor)
        return tensor

    if conv_first:
        return _norm_then_activate(conv(inputs))
    return conv(_norm_then_activate(inputs))

In [12]:
# ResNet v1 as described in the paper (e.g. ResNet-20 for depth=20)
def resnet_v1(input_shape,
              depth, 
              num_classes=10):
    '''
    Build a ResNet v1 classifier (Conv-BN-ReLU ordering).

    Stacks of 2 x (3 x 3) Conv2D-BN-ReLU.
    The last ReLU of each block is applied after the shortcut connection.

    At the beginning of each stage after the first, the feature map size is
    halved (downsampled) by a stride-2 conv layer, while the filter number
    is doubled.

    For a 32x32 input:

    Stage    Feature map size  Filter number
    --------|-----------------|-------------
    stage 0:        32x32            16
    stage 1:        16x16            32
    stage 2:         8x8             64

    input_shape: shape of one input sample, passed to keras Input
    depth: total depth, must satisfy depth = 6n + 2 (e.g. 20, 32, 44)
    num_classes: number of units in the final softmax layer

    Returns the (uncompiled) keras Model.
    Raises ValueError when depth is not of the form 6n + 2.
    '''
    # Validate the depth before any layer is created.
    if (depth - 2) % 6 != 0:
        raise ValueError('depth should be 6n+2 e.g. 20, 32, 44')
    
    # Start constructing the model
    num_filters = 16
    num_res_blocks = (depth - 2) // 6  # exact: divisibility was checked above
    inputs = Input(shape=input_shape)
    
    # first layer
    x = resnet_layer(inputs=inputs)
    
    # stack the residual units stage by stage
    for stage in range(3):
        for res_block in range(num_res_blocks):
            # The first block of every stage except stage 0 downsamples.
            downsample = stage > 0 and res_block == 0
            strides = 2 if downsample else 1
            
            # NOTE: the two conv units are deliberately not called `l1`/`l2`,
            # so the imported `l2` regularizer is not shadowed in this scope.
            # Only the first unit carries the stride and the activation.
            conv_a = resnet_layer(inputs=x,
                                  num_filters=num_filters,
                                  strides=strides)
            # The second unit has no activation: ReLU comes after the add.
            conv_b = resnet_layer(inputs=conv_a,
                                  num_filters=num_filters,
                                  activation=None)
            
            if downsample:
                # Align the shortcut with the residual branch using a strided
                # 1x1 conv: no BN and no activation on this path.
                x = resnet_layer(inputs=x,
                                 num_filters=num_filters,
                                 kernel_size=1,
                                 strides=strides,
                                 activation=None,
                                 batch_normalization=False)
            
            # shortcut connection, then the block's final ReLU
            x = keras.layers.add([conv_b, x])
            x = Activation('relu')(x)
        
        num_filters *= 2  # filter count doubles for each stage
    
    
    # V1 does not use BN after the last shortcut connection-ReLU
    x = AveragePooling2D(pool_size=8)(x)
    flat_x = Flatten()(x)
    outputs = Dense(num_classes,
                    activation='softmax', 
                    kernel_initializer='he_normal')(flat_x)
    
    # Model
    model = Model(inputs=inputs, outputs=outputs)
    return model
    
    

In [13]:
# resnet 18
def resnet_v1_18(input_shape,
                 num_classes=10):
    '''
    ResNet-18 style network built from Conv-BN-ReLU units.

    Four stages of two residual blocks each; the ReLU that closes a block is
    applied after the shortcut addition. Every stage except the first opens
    with a stride-2 block that halves the feature map and doubles the filters.
    Progress ('Down-sampling' and per-block shapes) is printed while building.

    For a 32x32 input:

    Stage    Feature map size  Filter number
    --------|-----------------|-------------
    stage 0:        32x32            64
    stage 1:        16x16           128
    stage 2:          8x8           256
    stage 3:          4x4           512

    Returns the (uncompiled) keras Model ending in a softmax over num_classes.
    '''
    base_filters = 64
    blocks_per_stage = 2
    stage_count = 4

    inputs = Input(shape=input_shape)

    # Stem: a single conv unit producing 64 channels.
    x = resnet_layer(inputs=inputs,
                     num_filters=base_filters)

    for stage in range(stage_count):
        # Filter count doubles with every stage: 64, 128, 256, 512.
        stage_filters = base_filters * 2 ** stage

        for res_block in range(blocks_per_stage):
            # Only the opening block of stages 1..3 changes resolution/depth.
            downsample = stage > 0 and res_block == 0
            strides = (2, 2) if downsample else (1, 1)
            if downsample:
                print('Down-sampling')

            # Residual branch: the first unit carries the stride, the second
            # omits its activation because ReLU follows the addition.
            branch = resnet_layer(inputs=x,
                                  num_filters=stage_filters,
                                  strides=strides)
            branch = resnet_layer(inputs=branch,
                                  num_filters=stage_filters,
                                  activation=None)

            if downsample:
                # Project the shortcut with a strided 1x1 conv (no BN, no
                # activation) so height, width and depth match the branch.
                x = resnet_layer(inputs=x,
                                 num_filters=stage_filters,
                                 kernel_size=1,
                                 strides=strides,
                                 activation=None,
                                 batch_normalization=False)
            print('Stage: {}, block: {}, l2 shape {}, x shape {}'.format(stage, res_block, branch.shape[1:], x.shape[1:]))

            # Merge shortcut and branch, then apply the block's ReLU.
            x = keras.layers.add([branch, x])
            x = Activation('relu')(x)

    # Pool, flatten and classify.
    x = AveragePooling2D(pool_size=4)(x)
    flat_x = Flatten()(x)
    output = Dense(units=num_classes,
                   activation='softmax',
                   kernel_initializer='he_normal')(flat_x)

    return Model(inputs=inputs, outputs=output)
    

# Pipeline

In [14]:
# Build the ResNet-18 style network for the per-sample shape taken from x_train.
# resnet_v1_18 prints the per-block shapes while it builds the model.
model = resnet_v1_18(input_shape=input_shape)

Stage: 0, block: 0, l2 shape (32, 32, 64), x shape (32, 32, 64)
Stage: 0, block: 1, l2 shape (32, 32, 64), x shape (32, 32, 64)
Down-sampling
Stage: 1, block: 0, l2 shape (16, 16, 128), x shape (16, 16, 128)
Stage: 1, block: 1, l2 shape (16, 16, 128), x shape (16, 16, 128)
Down-sampling
Stage: 2, block: 0, l2 shape (8, 8, 256), x shape (8, 8, 256)
Stage: 2, block: 1, l2 shape (8, 8, 256), x shape (8, 8, 256)
Down-sampling
Stage: 3, block: 0, l2 shape (4, 4, 512), x shape (4, 4, 512)
Stage: 3, block: 1, l2 shape (4, 4, 512), x shape (4, 4, 512)


## model summary

In [15]:
# Compile with Adam at the schedule's epoch-0 rate, then show the architecture.
# lr_schedule(0) also prints the rate it returns.
model.compile(
    optimizer=Adam(learning_rate=lr_schedule(0)),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

Learning rate:  0.001
Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 32, 32, 3)    0                                            
__________________________________________________________________________________________________
conv2d_1 (Conv2D)               (None, 32, 32, 64)   1792        input_1[0][0]                    
__________________________________________________________________________________________________
batch_normalization_1 (BatchNor (None, 32, 32, 64)   256         conv2d_1[0][0]                   
__________________________________________________________________________________________________
activation_1 (Activation)       (None, 32, 32, 64)   0           batch_normalization_1[0][0]      
______________________________________________________________________

In [16]:
# Directory (under the current working directory) and path for checkpoints.
save_dir = os.path.join(os.getcwd(), 'saved_models')
model_name='cifar10-model-resenet18'
# os.makedirs with exist_ok=True creates the directory if needed and is a
# no-op when it already exists (the previous `os.makedis` call was a typo
# that raised AttributeError on a fresh checkout).
os.makedirs(save_dir, exist_ok=True)
filepath = os.path.join(save_dir, model_name)

In [17]:
# Checkpoint: keep only the model with the best validation accuracy.
# The model is compiled with metrics=['accuracy'] and the training log reports
# the metric as `accuracy`, so the validation key is 'val_accuracy'; with
# 'val_acc' the monitored value is never found and no checkpoint is written.
checkpoint = ModelCheckpoint(filepath=filepath,
                             monitor='val_accuracy',
                             verbose=1,
                             save_best_only=True)

# Epoch-based schedule plus a plateau-based reducer.
# NOTE(review): LearningRateScheduler sets the rate from lr_schedule at the
# start of every epoch, which presumably overrides any reduction made by
# ReduceLROnPlateau - confirm whether both are intended.
lr_scheduler = LearningRateScheduler(lr_schedule)
lr_reducer = ReduceLROnPlateau(factor=np.sqrt(0.1),
                               cooldown=0,
                               patience=5,
                               min_lr=0.5e-6)
callbacks = [checkpoint, lr_reducer, lr_scheduler]

In [18]:
# Train either from an augmented image stream or directly from the arrays,
# depending on the data_augmentation flag; both paths validate on the test split.
if data_augmentation:
    print('Using real-time data augmentation.')
    # Real-time preprocessing / augmentation pipeline.
    datagen = ImageDataGenerator(
        # --- normalisation (all disabled) ---
        featurewise_center=False,             # dataset-wide zero mean
        samplewise_center=False,              # per-sample zero mean
        featurewise_std_normalization=False,  # divide by dataset std
        samplewise_std_normalization=False,   # divide each input by its own std
        zca_whitening=False,                  # ZCA whitening
        zca_epsilon=1e-06,                    # epsilon for ZCA whitening
        rescale=None,                         # rescaling factor applied before other transforms
        preprocessing_function=None,          # custom per-input function
        # --- geometric augmentation ---
        rotation_range=0,                     # random rotation range in degrees
        width_shift_range=0.1,                # random horizontal shift
        height_shift_range=0.1,               # random vertical shift
        shear_range=0.,                       # random shear range
        zoom_range=0.,                        # random zoom range
        horizontal_flip=True,                 # random left-right flips
        vertical_flip=False,                  # random up-down flips
        # --- colour augmentation ---
        channel_shift_range=0.,               # random channel shift range
        # --- boundary handling ---
        fill_mode='nearest',                  # how points outside the input are filled
        cval=0.,                              # fill value when fill_mode = "constant"
        # --- misc ---
        data_format=None,                     # "channels_first" or "channels_last"
        validation_split=0.0)                 # fraction reserved for validation

    # fit() computes the statistics needed by featurewise normalisation
    # (std, mean, and principal components for ZCA whitening).
    datagen.fit(x_train)

    # Train on batches produced by datagen.flow().
    model.fit_generator(
        datagen.flow(x_train, y_train, batch_size=batch_size),
        epochs=epochs,
        validation_data=(x_test, y_test),
        callbacks=callbacks,
        workers=4,
        verbose=1)
else:
    print('Not using data augmentation.')
    model.fit(
        x_train, y_train,
        batch_size=batch_size,
        epochs=epochs,
        shuffle=True,
        validation_data=(x_test, y_test),
        callbacks=callbacks)

# Evaluate the model as it stands after the last epoch: scores[0] is the loss,
# scores[1] the accuracy metric requested at compile time.
scores = model.evaluate(x_test, y_test, verbose=1)
print('Test loss:', scores[0])
print('Test accuracy:', scores[1])

Not using data augmentation.
Train on 50000 samples, validate on 10000 samples
Epoch 1/50
Learning rate:  0.001
Epoch 2/50
Learning rate:  0.001
   96/50000 [..............................] - ETA: 1:37 - loss: 1.2652 - accuracy: 0.7500



Epoch 3/50
Learning rate:  0.001
Epoch 4/50
Learning rate:  0.001
Epoch 5/50
Learning rate:  0.001
Epoch 6/50
Learning rate:  0.001
Epoch 7/50
Learning rate:  0.001
Epoch 8/50
Learning rate:  0.001
Epoch 9/50
Learning rate:  0.001
Epoch 10/50
Learning rate:  0.001
Epoch 11/50
Learning rate:  0.001
Epoch 12/50
Learning rate:  0.001
Epoch 13/50
Learning rate:  0.001
Epoch 14/50
Learning rate:  0.001
Epoch 15/50
Learning rate:  0.001
Epoch 16/50
Learning rate:  0.001
Epoch 17/50
Learning rate:  0.001
Epoch 18/50
Learning rate:  0.001
Epoch 19/50
Learning rate:  0.001
Epoch 20/50
Learning rate:  0.001
Epoch 21/50
Learning rate:  0.001
Epoch 22/50
Learning rate:  0.001
Epoch 23/50
Learning rate:  0.001
Epoch 24/50
Learning rate:  0.001
Epoch 25/50
Learning rate:  0.001
Epoch 26/50
Learning rate:  0.001
Epoch 27/50
Learning rate:  0.001
Epoch 28/50
Learning rate:  0.001
Epoch 29/50
Learning rate:  0.001
Epoch 30/50
Learning rate:  0.001
Epoch 31/50
Learning rate:  0.001
Epoch 32/50
Learning 

Test loss: 1.1106641494750977
Test accuracy: 0.7954000234603882
