# 3. CNN

## Run name

In [1]:
import time

project_name = 'DigitRecognizer'
step_name = 'Preprocess'
date_str = time.strftime("%Y%m%d", time.localtime())
time_str = time.strftime("%Y%m%d_%H%M%S", time.localtime())
run_name = '%s_%s_%s' % (project_name, step_name, time_str)
print('run_name: %s' % run_name)
t0 = time.time()

run_name: DigitRecognizer_Preprocess_20190407_220457


## Important Params

In [2]:
from multiprocessing import cpu_count

batch_size = 8
random_state = 2019

print('cpu_count:\t', cpu_count())
print('batch_size:\t', batch_size)
print('random_state:\t', random_state)


cpu_count:	 4
batch_size:	 8
random_state:	 2019


## Import PKGs

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
%matplotlib inline
from IPython.display import display

import os
import gc
import math
import shutil
import zipfile
import pickle
import h5py
from PIL import Image

from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score

In [4]:
from keras.utils.np_utils import to_categorical # convert to one-hot-encoding
from keras.models import Sequential
from keras.layers import Dense, Dropout, Input, Flatten, Conv2D, MaxPooling2D, BatchNormalization
from keras.optimizers import Adam
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import LearningRateScheduler, TensorBoard

Using TensorFlow backend.


## Basic folders

In [5]:
cwd = os.getcwd()
input_folder = os.path.join(cwd, 'input')
log_folder = os.path.join(cwd, 'log')
model_folder = os.path.join(cwd, 'model')
output_folder = os.path.join(cwd, 'output')
print('input_folder: \t\t%s' % input_folder)
print('log_folder: \t\t%s' % log_folder)
print('model_folder: \t\t%s' % model_folder)
print('output_folder: \t\t%s'% output_folder)

train_csv_file = os.path.join(input_folder, 'train.csv')
test_csv_file = os.path.join(input_folder, 'test.csv')
print('\ntrain_csv_file: \t%s' % train_csv_file)
print('test_csv_file: \t\t%s' % test_csv_file)

processed_data_file = os.path.join(input_folder, '%s_%s.p' % (project_name, step_name))
print('processed_data_file: \t%s' % processed_data_file)

input_folder: 		D:\Kaggle\digit-recognizer\input
log_folder: 		D:\Kaggle\digit-recognizer\log
model_folder: 		D:\Kaggle\digit-recognizer\model
output_folder: 		D:\Kaggle\digit-recognizer\output

train_csv_file: 	D:\Kaggle\digit-recognizer\input\train.csv
test_csv_file: 		D:\Kaggle\digit-recognizer\input\test.csv
processed_data_file: 	D:\Kaggle\digit-recognizer\input\DigitRecognizer_Preprocess.p


## Basic functions

In [6]:
def show_data_images(rows, fig_column, y_data, *args):
    columns = len(args)
    figs, axes = plt.subplots(rows, columns, figsize=(rows, fig_column*columns))
    print(axes.shape)
    for i, ax in enumerate(axes):
        y_data_str = ''
        if type(y_data) != type(None):
            y_data_str =  '_' + str(y_data[i])
        ax[0].set_title('28x28' + y_data_str)
        for j, arg in enumerate(args):
            ax[j].imshow(arg[i])

## Preview data

In [7]:
%%time
raw_data = np.loadtxt(train_csv_file, skiprows=1, dtype='int', delimiter=',')
x_data = raw_data[:,1:]
y_data = raw_data[:,0]

x_test = np.loadtxt(test_csv_file, skiprows=1, dtype='int', delimiter=',')

print(x_data.shape)
print(y_data.shape)
print(x_test.shape)

(42000, 784)
(42000,)
(28000, 784)
Wall time: 34.3 s


In [8]:
x_data = x_data/255.
x_test = x_test/255.
y_data_cat = to_categorical(y_data)

describe(x_data)
describe(x_test)
describe(y_data)
describe(y_data_cat)

x_data = x_data.reshape(-1, 28, 28, 1)
x_test = x_test.reshape(-1, 28, 28, 1)

describe(x_data)
describe(x_test)

# print(x_data[0])
print(y_data[0: 10])

NameError: name 'x_data' is not defined

In [None]:
index = 0
fig, ax = plt.subplots(2, 2, figsize=(12, 6))
ax[0, 0].plot(x_data[index].reshape(784,))
ax[0, 0].set_title('784x1 data')
ax[0, 1].imshow(x_data[index].reshape(28, 28), cmap='gray')
ax[0, 1].set_title('28x28 data => ' + str(y_data[index]))

ax[1, 0].plot(x_test[index].reshape(784,))
ax[1, 0].set_title('784x1 data')
ax[1, 1].imshow(x_test[index].reshape(28, 28), cmap='gray')
ax[1, 1].set_title('28x28 data')

## Split train and val

In [None]:
x_train, x_val, y_train_cat, y_val_cat = train_test_split(x_data, y_data_cat, test_size=0.1, random_state=random_state)

print(x_train.shape)
print(y_train_cat.shape)
print(x_val.shape)
print(y_val_cat.shape)

## Build model

In [None]:
def build_model(input_shape):
    model = Sequential()
    # Block 1
    model.add(Conv2D(filters = 32, kernel_size = (3, 3), activation='relu', padding = 'Same', input_shape = input_shape))
    model.add(BatchNormalization())
    model.add(Conv2D(filters = 32, kernel_size = (3, 3), activation='relu', padding = 'Same'))
    model.add(BatchNormalization())
    model.add(MaxPooling2D(strides=(2,2)))
    model.add(Dropout(0.25))
    # Block 2
    model.add(Conv2D(filters = 64, kernel_size = (3, 3), activation='relu', padding = 'Same'))
    model.add(BatchNormalization())
    model.add(Conv2D(filters = 64, kernel_size = (3, 3), activation='relu', padding = 'Same'))
    model.add(BatchNormalization())
    model.add(MaxPooling2D(strides=(2,2)))
    model.add(Dropout(0.25))

    # Output
    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.25))
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.25))
    model.add(Dense(10, activation='softmax'))
    
    return model

In [None]:
model = build_model(x_train.shape[1:])
model.compile(loss='categorical_crossentropy', optimizer = Adam(lr=1e-4), metrics=["accuracy"])

In [None]:
train_datagen = ImageDataGenerator(
    zoom_range = 0.2,
    rotation_range = 20,
    height_shift_range = 0.2,
    width_shift_range = 0.2
)

val_datagen = ImageDataGenerator()

In [None]:
# annealer = LearningRateScheduler(lambda x: 1e-4 * 0.995 ** x)

def get_lr(x):
    if x <= 10:
        return 1e-4
    elif x <= 20:
        return 3e-5
    else:
        return 1e-5
[print(get_lr(x), end=' ') for x in range(1, 31)]

annealer = LearningRateScheduler(get_lr)
callbacks = [annealer]

In [None]:
%%time
steps_per_epoch = x_train.shape[0] / batch_size
print('steps_per_epoch:\t', steps_per_epoch)

hist = model.fit_generator(
    train_datagen.flow(x_train, y_train_cat, batch_size=batch_size, seed=random_state),
    steps_per_epoch=steps_per_epoch,
    epochs=2, #Increase this when not on Kaggle kernel
    verbose=1,  #1 for ETA, 0 for silent
    callbacks=callbacks,
    max_queue_size=batch_size*4,
    workers=cpu_count(),
    validation_steps=100,
    validation_data=val_datagen.flow(x_val, y_val_cat, batch_size=batch_size, seed=random_state)
)