## Keras cross validation with Sklearn

Reference tutorial: http://machinelearningmastery.com/multi-class-classification-tutorial-keras-deep-learning-library/

In [1]:
%matplotlib inline

import glob
import os

import keras.utils.np_utils as kutils
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from PIL import Image
from skimage.io import imread, imsave
from skimage.transform import resize
from sklearn.preprocessing import LabelEncoder

Using TensorFlow backend.


In [2]:
trainLabels = pd.read_csv('trainLabels.csv')
trainLabels.head(5)

Unnamed: 0,ID,Class
0,1,n
1,2,8
2,3,T
3,4,I
4,5,R


In [3]:
def plot_sample(x, axis, title):
    """Draw a single image on ``axis`` in grayscale, without axis decorations.

    Parameters
    ----------
    x : image data, passed unchanged to ``axis.imshow``.
    axis : the axes object to draw on.
    title : value passed to ``axis.set_title``.
    """
    axis.imshow(x, cmap='gray')
    # Switch the axis off so only the image and its title are shown.
    axis.axis('off')
    axis.set_title(title)

def draw_multi_plot(X, row_num=4, col_num=4, fig_size=(15,10)):
    """Show the first ``row_num * col_num`` images of ``X`` in a grid.

    Each panel is drawn with ``plot_sample`` and titled with the image's
    index in ``X``.  If ``X`` holds fewer images than there are panels, the
    remaining panels are left blank instead of raising ``IndexError``.

    Parameters
    ----------
    X : indexable collection of images supporting ``len``.
    row_num, col_num : grid dimensions.
    fig_size : figure size forwarded to ``plt.subplots``.
    """
    # squeeze=False keeps `axes` two-dimensional even for a single row or
    # column; with the default squeezing, a 1xN / Nx1 / 1x1 grid is returned
    # as a 1-D array (or a bare Axes) and cannot be iterated row by row.
    _fig, axes = plt.subplots(nrows=row_num, ncols=col_num, figsize=fig_size, squeeze=False)
    n_images = len(X)
    for i, each_ax in enumerate(axes.flat):
        if i < n_images:
            plot_sample(X[i], each_ax, i)
        else:
            # No image left for this panel: hide the empty axes.
            each_ax.axis('off')
    plt.tight_layout()

In [4]:
# Load every training image together with its class label, then encode the
# labels as integers and as one-hot rows.
# NOTE(review): pixel values are used exactly as returned by imread, with no
# rescaling before training - confirm that this is intended.
trainX = []
trainY_array = []
# Index the label table by ID once instead of filtering the whole frame for
# every file; verify_integrity makes a duplicated ID fail loudly.
class_by_id = trainLabels.set_index('ID', verify_integrity=True)['Class']
# glob returns paths in an OS-dependent order; sorting keeps the sample order
# (and therefore the seeded K-fold split) the same across machines.
trainFiles = sorted(glob.glob("myTrainResized/*"))
for nameFile in trainFiles:
    # The file name up to its first '.' is the numeric sample ID.
    # os.path.basename copes with both '/' and '\\' path separators.
    id_col = int(os.path.basename(nameFile).split('.')[0])
    trainY_array.append(class_by_id.loc[id_col])
    image = imread(nameFile)
    trainX.append(image)
trainX = np.array(trainX)
print("trainX.shape", trainX.shape)
trainY = np.array(trainY_array)
# Class labels -> integer codes -> one-hot matrix.
encoder = LabelEncoder()
encoded_Y = encoder.fit_transform(trainY)
dummy_Y = kutils.to_categorical(encoded_Y)
print("dummy_Y.shape", dummy_Y.shape)

trainX.shape (6283, 20, 20, 3)
dummy_Y.shape (6283, 62)


In [5]:
def make_config(img_row, img_col, filter_size=3, epoch=1, batch_size=128, layer_filter1=32, layer_filter2=64, layer_filter3=128):
    """Collect the image size and training/network settings into one dict.

    Parameters
    ----------
    img_row, img_col : image height and width, stored as given.
    filter_size : stored under ``"filter_size"``.
    epoch, batch_size : stored under ``"epoch"`` and ``"batch_size"``.
    layer_filter1, layer_filter2, layer_filter3 : stored under
        ``"filter1"``, ``"filter2"`` and ``"filter3"``.

    Returns
    -------
    dict with the keys ``epoch``, ``batch_size``, ``img_row``, ``img_col``,
    ``filter_size``, ``filter1``, ``filter2`` and ``filter3``.
    """
    return dict(
        epoch=epoch,
        batch_size=batch_size,
        img_row=img_row,
        img_col=img_col,
        filter_size=filter_size,
        filter1=layer_filter1,
        filter2=layer_filter2,
        filter3=layer_filter3,
    )
# Settings for 20x20 images; filter1/filter2 are raised above their defaults.
config = make_config(img_row=20, img_col=20, layer_filter1=64, layer_filter2=128)
config

{'batch_size': 128,
 'epoch': 1,
 'filter1': 64,
 'filter2': 128,
 'filter3': 128,
 'filter_size': 3,
 'img_col': 20,
 'img_row': 20}

In [6]:
# Number of one-hot columns in the label matrix, i.e. the number of classes.
class_size = dummy_Y.shape[-1]
class_size

62

In [7]:
import keras.models as kmodel
import keras.layers.convolutional as conv
import keras.layers.core as core
from keras.wrappers.scikit_learn import KerasClassifier

In [8]:
# Seed NumPy's global random generator so repeated runs draw the same numbers.
# The same value is reused below as the K-fold shuffle seed.
seed = 7
np.random.seed(seed)

In [9]:
def baseline_model():
    """Build and compile the CNN used as the cross-validation baseline.

    Reads the notebook-level ``config`` (keys ``filter1``, ``filter2``,
    ``filter_size``, ``img_row``, ``img_col``) and ``class_size``.
    ``config['filter3']`` is not used by this architecture.

    Layout: two stages of (conv, conv, max-pool), then Flatten,
    Dense(500, relu) and a softmax output with ``class_size`` units.

    Returns
    -------
    The compiled ``Sequential`` model (categorical cross-entropy loss,
    Adam optimizer, accuracy metric).
    """
    kernel = config['filter_size']

    def conv_relu(n_filters, **extra):
        # Every convolution shares the kernel size, activation and border mode.
        return conv.Convolution2D(n_filters, kernel, kernel,
                                  activation='relu', border_mode='same', **extra)

    cnn1 = kmodel.Sequential()

    # Only the very first layer declares the input shape (3-channel images).
    first_layer_kwargs = {'input_shape': (config['img_row'], config['img_col'], 3)}
    for stage_filters in (config['filter1'], config['filter2']):
        cnn1.add(conv_relu(stage_filters, **first_layer_kwargs))
        first_layer_kwargs = {}
        cnn1.add(conv_relu(stage_filters))
        # Max pooling: keep the largest value in each pooling window.
        cnn1.add(conv.MaxPooling2D(strides=(2,2)))

    # Unroll the feature maps into a single 1-D vector.
    cnn1.add(core.Flatten())
    # Fully connected layer with 500 neurons.
    cnn1.add(core.Dense(500, activation='relu'))
    # Fully connected softmax layer producing the per-class probabilities.
    cnn1.add(core.Dense(class_size, activation='softmax'))

    cnn1.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return cnn1

In [10]:
# Wrap the model factory so scikit-learn utilities can fit and score it.
# NOTE(review): the epoch count is fixed at 5 here; config['epoch'] is not
# consulted - confirm which of the two is intended.
clf = KerasClassifier(
    build_fn=baseline_model,
    nb_epoch=5,
    batch_size=config['batch_size'],
    verbose=1,
)
clf

<keras.wrappers.scikit_learn.KerasClassifier at 0x117104a90>

In [11]:
# With scikit-learn 0.18 or later, use instead: from sklearn.model_selection import cross_val_score
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import KFold

https://github.com/scikit-learn/scikit-learn/issues/6161

In [12]:
# 3-fold split over all training samples, shuffled with the fixed seed.
# This is the KFold signature from sklearn.cross_validation (n / n_folds).
kfold = KFold(n=len(trainX), n_folds=3, shuffle=True, random_state=seed)
kfold

sklearn.cross_validation.KFold(n=6283, n_folds=3, shuffle=True, random_state=7)

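* `KFold` documentation for scikit-learn v0.18 (`sklearn.model_selection`): http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html
* `KFold` documentation for scikit-learn v0.17 (`sklearn.cross_validation`): http://lijiancheng0614.github.io/scikit-learn/modules/generated/sklearn.cross_validation.KFold.html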

In [13]:
# Fit and score the classifier once per fold, then report the mean and
# standard deviation of the fold scores as percentages.
results = cross_val_score(clf, trainX, dummy_Y, cv=kfold)
mean_pct, std_pct = results.mean()*100, results.std()*100
print("Baseline: %.2f%% (%.2f%%)" % (mean_pct, std_pct))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Baseline: 31.63% (12.37%)
