### 迁移学习

- 通过运行上方的代码，大概知道了各个模型的一个表现情况，因为VGG19的表现相对较差而，模型较大，训练起来会偏慢，所以放弃这个模型；
- 下面需要对剩下的三个模型进行迁移学习，首先我们获取三个模型的base部分，即不包含最后全连接层的模型；
- 然后我们用这些基础模型根据data generator来做预测得到3个模型的bottleneck_feature

In [20]:
from keras.models import *
from keras.layers import *
from keras.applications import *
from keras.preprocessing.image import *
import h5py
import numpy as np
import cv2
import matplotlib.pyplot as plt
from datetime import datetime
%matplotlib inline 

# save features
def save_bottle_neck_features(filename, train_features, test_features, train_labels):
    with h5py.File("bottleneck_features/{}.hdf5".format(filename), "w") as f:
        dset = f.create_dataset("train", data=train_features)
        dset = f.create_dataset("test", data=test_features)
        dset = f.create_dataset("label", data=train_labels)
    

In [16]:
# 定义input_tensor
input_tensor = Input(shape=(224, 224, 3))
# 获取base model
resnet_base_model = ResNet50(weights='imagenet', include_top=False, input_tensor=input_tensor)
# 加一层pooling，不然数据太大了！而且可以防止拟合
model = Model(resnet_base_model.input, GlobalAveragePooling2D()(resnet_base_model.output))
# 记录当前时间
start_time = datetime.now()

train_data_path = 'data/train/'
test_data_path = 'data/test/'
train_data_size = 25000
test_data_size = 12500
image_size = (224, 224)

generator = ImageDataGenerator()

train_generator = generator.flow_from_directory(train_data_path, image_size, shuffle=False, batch_size=16)
test_generator = generator.flow_from_directory(test_data_path, image_size, shuffle=False, batch_size=16)

train_bottleneck_features = model.predict_generator(train_generator, train_data_size)
test_bottleneck_features = model.predict_generator(train_generator, test_data_size)

save_bottle_neck_features("restnet_bottleneck_features", train_bottleneck_features, test_bottleneck_features, 
                          train_generator.classes)
    
end_time = datetime.now()

print("Extract feature finished! Total consumed: {} ".format(end_time - start_time))

Found 25000 images belonging to 2 classes.
Found 0 images belonging to 0 classes.
Extrac feature finished! Total consumed: 0:57:18.320957 


In [24]:
# 定义input_tensor
input_tensor = Input(shape=(299, 299, 3))
# 预处理数据
input_tensor = Lambda(xception.preprocess_input)(input_tensor)
# 获取xception base model
xception_base_model = Xception(weights='imagenet', include_top=False, input_tensor=input_tensor)
model = Model(xception_base_model.input, GlobalAveragePooling2D()(xception_base_model.output))
# 重新定义image_size
image_size = (299, 299)
# 记录当前时间
start_time = datetime.now()

train_generator = generator.flow_from_directory(train_data_path, image_size, shuffle=False, batch_size=16)
test_generator = generator.flow_from_directory(test_data_path, image_size, shuffle=False, batch_size=16)

train_bottleneck_features = model.predict_generator(train_generator, train_data_size)
test_bottleneck_features = model.predict_generator(train_generator, test_data_size)

save_bottle_neck_features("xception_bottleneck_features", train_bottleneck_features, test_bottleneck_features, 
                          train_generator.classes)
    
end_time = datetime.now()

print("Extract feature finished! Total consumed: {} ".format(end_time - start_time))

Found 25000 images belonging to 2 classes.
Found 0 images belonging to 0 classes.
Extract feature finished! Total consumed: 1:51:56.502403 


In [25]:
# 定义input_tensor
input_tensor = Input(shape=(299, 299, 3))
# 预处理数据
input_tensor = Lambda(inception_v3.preprocess_input)(input_tensor)
# 获取inception base model
inception_base_model = InceptionV3(weights='imagenet', include_top=False, input_tensor=input_tensor)
# 加一层pooling，不然数据太大了！而且可以防止拟合
model = Model(inception_base_model.input, GlobalAveragePooling2D()(inception_base_model.output))
# 记录当前时间
start_time = datetime.now()

train_bottleneck_features = model.predict_generator(train_generator, train_data_size)
test_bottleneck_features = model.predict_generator(train_generator, test_data_size)

save_bottle_neck_features("inception_bottleneck_features", train_bottleneck_features, test_bottleneck_features, 
                          train_generator.classes)
    
end_time = datetime.now()

print("Extract feature finished! Total consumed: {} ".format(end_time - start_time))

Extract feature finished! Total consumed: 1:13:46.941855 


In [None]:
def model_train_and_predict(bottleneck_file_path):
    
    train_x = []
    train_y = []
    text_x = []
    
    with h5py.File(bottleneck_file_path, 'r') as h:
        train_x = np.array(h['train'])
        train_y = np.array(h['label'])
        text_x = np.array(h['test'])
    
    input_tensor = Input(train_x.shape[1:])
    x = Dropout(0.5)(x)
    x = Dense(1, activation='sigmod')(x)
    model = Model(input_tensor, x)
    # compile model
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    # fit model
    model.fit(train_x, train_y, batch_size=120, nb_epoch=10, validation_split=0.2)
    # predict
    preds = model.predict(text_x)
    # generate kaggle submission file
    
    

In [40]:
train_x = []
train_y = []
text_x = []

with h5py.File('bottleneck_features\inception_bottleneck_features.hdf5', 'r') as h:
    train_x = np.array(h['train'])
    train_y = np.array(h['label'])
    text_x = np.array(h['test'])
    
print(train_x.shape, train_y.shape, text_x.shape)

(399880, 2048) (25000,) (199944, 2048)


In [41]:
train_x = np.concatenate(train_x, axis=1)
text_x = np.concatenate(text_x, axis=1)

input_tensor = Input(train_x.shape[1:])
x = Dropout(0.5)(input_tensor)
x = Dense(1, activation='sigmoid')(x)
model = Model(input_tensor, x)
# compile model
model.compile(optimizer='adadelta', loss='binary_crossentropy', metrics=['accuracy'])
# fit model
model.fit(train_x, train_y, batch_size=120, epochs=10, validation_split=0.2)



AxisError: axis 1 is out of bounds for array of dimension 1