# 2. 特征提取_从VGG16到InceptionResNetV2

**References**:
- https://github.com/ypwhs/dogs_vs_cats
- https://www.kaggle.com/yangpeiwen/keras-inception-xception-0-47

## Run name

In [1]:
import time

project_name = 'Google_LandMark_Rec'
step_name = 'FeatureExtraction'
time_str = time.strftime("%Y%m%d_%H%M%S", time.localtime())
run_name = project_name + '_' + step_name + '_' + time_str
print('run_name: ' + run_name)
t0 = time.time()

run_name: Google_LandMark_Rec_FeatureExtraction_20180421_054321


## 导入包

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
%matplotlib inline
from IPython.display import display

import os
import gc
import math
import shutil
import zipfile
import pickle
import h5py
from PIL import Image

from tqdm import tqdm
from multiprocessing import cpu_count

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score

  from ._conv import register_converters as _register_converters


## Project folders

In [3]:
cwd = os.getcwd()
input_folder = os.path.join(cwd, 'input')
output_folder = os.path.join(cwd, 'output')
model_folder = os.path.join(cwd, 'model')
feature_folder = os.path.join(cwd, 'feature')
post_pca_feature_folder = os.path.join(cwd, 'post_pca_feature')
log_folder = os.path.join(cwd, 'log')
print('input_folder: \t\t\t', input_folder)
print('output_folder: \t\t\t', output_folder)
print('model_folder: \t\t\t', model_folder)
print('feature_folder: \t\t', feature_folder)
print('post_pca_feature_folder: \t', post_pca_feature_folder)
print('log_folder: \t\t\t', log_folder)

org_train_folder = os.path.join(input_folder, 'org_train')
org_test_folder = os.path.join(input_folder, 'org_test')
train_folder = os.path.join(input_folder, 'data_train')
val_folder = os.path.join(input_folder, 'val_train')
test_folder = os.path.join(input_folder, 'data_test')
test_sub_folder = os.path.join(test_folder, 'test')
print('\norg_train_folder: \t\t', org_train_folder)
print('org_test_folder: \t\t', org_test_folder)
print('train_folder: \t\t\t', train_folder)
print('val_folder: \t\t\t', val_folder)
print('test_folder: \t\t\t', test_folder)
print('test_sub_folder: \t\t', test_sub_folder)

if not os.path.exists(post_pca_feature_folder):
    os.mkdir(post_pca_feature_folder)
    print('Create folder: %s' % post_pca_feature_folder)

input_folder: 			 /data1/kaggle/landmark-recognition-challenge/input
output_folder: 			 /data1/kaggle/landmark-recognition-challenge/output
model_folder: 			 /data1/kaggle/landmark-recognition-challenge/model
feature_folder: 		 /data1/kaggle/landmark-recognition-challenge/feature
post_pca_feature_folder: 	 /data1/kaggle/landmark-recognition-challenge/post_pca_feature
log_folder: 			 /data1/kaggle/landmark-recognition-challenge/log

org_train_folder: 		 /data1/kaggle/landmark-recognition-challenge/input/org_train
org_test_folder: 		 /data1/kaggle/landmark-recognition-challenge/input/org_test
train_folder: 			 /data1/kaggle/landmark-recognition-challenge/input/data_train
val_folder: 			 /data1/kaggle/landmark-recognition-challenge/input/val_train
test_folder: 			 /data1/kaggle/landmark-recognition-challenge/input/data_test
test_sub_folder: 		 /data1/kaggle/landmark-recognition-challenge/input/data_test/test


## 使用预训练权重的VGG16、VGG19、ResNet50、Xception、InceptionV3和InceptionResNetV2模型提取特征

In [4]:
from keras.layers import *
from keras.models import *
from keras.applications import *
from keras.optimizers import *
from keras.regularizers import *
from keras.preprocessing.image import *
from keras.applications.inception_v3 import preprocess_input

Using TensorFlow backend.


In [5]:
def get_features(MODEL, image_size, date_str, lambda_func=None, batch_size=1, is_aug=False):
    print('{0} start.'.format(MODEL.__name__))
    cpu_amount = cpu_count()
    print('cpu_amount: ', cpu_amount)
    start_time = time.time()
    width = image_size
    height = image_size
    
    cwd = os.getcwd()
    folder_path = os.path.join(cwd, 'feature')
    if not os.path.exists(folder_path):
        os.mkdir(folder_path)
        print('Created folder: %s' % folder_path)
    else:
        print('Existed folder: %s' % folder_path)
    file_name = os.path.join(folder_path, 'feature_{0}_{1}_{2}.h5'.format(MODEL.__name__, width, date_str))
    print(file_name)
    if os.path.exists(file_name):
        os.remove(file_name)
    
    input_tensor = Input((height, width, 3))
    x = input_tensor
    if lambda_func:
        print(lambda_func.__name__)
        x = Lambda(lambda_func)(x)
    base_model = MODEL(input_tensor=x, weights='imagenet', input_shape=(height, width, 3), include_top=False)
    model = Model(base_model.input, GlobalAveragePooling2D()(base_model.output))
    
    train_folder = os.path.join(cwd, 'input', 'data_train')
    val_folder = os.path.join(cwd, 'input', 'data_val')
    test_folder  = os.path.join(cwd, 'input', 'data_test')
    
    if is_aug:
        print('have augumentation')
        train_gen = ImageDataGenerator(zoom_range = 0.2,
                                 height_shift_range = 0.2,
                                 width_shift_range = 0.2,
                                 rotation_range = 20)
    else:
        print('do not have augumentation')
        train_gen = ImageDataGenerator()
    val_gen = ImageDataGenerator()
    test_gen = ImageDataGenerator()

    train_generator = train_gen.flow_from_directory(
        train_folder, 
        (image_size, image_size), 
        shuffle=False, 
        batch_size=batch_size
    )
    val_generator = val_gen.flow_from_directory(
        val_folder, 
        (image_size, image_size), 
        shuffle=False, 
        batch_size=batch_size
    )
    test_generator  = test_gen.flow_from_directory(
        test_folder,  
        (image_size, image_size), 
        shuffle=False, 
        batch_size=batch_size
    )
    
    
    print('train_generator')
    print(len(train_generator.filenames))
    print(train_generator.filenames[:10])
    train_generator_steps = math.ceil(len(train_generator.filenames)/batch_size)
    print('train_generator_steps=%d / %d = %d' % (len(train_generator.filenames), batch_size, train_generator_steps))
    train = model.predict_generator(train_generator, verbose=1, steps=train_generator_steps, max_queue_size=2048, workers=cpu_amount, use_multiprocessing=True)

    print('val_generator')
    print(len(val_generator.filenames))
    print(val_generator.filenames[:10])
    val_generator_steps = math.ceil(len(val_generator.filenames)/batch_size)
    print('val_generator_steps=%d' % val_generator_steps)
    print('val_generator_steps=%d / %d = %d' % (len(val_generator.filenames), batch_size, val_generator_steps))
    val = model.predict_generator(val_generator, verbose=1, steps=val_generator_steps, max_queue_size=2048, workers=cpu_amount, use_multiprocessing=True)

    print('test_generator')
    print(len(test_generator.filenames))
    print(test_generator.filenames[:10])
    test_generator_steps = math.ceil(len(test_generator.filenames)/batch_size)
    print('test_generator_steps=%d' % test_generator_steps)
    print('test_generator_steps=%d / %d = %d' % (len(test_generator.filenames), batch_size, test_generator_steps))
    test = model.predict_generator(test_generator, verbose=1, steps=test_generator_steps, max_queue_size=2048, workers=cpu_amount, use_multiprocessing=True)

    
    with h5py.File(file_name) as h:
        h.create_dataset("train", data=train)
        h.create_dataset("train_labels", data=train_generator.classes)
        h.create_dataset("val", data=val)
        h.create_dataset("val_labels", data=val_generator.classes)
        h.create_dataset("test", data=test)
    
    print(train.shape)
    print(train_generator.classes)
    print(val.shape)
    print(val_generator.classes)
    print(test.shape)
    
    end_time = time.time()
    print('Spend time: {0} s'.format(end_time-start_time))

In [6]:
def get_all_features(image_size=150, batch_size=1, is_aug=False):
    time_str = time.strftime("%Y%m%d-%H%M%S", time.localtime())
    print('*' * 60)
    print(time_str)
#     get_features(VGG16, image_size, time_str, vgg16.preprocess_input, batch_size, is_aug)
#     get_features(Xception, image_size, time_str, xception.preprocess_input, batch_size, is_aug)
#     get_features(InceptionV3, image_size, time_str, inception_v3.preprocess_input, batch_size, is_aug)
    get_features(InceptionResNetV2, image_size, time_str, inception_resnet_v2.preprocess_input, batch_size, is_aug)

In [7]:
get_all_features(300, 512, False)

************************************************************
20180421-054322
InceptionResNetV2 start.
cpu_amount:  8
Existed folder: /data1/kaggle/landmark-recognition-challenge/feature
/data1/kaggle/landmark-recognition-challenge/feature/feature_InceptionResNetV2_300_20180421-054322.h5
preprocess_input
do not have augumentation
Found 1193691 images belonging to 14951 classes.
Found 24362 images belonging to 14951 classes.
Found 115619 images belonging to 1 classes.
train_generator
1193691
['00000/0439f888c5af0e99.jpg', '00000/05aaa786f5c9e0d1.jpg', '00000/063e56d977e00da1.jpg', '00000/06ac932cbf89ce44.jpg', '00000/126ee1b60065dbd4.jpg', '00000/14d86ba0c00b16d1.jpg', '00000/17a57bb3fa8c2d4e.jpg', '00000/1886e9f023806d4a.jpg', '00000/1f7e7418023935ee.jpg', '00000/22e28089dac709f0.jpg']
train_generator_steps=1193691 / 512 = 2332
val_generator
24362
['00000/4e8ab93c1620e8a3.jpg', '00000/90187c0b6f3fa112.jpg', '00000/e8f5d139190cf632.jpg', '00003/1ecb7b8fbe3ad95f.jpg', '00003/2a71be02ed724

In [8]:
print('Time cost: %.2f s' % (time.time() - t0))

print(run_name)
print('Done!')

Time cost: 18166.43 s
Google_LandMark_Rec_FeatureExtraction_20180421_054321
Done!
