## 2. Feature-Extraction-from-VGG16-to-InceptionV3

### References:
1. https://github.com/ypwhs/dogs_vs_cats
2. https://www.kaggle.com/yangpeiwen/keras-inception-xception-0-47

## Run name

In [1]:
import time

# Static identifiers for this notebook; combined with a timestamp they form
# the run name echoed here and again in the final cell.
project_name = 'Google-LandMark-Rec2019'
step_name = '2-Feature-Extraction-from-VGG16-to-InceptionV3'

# strftime formats the current local time when no time tuple is supplied.
time_str = time.strftime("%Y%m%d-%H%M%S")
run_name = '_'.join([project_name, step_name, time_str])
print('run_name: %s' % run_name)

# Reference point for the "Time elapsed" report at the end of the notebook.
t0 = time.time()

run_name: Google-LandMark-Rec2019_2-Feature-Extraction-from-VGG16-to-InceptionV3_20190511-015852


## Important parameters

In [2]:
import multiprocessing

# NOTE(review): train_tar_count is not used by any cell visible in this
# notebook; presumably the number of training archives - confirm.
train_tar_count = 500
# CPU count reported by the machine running the kernel.
cpu_amount = multiprocessing.cpu_count()

print('train_tar_count: %d' % train_tar_count)
print('cpu_amount:  %d' % cpu_amount)

train_tar_count: 500
cpu_amount:  4


## Import packages

In [3]:
import os, time, math, h5py, pickle

from keras.layers import *
from keras.models import *
from keras.applications import *
from keras.optimizers import *
from keras.regularizers import *
from keras.preprocessing.image import *
from keras.applications.vgg16 import preprocess_input as preprocess_input_vgg16
from keras.applications.vgg19 import preprocess_input as preprocess_input_vgg19
from keras.applications.inception_v3 import preprocess_input as preprocess_input_inception_v3

Using TensorFlow backend.


In [4]:
import tensorflow as tf
import keras.backend.tensorflow_backend as KTF

# Create a TensorFlow session whose GPU memory fraction for this process is
# set to 0.5 instead of the default allocation.
config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.5
session = tf.Session(config=config)

# Hand the configured session to the Keras TensorFlow backend.
KTF.set_session(session )

## Project folders

In [5]:
# All project folders live directly under the directory the kernel was started in.
cwd = os.getcwd()
feature_folder, input_folder, output_folder, model_folder = (
    os.path.join(cwd, sub_dir)
    for sub_dir in ('feature', 'input', 'output', 'model')
)

In [6]:
# Ensure <input>/data_test/test exists. flow_from_directory expects images to
# sit inside a sub-folder of the directory it is pointed at, hence the extra
# 'test' level below 'data_test'.
data_test_folder = os.path.join(input_folder, 'data_test')
if not os.path.isdir(data_test_folder):
    # makedirs also creates a missing parent `input` folder, where os.mkdir
    # would raise FileNotFoundError. If a regular file occupies the path this
    # raises FileExistsError instead of being silently accepted.
    os.makedirs(data_test_folder, exist_ok=True)
    print('create folder:', data_test_folder)

data_test_subfolder = os.path.join(data_test_folder, 'test')
if not os.path.isdir(data_test_subfolder):
    os.makedirs(data_test_subfolder, exist_ok=True)
    print('create folder:', data_test_subfolder)
else:
    print('folder exists:', data_test_subfolder)

folder exists: /data/landmark-recognition-2019/input/data_test/test


## Functions

In [7]:
def pickle_dump(data, file):
    """Serialize ``data`` with pickle into the file at path ``file``.

    Args:
        data: Any picklable object.
        file: Destination path. It is opened in binary write mode, so an
            existing file at that path is overwritten.
    """
    with open(file, mode='wb') as sink:
        pickle.Pickler(sink).dump(data)

def pickle_load(file):
    """Read and return the pickled object stored in the file at path ``file``.

    Args:
        file: Path of a file previously written with pickle; opened in
            binary read mode.

    Returns:
        The unpickled object.

    Note:
        Unpickling can execute arbitrary code, so only load files you trust.
    """
    with open(file, mode='rb') as source:
        return pickle.Unpickler(source).load()

# Round-trip smoke test for pickle_dump / pickle_load.
a = list(range(10))
print(a)
demo_file = os.path.join(os.getcwd(), 'temp', 'pickle_demo.pkl')
print(demo_file)
# Opening a file for writing does not create missing parent folders, so make
# sure `temp` exists before dumping into it.
os.makedirs(os.path.dirname(demo_file), exist_ok=True)
pickle_dump(a, demo_file)
new_a = pickle_load(demo_file)
print(new_a)
assert new_a == a, 'pickle round-trip changed the data'

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
/data/landmark-recognition-2019/temp/pickle_demo.pkl
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


In [8]:
def get_max_factor(num, batch_size):
    """Return the largest divisor of ``num`` that does not exceed ``batch_size``.

    Used to pick a batch size that splits ``num`` items into equally sized
    batches.

    Args:
        num: Integer number of items to split.
        batch_size: Upper bound (inclusive) for the returned divisor; must be
            a positive integer.

    Returns:
        The largest ``f`` with ``1 <= f <= batch_size`` and ``num % f == 0``.
        For ``num == 0`` this is ``batch_size`` itself.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1. Without this check
            the search loop would not run and ``None`` would be returned.
    """
    if batch_size < 1:
        raise ValueError(
            'batch_size must be a positive integer, got {0!r}'.format(batch_size))
    # Search downwards; 1 divides every integer, so this always returns for
    # integer input.
    for factor in range(batch_size, 0, -1):
        if num % factor == 0:
            return factor
        
# Sanity checks: largest divisor of 10 under each batch-size limit.
assert [get_max_factor(10, limit) for limit in (5, 3, 2, 1, 7)] == [5, 2, 2, 1, 5]

In [9]:
%%time
def get_image_amount(folder, is_show_progress=False):
    """Count every non-directory entry below ``folder``, recursively.

    Despite the name, no file-type filtering is done: every entry that is not
    a directory counts, whatever its extension.

    Args:
        folder: Root path to scan. A path to a single file counts as 1; a
            path that does not exist counts as 0.
        is_show_progress: When true, print ``*`` after every 100 counted
            files and end the line with ``|`` after every 10000.

    Returns:
        int: Number of non-directory entries found.
    """
    if not os.path.exists(folder):
        # A missing root is "no files", not one file.
        return 0

    count = 0
    pending = [folder]
    while pending:
        # Pop from the end: O(1), whereas pop(0) shifts the whole list on
        # every step. Visit order does not affect the total.
        item = pending.pop()
        if os.path.isdir(item):
            pending.extend(os.path.join(item, name) for name in os.listdir(item))
            continue
        count += 1
        # Report progress only right after the count changed, so no marker is
        # printed at count 0 or repeated while directories are being expanded.
        if not is_show_progress:
            continue
        if count % 100 == 0:
            print('*', end='')
        if count % 10000 == 0:
            print('|')
    return count

# Count the files under the test image folder, printing progress markers.
print(get_image_amount(data_test_subfolder, is_show_progress=True))

*|
****************************************************************************************************|
****************************************************************************************************|
****************************************************************************************************|
****************************************************************************************************|
****************************************************************************************************|
****************************************************************************************************|
****************************************************************************************************|
****************************************************************************************************|
****************************************************************************************************|
*******************************************************************************

In [10]:
def get_features(MODEL, model_name, image_size, folder_name, batch_size=1, lambda_func=None):
    """Extract pooled ImageNet features for the images in ``input/<folder_name>``.

    Builds ``MODEL`` without its classification head, appends global average
    pooling, runs it over the images yielded by ``flow_from_directory``
    (unshuffled) and writes two files into ``<cwd>/feature``:

    * ``feature_<model_name>_<folder_name>_171023_class_indices.pkl`` - the
      generator's ``class_indices`` mapping, pickled.
    * ``feature_<model_name>_<folder_name>_171023.h5`` - datasets
      ``x_<folder_name>`` (features), ``classes_<folder_name>`` and
      ``index_<folder_name>``.

    Args:
        MODEL: Keras application constructor (e.g. ``VGG19``) accepting
            ``input_tensor``, ``weights`` and ``include_top``.
        model_name: Label used in log output and in the output file names.
        image_size: ``(width, height)`` of the network input.
        folder_name: Sub-folder of ``<cwd>/input`` to read images from.
        batch_size: Upper bound for the batch size; it is reduced to the
            largest divisor of the file count that does not exceed it.
        lambda_func: Optional preprocessing function, wrapped in a ``Lambda``
            layer in front of the base model.

    Returns:
        None. Results are written to disk and summarised on stdout.
    """
    print('{0} start.'.format(model_name))
    start_time = time.time()

    width = image_size[0]
    height = image_size[1]
    input_tensor = Input((height, width, 3))
    x = input_tensor
    if lambda_func:
        print(lambda_func.__name__)
        x = Lambda(lambda_func)(x)
    base_model = MODEL(input_tensor=x, weights='imagenet', include_top=False)
    model = Model(base_model.input, GlobalAveragePooling2D()(base_model.output))

    cwd = os.getcwd()
    data_train_path = os.path.join(cwd, 'input', folder_name)
    print(data_train_path)
    image_amount = get_image_amount(data_train_path)
    batch_size = get_max_factor(image_amount, batch_size)
    print('image_amount: %d, batch_size: %d' % (image_amount, batch_size))

    gen = ImageDataGenerator()
    train_generator = gen.flow_from_directory(
        data_train_path,
        # Keras expects target_size as (height, width), while image_size is
        # (width, height); passing it through unchanged would feed transposed
        # dimensions to the model for non-square inputs.
        target_size=(height, width),
        shuffle=False,
        batch_size=batch_size
    )

    # Round up so a trailing partial batch is still predicted. The batch size
    # was derived from the raw file count, which can differ from the number of
    # images the generator accepted; rounding down would then leave `train`
    # shorter than `classes`.
    train_steps = math.ceil(len(train_generator.classes) / batch_size)
    print('train_steps:', train_steps)
    train = model.predict_generator(
        train_generator,
        steps=train_steps,
        max_queue_size=8*batch_size,
        verbose=1
    )

    feature_dir = os.path.join(cwd, 'feature')
    os.makedirs(feature_dir, exist_ok=True)
    # NOTE(review): 171023 is a fixed tag baked into the file names (presumably
    # a date stamp); kept unchanged so existing consumers still find the files.
    file_stem = 'feature_{0}_{1}_{2}'.format(model_name, folder_name, 171023)

    # Both files are opened in truncating write mode, so a stale file from an
    # earlier run is replaced without an explicit os.remove.
    class_indices_file = os.path.join(feature_dir, file_stem + '_class_indices.pkl')
    print(class_indices_file)
    pickle_dump(train_generator.class_indices, class_indices_file)

    h5py_file_name = os.path.join(feature_dir, file_stem + '.h5')
    print(h5py_file_name)
    # Explicit 'w': the default mode is read-only in h5py 3.x, which cannot
    # create the file.
    with h5py.File(h5py_file_name, 'w') as h:
        h.create_dataset('x_%s' % folder_name, data=train)
        h.create_dataset('classes_%s' % folder_name, data=train_generator.classes)
        h.create_dataset('index_%s' % folder_name, data=train_generator.index_array)

    # Summary of what was extracted and of the generator state.
    print(train.shape)
    print(len(train_generator.classes))
    print(train_generator.samples)
    print(train_generator.image_shape)
    print(train_generator.classes)
    print(train_generator.num_classes)
    print(train_generator.batch_index)
    print(train_generator.index_generator)
    print(train_generator.index_array)
    print(type(train_generator.class_indices))

    end_time = time.time()
    print('Spend time: {0} s'.format(end_time-start_time))

## Extract feature

In [11]:
# get_features(VGG16, 'VGG16', (224, 224), 'data_train', 1)
# get_features(VGG16, 'VGG16', (224, 224), 'data_val', 17, preprocess_input_vgg16)
# get_features(VGG16, (224, 224), 'data_test', 1)

In [12]:
# demo_file = os.path.join(os.getcwd(), 'feature', 'feature_VGG16_data_val_171023_class_indices.pkl')
# print(demo_file)
# # pickle_dump(a, demo_file)
# new_a = pickle_load(demo_file)
# print(new_a)

In [13]:
!ls ./input

50000_00       data_train_27  data_train_61
50000_01       data_train_28  data_train_62
50000_02       data_train_29  data_train_63
50000_03       data_train_30  data_train_64
50000_04       data_train_31  data_train_65
all_image      data_train_32  data_train_66
data_test      data_train_33  data_train_67
data_train_00  data_train_34  data_train_68
data_train_01  data_train_35  data_train_69
data_train_02  data_train_36  data_train_70
data_train_03  data_train_37  data_train_71
data_train_04  data_train_38  data_train_72
data_train_05  data_train_39  data_train_73
data_train_06  data_train_40  data_train_74
data_train_07  data_train_41  data_train_75
data_train_08  data_train_42  data_train_76
data_train_09  data_train_43  data_train_77
data_train_10  data_train_44  data_train_78
data_train_11  data_train_45  data_val_00
data_train_12  data_train_46  data_val_01
data_train_13  data_train_47  data_val_02
data_train_14  data_train_48  data_val_03
data_train_15  dat

In [14]:
!ls ./feature -hl

total 8.4G
-rw-r--r-- 1 ubuntu ubuntu 830K May  3 12:19 feature_VGG16_data_val_171023.h5
-rw-r--r-- 1 ubuntu ubuntu 631K May  6 17:05 feature_VGG19_data_train_00_171023_class_indices.pkl
-rw-r--r-- 1 ubuntu ubuntu  99M May  6 17:05 feature_VGG19_data_train_00_171023.h5
-rw-r--r-- 1 ubuntu ubuntu 634K May  6 17:30 feature_VGG19_data_train_01_171023_class_indices.pkl
-rw-r--r-- 1 ubuntu ubuntu  99M May  6 17:30 feature_VGG19_data_train_01_171023.h5
-rw-r--r-- 1 ubuntu ubuntu 634K May  6 18:01 feature_VGG19_data_train_02_171023_class_indices.pkl
-rw-r--r-- 1 ubuntu ubuntu  99M May  6 18:01 feature_VGG19_data_train_02_171023.h5
-rw-r--r-- 1 ubuntu ubuntu 631K May  6 18:33 feature_VGG19_data_train_03_171023_class_indices.pkl
-rw-r--r-- 1 ubuntu ubuntu  99M May  6 18:33 feature_VGG19_data_train_03_171023.h5
-rw-r--r-- 1 ubuntu ubuntu 634K May  6 19:06 feature_VGG19_data_train_04_171023_class_indices.pkl
-rw-r--r-- 1 ubuntu ubuntu  99M May  6 19:06 feature_VGG19_data_train_04_17102

In [15]:
# list_dir = list(os.listdir(input_folder))
# list_dir.sort()
# count = 0
# for sub_folder_name in list_dir:
#     if not (sub_folder_name.startswith('data_train_')):
#         continue
#     print(sub_folder_name)
#     sub_folder = os.path.join(input_folder, sub_folder_name)
#     if not os.path.isdir(sub_folder):
#         print('Folder don`t exists:', sub_folder)
#         continue
#     print(sub_folder)
#     count += 1
#     if count <= 22:
#         continue
#     get_features(VGG19, 'VGG19', (224, 224), sub_folder_name, 16, preprocess_input_vgg19)
    
# get_features(VGG19, 'VGG19', (224, 224), 'data_val', 1)

In [16]:
# new_a = pickle_load('''/data/landmark-recognition-2019/feature/feature_VGG19_data_train_00_171023_class_indices.pkl''')
# print(new_a)

In [17]:
# Extract VGG19 features for the test images (224x224 input, batch size capped at 16).
get_features(
    MODEL=VGG19,
    model_name='VGG19',
    image_size=(224, 224),
    folder_name='data_test',
    batch_size=16,
    lambda_func=preprocess_input_vgg19,
)

VGG19 start.
wrapper
Instructions for updating:
Colocations handled automatically by placer.
/data/landmark-recognition-2019/input/data_test
image_amount: 112749, batch_size: 13
Found 112749 images belonging to 1 classes.
train_steps: 8673
/data/landmark-recognition-2019/feature/feature_VGG19_data_test_171023_class_indices.pkl
/data/landmark-recognition-2019/feature/feature_VGG19_data_test_171023.h5
(112749, 512)
112749
112749
(224, 224, 3)
[0 0 0 ... 0 0 0]
1
1
<generator object Iterator._flow_index at 0x7f64ceea8308>
[     0      1      2 ... 112746 112747 112748]
<class 'dict'>
Spend time: 8722.146973371506 s


In [18]:
# get_features(ResNet50, (224, 224), 1)

In [19]:
# get_features(Xception, (299, 299), 1, xception.preprocess_input)

In [20]:
# get_features(InceptionV3, (299, 299), 1, inception_v3.preprocess_input)

In [21]:
# get_features(InceptionResNetV2, (299, 299), 1, inception_v3.preprocess_input)

In [22]:
# Wall-clock time since t0 was captured in the first cell, followed by the run identifier.
print('Time elapsed: {0:.1f}s'.format(time.time() - t0))
print(run_name)

Time elapsed: 8728.4s
Google-LandMark-Rec2019_2-Feature-Extraction-from-VGG16-to-InceptionV3_20190511-015852
