# 2. PCA

Reference:
- http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.IncrementalPCA.html#sklearn.decomposition.IncrementalPCA

## Run name

In [1]:
import time
import os
import pandas as pd

# Build a unique identifier for this run: <project>_<step>_<local timestamp>.
project_name = 'Google_LandMark_Rec'
step_name = 'PCA'
# strftime() formats the current local time when no time tuple is passed.
time_str = time.strftime("%Y%m%d_%H%M%S")
run_name = '_'.join((project_name, step_name, time_str))
print('run_name: ', run_name, sep='')

run_name: Google_LandMark_Rec_PCA_20180326_123654


## 项目文件夹

In [2]:
# Every project directory is resolved relative to the current working directory.
cwd = os.getcwd()
(input_folder,
 output_folder,
 model_folder,
 feature_folder,
 post_pca_feature_folder,
 log_folder) = (
    os.path.join(cwd, sub_dir)
    for sub_dir in ('input', 'output', 'model', 'feature', 'post_pca_feature', 'log')
)

# Report the resolved locations; the tab padding lines the paths up in the output.
for label, folder in (
    ('input_folder: \t\t\t', input_folder),
    ('output_folder: \t\t\t', output_folder),
    ('model_folder: \t\t\t', model_folder),
    ('feature_folder: \t\t', feature_folder),
    ('post_pca_feature_folder: \t', post_pca_feature_folder),
    ('log_folder: \t\t\t', log_folder),
):
    print(label + folder)

# Data directories that live underneath the input folder.
org_train_folder, org_test_folder, train_folder, test_folder = (
    os.path.join(input_folder, sub_dir)
    for sub_dir in ('org_train', 'org_test', 'data_train', 'data_test')
)
test_sub_folder = os.path.join(test_folder, 'test')

# The reduced features are written by this notebook, so make sure their
# destination folder exists before anything is saved there.
if not os.path.exists(post_pca_feature_folder):
    os.mkdir(post_pca_feature_folder)
    print('Create folder: %s' % post_pca_feature_folder)

input_folder: 			/data1/kaggle/landmark-recognition-challenge/input
output_folder: 			/data1/kaggle/landmark-recognition-challenge/output
model_folder: 			/data1/kaggle/landmark-recognition-challenge/model
feature_folder: 		/data1/kaggle/landmark-recognition-challenge/feature
post_pca_feature_folder: 	/data1/kaggle/landmark-recognition-challenge/post_pca_feature
log_folder: 			/data1/kaggle/landmark-recognition-challenge/log


## 加载feature

In [3]:
%%time
import h5py
import numpy as np
from sklearn.utils import shuffle
np.random.seed(2018)

x_train = []
y_train = {}
x_val = []
y_val = {}
x_test = []

model_name = 'InceptionResNetV2'
image_size = 200
time_str = '20180312-050926'
cwd = os.getcwd()

feature_model = os.path.join(cwd, 'feature', 'feature_%s_%s_%s.h5' % (model_name, image_size, time_str))

# feature_cgg16 = os.path.join(cwd, 'feature', 'feature_VGG16_{}.h5'.format(20180219))
# feature_cgg19 = os.path.join(cwd, 'feature', 'feature_VGG19_{}.h5'.format(20180219))
# feature_resnet50 = os.path.join(cwd, 'feature', 'feature_ResNet50_{}.h5'.format(20180220))
# feature_xception = os.path.join(cwd, 'feature', 'feature_Xception_%s_%s.h5' % (image_size, time_str))
# feature_inceptionV3 = os.path.join(cwd, 'feature', 'feature_InceptionV3_%s_%s.h5' % (image_size, time_str))
# feature_inceptionResNetV2 = os.path.join(cwd, 'feature', 'feature_InceptionResNetV2_%s_%s.h5' % (image_size, time_str))
# for filename in [feature_cgg16, feature_cgg19, feature_resnet50, feature_xception, feature_inception, feature_inceptionResNetV2]:
for filename in [feature_model]:
    with h5py.File(filename, 'r') as h:
        x_train.append(np.array(h['train']))
        y_train = np.array(h['train_labels'])
#         x_val.append(np.array(h['val']))
#         y_val = np.array(h['val_labels'])
        x_test.append(np.array(h['test']))

  from ._conv import register_converters as _register_converters


CPU times: user 2.62 s, sys: 9.49 s, total: 12.1 s
Wall time: 5min 3s


In [4]:
# Sanity check: shape of the first train feature block, number of train
# labels, and shape of the first test feature block (one value per line).
print(x_train[0].shape, len(y_train), x_test[0].shape, sep='\n')

(1219426, 1536)
1219426
(115942, 1536)


In [5]:
%%time
x_train = np.concatenate(x_train, axis=-1)
x_test = np.concatenate(x_test, axis=-1)
print(x_train.shape)
print(x_test.shape)

(1219426, 1536)
(115942, 1536)
CPU times: user 2.05 s, sys: 3.23 s, total: 5.28 s
Wall time: 5.27 s


In [6]:
# %%time
# from sklearn.utils import shuffle
# (x_train, y_train) = shuffle(x_train, y_train)

## PCA降维

In [None]:
%time
from sklearn.decomposition import IncrementalPCA, PCA
t0 = time.time()
n_components = 32

ipca = IncrementalPCA(n_components=n_components, batch_size=512, copy=False)
ipca.fit(x_train)
ipca.fit(x_test)
red_x_train = ipca.transform(x_train)
red_x_test = ipca.transform(x_test)

print(red_x_train.shape)
print(red_x_test.shape)
t1 = time.time()
print('Spend time: {0} s'.format(t1-t0))

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 5.25 µs


In [None]:
def save_2_h5py(file_name, x_train, y_train, x_test):
    """Write train features, train labels and test features to an HDF5 file.

    Any existing file at ``file_name`` is removed first (a message is
    printed), then a new file is created with three datasets:
    ``"train"``, ``"train_labels"`` and ``"test"``.

    Args:
        file_name: Path of the HDF5 file to create.
        x_train: Data stored in the ``"train"`` dataset.
        y_train: Data stored in the ``"train_labels"`` dataset.
        x_test: Data stored in the ``"test"`` dataset.
    """
    if os.path.exists(file_name):
        print('Remove file: %s' % file_name)
        os.remove(file_name)
    # Open explicitly in write mode. Without a mode argument the behaviour
    # depends on the installed h5py version (recent releases default to
    # read-only, which fails because the file does not exist yet).
    with h5py.File(file_name, 'w') as h:
        h.create_dataset("train", data=x_train)
        h.create_dataset("train_labels", data=y_train)
        h.create_dataset("test", data=x_test)

In [None]:
%%time
post_pca_feature_file = os.path.join(cwd, 'post_pca_feature', 'post_pca_feature_%s_%s_%s_%s.h5' % (model_name, n_components, image_size, time_str))
print(post_pca_feature_file)
save_2_h5py(post_pca_feature_file, red_x_train, y_train, red_x_test)

In [None]:
print('Done!')