# 2. PCA
Reference:
- http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.IncrementalPCA.html#sklearn.decomposition.IncrementalPCA

## Import packages

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
%matplotlib inline
from IPython.display import display

import os
import gc
import time
import zipfile
import pickle
from PIL import Image
import shutil

from tqdm import tqdm
import multiprocessing

## Run name

In [2]:
# Compose a timestamped identifier for this run: <project>_<step>_<YYYYmmdd_HHMMSS>.
project_name, step_name = 'ic_furniture2018', 'PCA'
time_str = time.strftime("%Y%m%d_%H%M%S", time.localtime())
run_name = '_'.join([project_name, step_name, time_str])
print('run_name: ' + run_name)

run_name: ic_furniture2018_PCA_20180331_153834


## Project folders

In [3]:
# Top-level project directories, all located directly under the current working directory.
cwd = os.getcwd()
(input_folder, output_folder, model_folder,
 feature_folder, post_pca_feature_folder, log_folder) = (
    os.path.join(cwd, sub_dir)
    for sub_dir in ('input', 'output', 'model', 'feature', 'post_pca_feature', 'log')
)
print(
    'input_folder: \t\t\t%s' % input_folder,
    'output_folder: \t\t\t%s' % output_folder,
    'model_folder: \t\t\t%s' % model_folder,
    'feature_folder: \t\t%s' % feature_folder,
    'post_pca_feature_folder: \t%s' % post_pca_feature_folder,
    'log_folder: \t\t\t%s' % log_folder,
    sep='\n',
)

# Image directories under the input folder.
org_train_folder, org_val_folder, org_test_folder = (
    os.path.join(input_folder, sub_dir)
    for sub_dir in ('org_train', 'org_val', 'org_test')
)
train_folder, val_folder, test_folder = (
    os.path.join(input_folder, sub_dir)
    for sub_dir in ('data_train', 'data_val', 'data_test')
)
test_sub_folder = os.path.join(test_folder, 'test')

# The PCA output directory is the only one this cell creates, and only when missing.
if not os.path.exists(post_pca_feature_folder):
    os.mkdir(post_pca_feature_folder)
    print('Create folder: %s' % post_pca_feature_folder)

# Metadata files (JSON and CSV) stored directly in the input folder.
train_json_file, val_json_file, test_json_file = (
    os.path.join(input_folder, file_name)
    for file_name in ('train.json', 'validation.json', 'test.json')
)
print(
    '\ntrain_json_file: \t\t%s' % train_json_file,
    'val_json_file: \t\t\t%s' % val_json_file,
    'test_json_file: \t\t%s' % test_json_file,
    sep='\n',
)

train_csv_file, val_csv_file, test_csv_file = (
    os.path.join(input_folder, file_name)
    for file_name in ('train.csv', 'validation.csv', 'test.csv')
)
print(
    '\ntrain_csv_file: \t\t%s' % train_csv_file,
    'val_csv_file: \t\t\t%s' % val_csv_file,
    'test_csv_file: \t\t\t%s' % test_csv_file,
    sep='\n',
)

sample_submission_csv_file = os.path.join(input_folder, 'sample_submission_randomlabel.csv')
print('\nsample_submission_csv_file: \t%s' % sample_submission_csv_file)

input_folder: 			/data1/kaggle/imaterialist-challenge-furniture-2018/input
output_folder: 			/data1/kaggle/imaterialist-challenge-furniture-2018/output
model_folder: 			/data1/kaggle/imaterialist-challenge-furniture-2018/model
feature_folder: 		/data1/kaggle/imaterialist-challenge-furniture-2018/feature
post_pca_feature_folder: 	/data1/kaggle/imaterialist-challenge-furniture-2018/post_pca_feature
log_folder: 			/data1/kaggle/imaterialist-challenge-furniture-2018/log

train_json_file: 		/data1/kaggle/imaterialist-challenge-furniture-2018/input/train.json
val_json_file: 			/data1/kaggle/imaterialist-challenge-furniture-2018/input/validation.json
test_json_file: 		/data1/kaggle/imaterialist-challenge-furniture-2018/input/test.json

train_csv_file: 		/data1/kaggle/imaterialist-challenge-furniture-2018/input/train.csv
val_csv_file: 			/data1/kaggle/imaterialist-challenge-furniture-2018/input/validation.csv
test_csv_file: 			/data1/kaggle/imaterialist-challenge-furniture-2018/input/test.csv


## Load feature

In [4]:
%%time
import h5py
import numpy as np
# Seed NumPy's global random generator with a fixed value so np.random draws are repeatable.
np.random.seed(2018)


def load_h5(model_name, time_str):
    """Read the stored feature arrays of one model from its HDF5 file.

    The file is expected at ``feature_folder`` under the name
    ``feature_<model_name>_<time_str>.h5``; its full path is printed.

    Args:
        model_name: Model identifier used in the file name.
        time_str: Timestamp string used in the file name.

    Returns:
        Tuple ``(x_train, y_train, x_val, y_val, x_test)`` of NumPy arrays
        read from the datasets 'train', 'train_labels', 'val', 'val_labels'
        and 'test'.
    """
    feature_h5_file = os.path.join(feature_folder, 'feature_%s_%s.h5' % (model_name, time_str))
    print(feature_h5_file)
    dataset_keys = ('train', 'train_labels', 'val', 'val_labels', 'test')
    with h5py.File(feature_h5_file, 'r') as h:
        # np.array(...) materialises each dataset while the file is still open.
        arrays = tuple(np.array(h[key]) for key in dataset_keys)
    return arrays

def save_h5(n_components, model_name, time_str, x_train, y_train, x_val, y_val, x_test):
    """Write the PCA-reduced features and their labels to a fresh HDF5 file.

    The file is created in ``post_pca_feature_folder`` and named
    ``post_pca_feature_<model_name>_<n_components>_<time_str>.h5``. Its path
    is printed, and a file already present at that path is removed first.

    Args:
        n_components: Number of PCA components (only used in the file name).
        model_name: Model identifier used in the file name.
        time_str: Timestamp string used in the file name.
        x_train, x_val, x_test: Feature arrays stored as 'train', 'val', 'test'.
        y_train, y_val: Label arrays stored as 'train_labels', 'val_labels'.
    """
    feature_h5_file = os.path.join(post_pca_feature_folder, 'post_pca_feature_%s_%s_%s.h5' % (model_name, n_components, time_str))
    print(feature_h5_file)
    if os.path.exists(feature_h5_file):
        os.remove(feature_h5_file)
        print('File removed: %s' % feature_h5_file)

    # Open explicitly for writing: when the mode is omitted, h5py 3.x opens
    # read-only (and h5py 2.x warned about the implicit default), so the
    # output file could not be created.
    with h5py.File(feature_h5_file, 'w') as h:
        h.create_dataset("train", data=x_train)
        h.create_dataset("train_labels", data=y_train)
        h.create_dataset("val", data=x_val)
        h.create_dataset("val_labels", data=y_val)
        h.create_dataset("test", data=x_test)

def _iter_fit_batches(n_samples, batch_size, min_batch_size=0):
    """Yield consecutive slices that together cover ``range(n_samples)``.

    Every slice spans ``batch_size`` rows, except that a tail shorter than
    ``min_batch_size`` is merged into the preceding slice instead of being
    emitted on its own.
    """
    start = 0
    while start < n_samples:
        stop = start + batch_size
        if n_samples - stop < min_batch_size:
            stop = n_samples
        yield slice(start, stop)
        start = stop


def reduce_demension(n_components, model_name, time_str):
    """Fit an IncrementalPCA on one model's features and save the reduced features.

    The features are loaded with ``load_h5``. A single PCA is fitted
    incrementally on the train, validation and test features, all three splits
    are projected onto ``n_components`` dimensions, and the result is written
    with ``save_h5``. The reduced shapes and the elapsed time are printed.

    Args:
        n_components: Number of principal components to keep.
        model_name: Model identifier selecting the feature file.
        time_str: Timestamp string selecting the feature file.
    """
    t0 = time.time()
    x_train, y_train, x_val, y_val, x_test = load_h5(model_name, time_str)
    batch_size = 512
    # Keep the default copy=True: with copy=False the estimator is allowed to
    # overwrite its input while fitting, and these same arrays are transformed
    # afterwards.
    ipca = IncrementalPCA(n_components=n_components, batch_size=batch_size)
    # fit() starts from scratch on every call, so calling it once per split
    # would keep only the last split's components. partial_fit() accumulates
    # across calls, so all three splits contribute to one model.
    for features in (x_train, x_val, x_test):
        # Merge a short tail so no batch is smaller than n_components.
        for batch in _iter_fit_batches(features.shape[0], batch_size, min_batch_size=n_components or 0):
            ipca.partial_fit(features[batch])
    red_x_train = ipca.transform(x_train)
    red_x_val = ipca.transform(x_val)
    red_x_test = ipca.transform(x_test)
    print(red_x_train.shape)
    print(red_x_val.shape)
    print(red_x_test.shape)
    save_h5(n_components, model_name, time_str, red_x_train, y_train, red_x_val, y_val, red_x_test)
    t1 = time.time()
    print('Spend time: %.2f s' % (t1 - t0))

CPU times: user 48 ms, sys: 8 ms, total: 56 ms
Wall time: 52.5 ms


  from ._conv import register_converters as _register_converters


## Reduce dimensionality with PCA

In [5]:
%time
from sklearn.decomposition import IncrementalPCA, PCA

t00 = time.time()
n_components = 512

model_names = ['Xception', 'InceptionV3', 'InceptionResNetV2']
# time_strs = ['20180329-164850']
time_strs = ['20180329-164850']

for time_str in time_strs:
    for model_name in model_names:
        reduce_demension(n_components, model_name, time_str)

t01 = time.time()
print('Spend time: %.2f' % (t01 - t00))

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 5.25 µs
/data1/kaggle/imaterialist-challenge-furniture-2018/feature/feature_Xception_20180329-164850.h5


  explained_variance[self.n_components_:].mean()
  ret = ret.dtype.type(ret / rcount)


(191261, 512)
(6301, 512)
(12652, 512)
/data1/kaggle/imaterialist-challenge-furniture-2018/post_pca_feature/post_pca_feature_Xception_512_20180329-164850.h5
File removed: /data1/kaggle/imaterialist-challenge-furniture-2018/post_pca_feature/post_pca_feature_Xception_512_20180329-164850.h5
Spend time: 890.27 s
/data1/kaggle/imaterialist-challenge-furniture-2018/feature/feature_InceptionV3_20180329-164850.h5
(191261, 512)
(6301, 512)
(12652, 512)
/data1/kaggle/imaterialist-challenge-furniture-2018/post_pca_feature/post_pca_feature_InceptionV3_512_20180329-164850.h5
File removed: /data1/kaggle/imaterialist-challenge-furniture-2018/post_pca_feature/post_pca_feature_InceptionV3_512_20180329-164850.h5
Spend time: 496.58 s
/data1/kaggle/imaterialist-challenge-furniture-2018/feature/feature_InceptionResNetV2_20180329-164850.h5
(191261, 512)
(6301, 512)
(12652, 512)
/data1/kaggle/imaterialist-challenge-furniture-2018/post_pca_feature/post_pca_feature_InceptionResNetV2_512_20180329-164850.h5
File