## 2. FeatureExtraction_ORB_Batch

## Run name

In [1]:
import time

# Identify this run: <project>_<step>_<local timestamp>
project_name = 'Google_LandMark_Rec'
step_name = 'FeatureExtraction_ORB_Batch'
# strftime without an explicit time tuple formats the current local time
time_str = time.strftime("%Y%m%d_%H%M%S")
run_name = '_'.join((project_name, step_name, time_str))
print('run_name: ' + run_name)
# Start of the wall-clock measurement reported in the last cell
t0 = time.time()

run_name: Google_LandMark_Rec_FeatureExtraction_ORB_Batch_20180512_043724


## Important params

In [2]:
# Number of images per pickle batch; passed to feature_extraction() in the cells below
batch_size = 50000

## Import packages

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
%matplotlib inline
from IPython.display import display

import os
import sys
import gc
import math
import shutil
import zipfile
import pickle
import h5py
import cv2
from PIL import Image

from tqdm import tqdm
import multiprocessing

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score

# CPU count as reported by multiprocessing.cpu_count(); printed for reference
cpu_amount = multiprocessing.cpu_count()
print('cpu_amount: ', cpu_amount)

  from ._conv import register_converters as _register_converters


cpu_amount:  4


## Project folders

In [4]:
cwd = os.getcwd()

# Working directories that sit directly under the current directory
feature_folder, input_folder, output_folder, model_folder = (
    os.path.join(cwd, sub) for sub in ('feature', 'input', 'output', 'model')
)

# Image directories located inside the input folder
org_train_folder, org_test_folder, train_folder, val_folder, test_folder = (
    os.path.join(input_folder, sub)
    for sub in ('org_train', 'org_test', 'data_train', 'data_val', 'data_test')
)
test_sub_folder = os.path.join(test_folder, 'test')

# CSV paths inside the input folder
# (sample_submission_folder holds a file path despite its name)
train_csv_file, test_csv_file, sample_submission_folder = (
    os.path.join(input_folder, fname)
    for fname in ('train.csv', 'test.csv', 'sample_submission.csv')
)

## Preview csv

In [5]:
# Load the training index CSV, then show its shape and first two rows
train_csv = pd.read_csv(train_csv_file)
print('train_csv.shape is {0}.'.format(train_csv.shape))
display(train_csv.head(2))

# Same preview for the test index CSV
test_csv = pd.read_csv(test_csv_file)
print('test_csv.shape is {0}.'.format(test_csv.shape))
display(test_csv.head(2))

train_csv.shape is (1225029, 3).


Unnamed: 0,id,url,landmark_id
0,cacf8152e2d2ae60,http://static.panoramio.com/photos/original/70...,4676
1,0a58358a2afd3e4e,http://lh6.ggpht.com/-igpT6wu0mIA/ROV8HnUuABI/...,6651


test_csv.shape is (117703, 2).


Unnamed: 0,id,url
0,000088da12d664db,https://lh3.googleusercontent.com/-k45wfamuhT8...
1,0001623c6d808702,https://lh3.googleusercontent.com/-OQ0ywv8KVIA...


In [6]:
train_id = train_csv['id']
train_landmark_id = train_csv['landmark_id']
# The printed value is the number of *distinct* landmark ids
print('len(train_landmark_id) = \t%s' % len(set(train_landmark_id)))

# Lookup table: image id -> landmark id
id_2_landmark_id_dict = dict(zip(train_id, train_landmark_id))
print('len(id_2_landmark_id_dict) = \t%d' % len(id_2_landmark_id_dict))

# Spot-check the first two rows against the lookup table
for index in (0, 1):
    print('id: %s, \tlandmark_id:%s' % (train_id[index], id_2_landmark_id_dict[train_id[index]]))

len(train_landmark_id) = 	14951
len(id_2_landmark_id_dict) = 	1225029
id: cacf8152e2d2ae60, 	landmark_id:4676
id: 0a58358a2afd3e4e, 	landmark_id:6651


## FeatureExtraction_ORB

In [7]:
def image_detect_and_compute(image_file, clf):
    """Detect interest points in an image and compute their descriptors.

    Args:
        image_file: Path of the image, read with ``cv2.imread``.
        clf: Detector object exposing ``detectAndCompute(image, mask)``
            (this notebook passes the result of ``cv2.ORB_create``).

    Returns:
        The descriptors returned by ``clf.detectAndCompute``, or ``None`` when
        the image file cannot be read. The detector's own return value is
        passed through unchanged, so it may also be ``None``.
    """
    img = cv2.imread(image_file)
    if img is None:
        # cv2.imread reports a missing/corrupt/unsupported file by returning
        # None rather than raising; passing that to cvtColor would fail with
        # an opaque assertion error, so bail out explicitly.
        return None
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # Keypoints are not needed by callers; only the descriptors are returned.
    _, des = clf.detectAndCompute(gray, None)
    return des

# ORB detector shared by the extraction functions below
n_features = 500
clf = cv2.ORB_create(nfeatures=n_features)

# Look at a handful of training images to sanity-check the folder contents
org_train_images = os.listdir(org_train_folder)[:10]
print(len(org_train_images))
image_file = os.path.join(org_train_folder, org_train_images[0])
print(image_file)

10
/data1/kaggle/landmark-recognition-challenge/input/org_train/69b846bd58c3f09a.jpg


## Official code

In [8]:
%%time
def dump_pickle_feature_batch(run_name, dataset_name, image_features, batch_num):
    """Pickle one batch of image features into ``feature_folder``.

    The file is named ``feature_<run_name>_<dataset_name>_b<batch_num>.pickle``.

    Args:
        run_name: Run identifier embedded in the file name.
        dataset_name: Dataset label embedded in the file name (e.g. 'train').
        image_features: Mapping to serialize; its length is printed.
        batch_num: Batch index embedded in the file name.
    """
    # Create the target directory on first use instead of failing in open().
    os.makedirs(feature_folder, exist_ok=True)
    image_features_file = os.path.join(feature_folder, 'feature_%s_%s_b%s.pickle' % (run_name, dataset_name, batch_num))
    print('Dump: ', image_features_file, end='  ')
    print(len(image_features))
    # The context manager guarantees the handle is flushed and closed even if
    # pickling fails. The original passed True as the protocol, which pickle
    # interprets as protocol 1; the highest protocol stores bytes-heavy
    # payloads more compactly and pickle.load auto-detects it.
    with open(image_features_file, "wb") as f:
        pickle.dump(image_features, f, protocol=pickle.HIGHEST_PROTOCOL)

def load_pickle_feature_batch(run_name, dataset_name, batch_num):
    """Load one pickled batch of image features from ``feature_folder``.

    Reads ``feature_<run_name>_<dataset_name>_b<batch_num>.pickle``.

    Args:
        run_name: Run identifier embedded in the file name.
        dataset_name: Dataset label embedded in the file name (e.g. 'train').
        batch_num: Batch index embedded in the file name.

    Returns:
        The unpickled object (its length is printed).
    """
    image_features_file = os.path.join(feature_folder, 'feature_%s_%s_b%s.pickle' % (run_name, dataset_name, batch_num))
    # NOTE: pickle.load can execute arbitrary code; only load files produced
    # by a trusted run of this notebook.
    # The context manager closes the handle instead of leaking it.
    with open(image_features_file, "rb") as f:
        image_features = pickle.load(f)
    print('Load: ', image_features_file, end='  ')
    print(len(image_features))
    return image_features

# Usage:
# dump_pickle_feature_batch(run_name, 'train', image_features, 0)
# image_features = load_pickle_feature_batch(run_name, 'train', 0)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 6.68 µs


In [9]:
def feature_extraction(folder, dataset_name, run_name=run_name, batch_size=500):
    """Extract descriptors for every file in ``folder`` and dump them in batches.

    Files are processed in ``os.listdir`` order, ``batch_size`` at a time. Each
    batch is collected in a dict keyed by the file name without its extension
    and written with ``dump_pickle_feature_batch`` under the batch index.
    Images for which no descriptors could be computed are stored as ``None``.

    Args:
        folder: Directory containing the image files.
        dataset_name: Dataset label forwarded to the dump helper (e.g. 'train').
        run_name: Run identifier forwarded to the dump helper.
        batch_size: Maximum number of images per dumped batch.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError('batch_size must be a positive integer, got %r' % (batch_size,))

    image_names = os.listdir(folder)
    amount = len(image_names)
    # Ceiling division: the previous int(amount / batch_size) + 1 produced a
    # trailing empty batch whenever amount was an exact multiple of batch_size
    # (including an empty folder).
    batch_count = (amount + batch_size - 1) // batch_size
    print('amount: %s, batch_count: %s' % (amount, batch_count))

    for i in range(batch_count):
        image_features = {}
        batch_names = image_names[i * batch_size:(i + 1) * batch_size]
        for j, image_name in enumerate(batch_names):
            # splitext handles extensions of any length ('.jpg', '.jpeg', ...)
            image_id = os.path.splitext(image_name)[0]
            image_file = os.path.join(folder, image_name)
            try:
                des = image_detect_and_compute(image_file, clf)
            except cv2.error as err:
                # One unreadable image must not abort the whole run.
                print('Skip unreadable image: %s (%s)' % (image_file, err))
                des = None
            image_features[image_id] = des
            if j < 3:
                # Preview the first few images of every batch
                print(image_name, image_id, end=' ')
                if des is None or len(des) == 0:
                    print('no descriptors')
                else:
                    print(des.shape, end=' ')
                    print(des[0][:10])
            if (j + 1) % 1000 == 0:
                # Lightweight progress counter, one tick per 1000 images
                print((j + 1) // 1000, end=' ')
        dump_pickle_feature_batch(run_name, dataset_name, image_features, i)
        # Release the batch before building the next one to cap memory use
        del image_features
        gc.collect()
        

In [None]:
# Extract and dump descriptors for the original training images
feature_extraction(
    folder=org_train_folder,
    dataset_name='train',
    run_name=run_name,
    batch_size=batch_size,
)

amount: 1217684, batch_count: 25
69b846bd58c3f09a.jpg 69b846bd58c3f09a (500, 32) [128 230 157 251 108 201 109 205  86 119]
19a1de4f08cd0305.jpg 19a1de4f08cd0305 (500, 32) [139 120 170 191  44 191 187 208  30 235]
4ee821754ef5fd83.jpg 4ee821754ef5fd83 (500, 32) [148 167   6 171 154  11 110 105  46  49]
1 2 3 4 5 6 7 8 9 10 

In [None]:
# Reload batch 1 of the training features and preview its first three entries
image_features = load_pickle_feature_batch(run_name, 'train', 1)

print('*' * 80)
print('len_image_features=', len(image_features))
for image_id in list(image_features)[:3]:
    preview_des = image_features[image_id]
    print('image_id: %s,\t landmark_id:%s,\t feature_shape: ' % (image_id, id_2_landmark_id_dict[image_id]), preview_des.shape, end=' ')
    print(preview_des[0][:10])

In [None]:
# Extract and dump descriptors for the original test images
feature_extraction(
    folder=org_test_folder,
    dataset_name='test',
    run_name=run_name,
    batch_size=batch_size,
)

In [None]:
# Reload batch 1 of the test features and preview its first three entries
image_features = load_pickle_feature_batch(run_name, 'test', 1)

print('*' * 80)
print('len_image_features=', len(image_features))
for image_id in list(image_features)[:3]:
    preview_des = image_features[image_id]
    print('image_id: %s,\t feature_shape: %s' % (image_id, preview_des.shape), end=' ')
    print(preview_des[0][:10])

In [None]:
# Seconds elapsed since t0 was recorded in the first cell
elapsed_seconds = time.time() - t0
print('Time cost: %.2f' % elapsed_seconds)