## 2.  FeatureExtraction_SIFT_Parallel

## Run name

In [1]:
import time

# A run is identified as <project>_<step>_<local timestamp>; the same string
# is later used to name the feature folder and the pickle files.
project_name = 'Google_LandMark_Rec'
step_name = 'FeatureExtraction_SIFT_Parallel'
# strftime() formats the current local time when no time tuple is passed.
time_str = time.strftime("%Y%m%d_%H%M%S")
run_name = '_'.join((project_name, step_name, time_str))
print('run_name: ' + run_name)

# Start of the wall-clock measurement reported in the last cell.
t0 = time.time()

run_name: Google_LandMark_Rec_FeatureExtraction_SIFT_Parallel_20180527_100055


## Important parameters

In [2]:
# Number of images handled per batch; it is passed to feature_extraction()
# and each batch is written to its own pickle file.
batch_size = 10_000

## Import packages

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
%matplotlib inline
from IPython.display import display

import os
import sys
import gc
import math
import tqdm
import shutil
import zipfile
import pickle
import h5py
import cv2
from PIL import Image

from tqdm import tqdm
import multiprocessing

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score

# CPU count reported by multiprocessing; feature_extraction() uses it as the
# default number of worker processes in its pool.
cpu_amount = multiprocessing.cpu_count()
print('cpu_amount: ', cpu_amount)

  from ._conv import register_converters as _register_converters


cpu_amount:  36


## Project folders

In [4]:
# Every path below is derived from the directory the notebook is started in.
cwd = os.getcwd()

# Top-level project directories.
feature_folder, input_folder, output_folder, model_folder = (
    os.path.join(cwd, name)
    for name in ('feature', 'input', 'output', 'model')
)

# Image directories inside the input folder.
org_train_folder, org_test_folder, train_folder, val_folder, test_folder = (
    os.path.join(input_folder, name)
    for name in ('org_train', 'org_test', 'data_train', 'data_val', 'data_test')
)
test_sub_folder = os.path.join(test_folder, 'test')

# CSV files inside the input folder. Despite its name,
# sample_submission_folder is the path of a file, not of a directory.
train_csv_file, test_csv_file, sample_submission_folder = (
    os.path.join(input_folder, name)
    for name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

## Preview csv

In [5]:
# Load the training metadata and show its shape plus the first two rows.
train_csv = pd.read_csv(train_csv_file)
print('train_csv.shape is {0}.'.format(train_csv.shape))
display(train_csv.head(2))

# Same preview for the test metadata.
test_csv = pd.read_csv(test_csv_file)
print('test_csv.shape is {0}.'.format(test_csv.shape))
display(test_csv.head(2))

train_csv.shape is (1225029, 3).


Unnamed: 0,id,url,landmark_id
0,cacf8152e2d2ae60,http://static.panoramio.com/photos/original/70...,4676
1,0a58358a2afd3e4e,http://lh6.ggpht.com/-igpT6wu0mIA/ROV8HnUuABI/...,6651


test_csv.shape is (117703, 2).


Unnamed: 0,id,url
0,000088da12d664db,https://lh3.googleusercontent.com/-k45wfamuhT8...
1,0001623c6d808702,https://lh3.googleusercontent.com/-OQ0ywv8KVIA...


In [6]:
train_id = train_csv['id']
train_landmark_id = train_csv['landmark_id']
# The printed number is the count of *distinct* landmark ids.
print('len(train_landmark_id) = \t%s' % len(set(train_landmark_id)))

# Lookup table: image id -> landmark id.
id_2_landmark_id_dict = dict(zip(train_id, train_landmark_id))
print('len(id_2_landmark_id_dict) = \t%d' % len(id_2_landmark_id_dict))

# Spot-check the first two rows against the lookup table.
for index in (0, 1):
    print('id: %s, \tlandmark_id:%s' % (train_id[index], id_2_landmark_id_dict[train_id[index]]))

len(train_landmark_id) = 	14951
len(id_2_landmark_id_dict) = 	1225029
id: cacf8152e2d2ae60, 	landmark_id:4676
id: 0a58358a2afd3e4e, 	landmark_id:6651


## FeatureExtraction

In [7]:
def image_detect_and_compute(image_file, clf):
    """Detect interest points in an image and compute their descriptors.

    The image is read with ``cv2.imread``, converted to grayscale and handed
    to ``clf.detectAndCompute``. Failures are reported on stdout and never
    propagate, so one bad file cannot abort a whole batch.

    Parameters
    ----------
    image_file : str
        Path of the image file to process.
    clf : object
        Feature extractor exposing ``detectAndCompute(image, mask)``; the
        notebook passes the result of ``cv2.xfeatures2d.SIFT_create``.

    Returns
    -------
    The descriptors returned by ``clf.detectAndCompute``, or ``None`` when
    the image cannot be read or processing raises.
    """
    img = cv2.imread(image_file)
    if img is None:
        # cv2.imread signals an unreadable/corrupt file by returning None
        # instead of raising; cvtColor would then fail on the empty input.
        print(image_file, 'could not be read as an image')
        return None

    des = None
    try:
        # The colour conversion is inside the try as well, so an OpenCV error
        # on an odd image is reported like any other per-image failure.
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        _, des = clf.detectAndCompute(gray, None)
    except Exception as ex:
        print(image_file, ex)
    return des

# Value passed as the first argument of SIFT_create (presumably OpenCV's
# `nfeatures` cap on keypoints kept per image -- confirm against the OpenCV docs).
# NOTE: feature_extraction_batch() reads this module-level name as a global.
n_features = 100
# Alternative extractor kept for reference:
# clf = cv2.ORB_create(n_features)
clf = cv2.xfeatures2d.SIFT_create(n_features)

# Take the first ten directory entries of the training image folder and build
# the full path of the first one as a sample input.
org_train_images = os.listdir(org_train_folder)[:10]
print(len(org_train_images))
image_file = os.path.join(org_train_folder, org_train_images[0])
print(image_file)

10
/data1/kaggle/landmark-recognition-challenge/input/org_train/69b846bd58c3f09a.jpg


## Batch feature extraction (parallel)

In [8]:
%%time
def dump_pickle_feature_batch(run_name, dataset_name, batch_num, image_features):
    """Write one batch of image features to a pickle file.

    The file is ``<feature_folder>/<run_name>/feature_<run_name>_<dataset_name>_b<batch_num>.pickle``,
    where ``feature_folder`` is the module-level path defined earlier in the notebook.

    Parameters
    ----------
    run_name : str
        Run identifier; used as sub-folder name and in the file name.
    dataset_name : str
        Dataset label embedded in the file name (the notebook uses 'train' and 'test').
    batch_num : int
        Batch index embedded in the file name.
    image_features : dict
        Mapping to serialise; the notebook stores image id -> descriptors.
    """
    run_name_folder = os.path.join(feature_folder, run_name)
    # makedirs(exist_ok=True) creates missing parent folders and does not fail
    # when several worker processes try to create the same folder at once
    # (the previous exists()/mkdir() pair could raise FileExistsError).
    os.makedirs(run_name_folder, exist_ok=True)
    image_features_file = os.path.join(run_name_folder, 'feature_%s_%s_b%s.pickle' % (run_name, dataset_name, batch_num))
    print('Dump: ', image_features_file, end='  ')
    print(len(image_features.keys()))
    # Protocol 1 matches the value (`True`) this function has always passed,
    # so files stay in the same on-disk format. The `with` block guarantees
    # the file is flushed and closed.
    with open(image_features_file, "wb") as feature_file:
        pickle.dump(image_features, feature_file, 1)

def load_pickle_feature_batch(run_name, dataset_name, batch_num):
    """Load one batch of image features written by ``dump_pickle_feature_batch``.

    Parameters
    ----------
    run_name : str
        Run identifier; selects the sub-folder of ``feature_folder`` and is part of the file name.
    dataset_name : str
        Dataset label embedded in the file name.
    batch_num : int
        Batch index embedded in the file name.

    Returns
    -------
    dict
        The unpickled mapping stored in the batch file.

    Raises
    ------
    OSError
        If the batch file cannot be opened (e.g. it does not exist).
    """
    run_name_folder = os.path.join(feature_folder, run_name)
    image_features_file = os.path.join(run_name_folder, 'feature_%s_%s_b%s.pickle' % (run_name, dataset_name, batch_num))
    # NOTE(security): pickle.load can execute arbitrary code; only load
    # feature files produced by this notebook.
    with open(image_features_file, "rb") as feature_file:
        image_features = pickle.load(feature_file)
    print('Load: ', image_features_file, end='  ')
    print(len(image_features.keys()))
    return image_features

# dump_pickle_feature(run_name, image_features)
# image_features = load_pickle_feature(run_name)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 22.4 µs


In [9]:
def feature_extraction_batch(batch_num, folder, run_name, dataset_name, batch_size):
    """Extract descriptors for one batch of images and dump them to disk.

    The batch is the slice ``[batch_num*batch_size, (batch_num+1)*batch_size)``
    of the sorted file names in ``folder``. Images for which
    ``image_detect_and_compute`` returns ``None`` are left out of the result.

    Reads the module-level ``n_features`` when creating the SIFT extractor.

    Parameters
    ----------
    batch_num : int
        Zero-based index of the batch to process.
    folder : str
        Directory containing the image files.
    run_name : str
        Run identifier forwarded to ``dump_pickle_feature_batch``.
    dataset_name : str
        Dataset label forwarded to ``dump_pickle_feature_batch``.
    batch_size : int
        Number of file names per batch.
    """
    print('batch=%s %s %s %s %s' % (batch_num, folder, run_name, dataset_name, batch_size))
    # os.listdir() returns entries in arbitrary order. Every worker lists the
    # folder on its own, so sort to make all workers slice the same sequence
    # and keep the batches disjoint and complete.
    image_names = sorted(os.listdir(folder))
    img_names_batch = image_names[batch_num*batch_size: (batch_num+1)*batch_size]
    print(len(img_names_batch), end=' --> ')

    # The extractor is created inside the worker process rather than shared.
    clf = cv2.xfeatures2d.SIFT_create(n_features)

    image_features = {}
    for image_name in img_names_batch:
        # splitext drops the extension whatever its length ('.jpg', '.jpeg', ...).
        image_id = os.path.splitext(image_name)[0]
        image_file = os.path.join(folder, image_name)
        des = image_detect_and_compute(image_file, clf)
        if des is not None:
            image_features[image_id] = des
    print(len(image_features.keys()), end='  ')
    dump_pickle_feature_batch(run_name, dataset_name, batch_num, image_features)
    print('')
    # Release the batch explicitly before the worker picks up the next one.
    del clf
    del image_features
    gc.collect()

In [10]:
from functools import partial

def feature_extraction(folder, run_name, dataset_name, batch_size=500, cpu_amount=cpu_amount):
    """Run ``feature_extraction_batch`` over every batch of a folder in parallel.

    The folder content is split into ``ceil(n_files / batch_size)`` batches;
    batch indices are distributed over a ``multiprocessing.Pool`` and a tqdm
    bar advances once per finished batch.

    Parameters
    ----------
    folder : str
        Directory containing the image files.
    run_name : str
        Run identifier forwarded to each batch.
    dataset_name : str
        Dataset label forwarded to each batch.
    batch_size : int, optional
        Number of files per batch (default 500). Must be positive.
    cpu_amount : int, optional
        Number of worker processes; defaults to the module-level ``cpu_amount``.

    Raises
    ------
    ValueError
        If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError('batch_size must be a positive integer, got %r' % (batch_size,))

    amount = len(os.listdir(folder))
    batch_count = math.ceil(amount / batch_size)
    print('amount: %s, batch_count: %s' % (amount, batch_count))
    batch_nums = list(range(batch_count))
    print(batch_nums)

    # Bind everything except the batch index, which the pool supplies.
    function_partial = partial(
        feature_extraction_batch,
        folder=folder,
        dataset_name=dataset_name,
        run_name=run_name,
        batch_size=batch_size
    )
    # The context manager terminates the worker processes on exit, including
    # when a batch raises, so no workers are left behind after a failure.
    with multiprocessing.Pool(processes=cpu_amount) as pool:
        # imap_unordered yields as batches finish; consuming it drives the bar.
        for _ in tqdm(pool.imap_unordered(function_partial, batch_nums), total=len(batch_nums)):
            pass

In [None]:
# Extract features for all training images, batch_size images per pickle file.
feature_extraction(org_train_folder, run_name, 'train', batch_size)
# Smaller batch size kept for quick trial runs:
# feature_extraction(org_train_folder, run_name, 'train', 150)

amount: 1217684, batch_count: 122
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121]
batch=0 /data1/kaggle/landmark-recognition-challenge/input/org_train Google_LandMark_Rec_FeatureExtraction_SIFT_Parallel_20180523_023917 train 10000


  0%|          | 0/122 [00:00<?, ?it/s]

batch=17 /data1/kaggle/landmark-recognition-challenge/input/org_train Google_LandMark_Rec_FeatureExtraction_SIFT_Parallel_20180523_023917 train 10000
batch=5 /data1/kaggle/landmark-recognition-challenge/input/org_train Google_LandMark_Rec_FeatureExtraction_SIFT_Parallel_20180523_023917 train 10000
batch=15 /data1/kaggle/landmark-recognition-challenge/input/org_train Google_LandMark_Rec_FeatureExtraction_SIFT_Parallel_20180523_023917 train 10000
batch=8 /data1/kaggle/landmark-recognition-challenge/input/org_train Google_LandMark_Rec_FeatureExtraction_SIFT_Parallel_20180523_023917 train 10000
batch=14 /data1/kaggle/landmark-recognition-challenge/input/org_train Google_LandMark_Rec_FeatureExtraction_SIFT_Parallel_20180523_023917 train 10000
batch=16 /data1/kaggle/landmark-recognition-challenge/input/org_train Google_LandMark_Rec_FeatureExtraction_SIFT_Parallel_20180523_023917 train 10000
batch=4 /data1/kaggle/landmark-recognition-challenge/input/org_train Google_LandMark_Rec_FeatureExtrac

batch=56 /data1/kaggle/landmark-recognition-challenge/input/org_train Google_LandMark_Rec_FeatureExtraction_SIFT_Parallel_20180523_023917 train 10000
batch=57 /data1/kaggle/landmark-recognition-challenge/input/org_train Google_LandMark_Rec_FeatureExtraction_SIFT_Parallel_20180523_023917 train 10000
batch=58 /data1/kaggle/landmark-recognition-challenge/input/org_train Google_LandMark_Rec_FeatureExtraction_SIFT_Parallel_20180523_023917 train 10000
batch=59 /data1/kaggle/landmark-recognition-challenge/input/org_train Google_LandMark_Rec_FeatureExtraction_SIFT_Parallel_20180523_023917 train 10000
batch=60 /data1/kaggle/landmark-recognition-challenge/input/org_train Google_LandMark_Rec_FeatureExtraction_SIFT_Parallel_20180523_023917 train 10000
batch=61 /data1/kaggle/landmark-recognition-challenge/input/org_train Google_LandMark_Rec_FeatureExtraction_SIFT_Parallel_20180523_023917 train 10000
batch=62 /data1/kaggle/landmark-recognition-challenge/input/org_train Google_LandMark_Rec_FeatureExt

batch=107 /data1/kaggle/landmark-recognition-challenge/input/org_train Google_LandMark_Rec_FeatureExtraction_SIFT_Parallel_20180523_023917 train 10000
batch=108 /data1/kaggle/landmark-recognition-challenge/input/org_train Google_LandMark_Rec_FeatureExtraction_SIFT_Parallel_20180523_023917 train 10000
batch=109 /data1/kaggle/landmark-recognition-challenge/input/org_train Google_LandMark_Rec_FeatureExtraction_SIFT_Parallel_20180523_023917 train 10000
batch=110 /data1/kaggle/landmark-recognition-challenge/input/org_train Google_LandMark_Rec_FeatureExtraction_SIFT_Parallel_20180523_023917 train 10000
batch=111 /data1/kaggle/landmark-recognition-challenge/input/org_train Google_LandMark_Rec_FeatureExtraction_SIFT_Parallel_20180523_023917 train 10000
batch=112 /data1/kaggle/landmark-recognition-challenge/input/org_train Google_LandMark_Rec_FeatureExtraction_SIFT_Parallel_20180523_023917 train 10000
10000 --> /data1/kaggle/landmark-recognition-challenge/input/org_train/f106488d9205e994.jpg Op

  1%|          | 1/122 [3:09:46<382:41:59, 11386.11s/it]

10000 --> 10000  Dump:  /data1/kaggle/landmark-recognition-challenge/feature/Google_LandMark_Rec_FeatureExtraction_SIFT_Parallel_20180523_023917/feature_Google_LandMark_Rec_FeatureExtraction_SIFT_Parallel_20180523_023917_train_b58.pickle  10000



  2%|▏         | 2/122 [3:10:13<190:13:28, 5706.74s/it] 

10000 --> 10000  Dump:  /data1/kaggle/landmark-recognition-challenge/feature/Google_LandMark_Rec_FeatureExtraction_SIFT_Parallel_20180523_023917/feature_Google_LandMark_Rec_FeatureExtraction_SIFT_Parallel_20180523_023917_train_b77.pickle  10000



  2%|▏         | 3/122 [3:15:07<128:59:45, 3902.40s/it]

10000 --> 10000  Dump:  /data1/kaggle/landmark-recognition-challenge/feature/Google_LandMark_Rec_FeatureExtraction_SIFT_Parallel_20180523_023917/feature_Google_LandMark_Rec_FeatureExtraction_SIFT_Parallel_20180523_023917_train_b40.pickle  10000



  3%|▎         | 4/122 [3:15:17<96:00:51, 2929.25s/it] 

10000 --> 10000  Dump:  /data1/kaggle/landmark-recognition-challenge/feature/Google_LandMark_Rec_FeatureExtraction_SIFT_Parallel_20180523_023917/feature_Google_LandMark_Rec_FeatureExtraction_SIFT_Parallel_20180523_023917_train_b39.pickle  10000



  4%|▍         | 5/122 [3:16:47<76:44:56, 2361.51s/it]

10000 --> 10000  Dump:  /data1/kaggle/landmark-recognition-challenge/feature/Google_LandMark_Rec_FeatureExtraction_SIFT_Parallel_20180523_023917/feature_Google_LandMark_Rec_FeatureExtraction_SIFT_Parallel_20180523_023917_train_b87.pickle  10000



  5%|▍         | 6/122 [3:17:28<63:37:58, 1974.81s/it]

9999  Dump:  /data1/kaggle/landmark-recognition-challenge/feature/Google_LandMark_Rec_FeatureExtraction_SIFT_Parallel_20180523_023917/feature_Google_LandMark_Rec_FeatureExtraction_SIFT_Parallel_20180523_023917_train_b47.pickle  9999
10000 --> 10000  Dump:  /data1/kaggle/landmark-recognition-challenge/feature/Google_LandMark_Rec_FeatureExtraction_SIFT_Parallel_20180523_023917/feature_Google_LandMark_Rec_FeatureExtraction_SIFT_Parallel_20180523_023917_train_b92.pickle  10000



  6%|▌         | 7/122 [3:18:44<54:24:58, 1703.46s/it]




  7%|▋         | 8/122 [3:18:47<47:12:42, 1490.90s/it]

9998  Dump:  /data1/kaggle/landmark-recognition-challenge/feature/Google_LandMark_Rec_FeatureExtraction_SIFT_Parallel_20180523_023917/feature_Google_LandMark_Rec_FeatureExtraction_SIFT_Parallel_20180523_023917_train_b106.pickle  9998



  7%|▋         | 9/122 [3:19:56<41:50:21, 1332.93s/it]

10000 --> 9999  Dump:  /data1/kaggle/landmark-recognition-challenge/feature/Google_LandMark_Rec_FeatureExtraction_SIFT_Parallel_20180523_023917/feature_Google_LandMark_Rec_FeatureExtraction_SIFT_Parallel_20180523_023917_train_b53.pickle  9999



  8%|▊         | 10/122 [3:20:41<37:27:45, 1204.15s/it]

10000 --> 10000  Dump:  /data1/kaggle/landmark-recognition-challenge/feature/Google_LandMark_Rec_FeatureExtraction_SIFT_Parallel_20180523_023917/feature_Google_LandMark_Rec_FeatureExtraction_SIFT_Parallel_20180523_023917_train_b63.pickle  10000



  9%|▉         | 11/122 [3:21:17<33:51:14, 1097.96s/it]

9998  Dump:  /data1/kaggle/landmark-recognition-challenge/feature/Google_LandMark_Rec_FeatureExtraction_SIFT_Parallel_20180523_023917/feature_Google_LandMark_Rec_FeatureExtraction_SIFT_Parallel_20180523_023917_train_b116.pickle  9998
10000 --> 10000  Dump:  /data1/kaggle/landmark-recognition-challenge/feature/Google_LandMark_Rec_FeatureExtraction_SIFT_Parallel_20180523_023917/feature_Google_LandMark_Rec_FeatureExtraction_SIFT_Parallel_20180523_023917_train_b117.pickle  10000



 10%|▉         | 12/122 [3:21:24<30:46:12, 1007.02s/it]




 11%|█         | 13/122 [3:21:26<28:09:03, 929.76s/it] 

10000 --> 10000  Dump:  /data1/kaggle/landmark-recognition-challenge/feature/Google_LandMark_Rec_FeatureExtraction_SIFT_Parallel_20180523_023917/feature_Google_LandMark_Rec_FeatureExtraction_SIFT_Parallel_20180523_023917_train_b115.pickle  10000



 11%|█▏        | 14/122 [3:21:46<25:56:36, 864.79s/it]

10000 --> 9999  Dump:  /data1/kaggle/landmark-recognition-challenge/feature/Google_LandMark_Rec_FeatureExtraction_SIFT_Parallel_20180523_023917/feature_Google_LandMark_Rec_FeatureExtraction_SIFT_Parallel_20180523_023917_train_b109.pickle  9999



 12%|█▏        | 15/122 [3:25:44<24:27:37, 822.97s/it]

10000 --> 10000  Dump:  /data1/kaggle/landmark-recognition-challenge/feature/Google_LandMark_Rec_FeatureExtraction_SIFT_Parallel_20180523_023917/feature_Google_LandMark_Rec_FeatureExtraction_SIFT_Parallel_20180523_023917_train_b120.pickle  10000



 13%|█▎        | 16/122 [3:26:29<22:47:58, 774.32s/it]

10000 --> 10000  Dump:  /data1/kaggle/landmark-recognition-challenge/feature/Google_LandMark_Rec_FeatureExtraction_SIFT_Parallel_20180523_023917/feature_Google_LandMark_Rec_FeatureExtraction_SIFT_Parallel_20180523_023917_train_b113.pickle  10000



 14%|█▍        | 17/122 [3:26:35<21:16:02, 729.17s/it]

10000 --> 10000  Dump:  /data1/kaggle/landmark-recognition-challenge/feature/Google_LandMark_Rec_FeatureExtraction_SIFT_Parallel_20180523_023917/feature_Google_LandMark_Rec_FeatureExtraction_SIFT_Parallel_20180523_023917_train_b110.pickle  10000



 15%|█▍        | 18/122 [3:27:12<19:57:11, 690.69s/it]

In [None]:
# Sanity check: reload training batch 1 and show the first few entries.
image_features = load_pickle_feature_batch(run_name, 'train', 1)

print('*' * 80)
print('len_image_features=', len(image_features))
for i, image_id in enumerate(list(image_features)[:3]):
    print(
        'image_id: %s,\t landmark_id:%s,\t feature_shape: ' % (image_id, id_2_landmark_id_dict[image_id]),
        image_features[image_id].shape,
        end=' ',
    )
    # First ten values of the first descriptor row.
    print(image_features[image_id][0][:10])

In [None]:
# Extract features for all test images, batch_size images per pickle file.
feature_extraction(org_test_folder, run_name, 'test', batch_size)
# Smaller batch size kept for quick trial runs:
# feature_extraction(org_test_folder, run_name, 'test', 150)

In [None]:
# Sanity check: reload test batch 1 and show the first few entries
# (the test set has no landmark ids, so only the shape is printed).
image_features = load_pickle_feature_batch(run_name, 'test', 1)

print('*' * 80)
print('len_image_features=', len(image_features))
for i, image_id in enumerate(list(image_features)[:3]):
    print(
        'image_id: %s,\t feature_shape: %s' % (image_id, image_features[image_id].shape),
        end=' ',
    )
    # First ten values of the first descriptor row.
    print(image_features[image_id][0][:10])

In [None]:
# Wall-clock seconds elapsed since t0 was recorded in the first cell.
print('Time cost: %.2f s' % (time.time() - t0))