## Demo_faiss_ORB_GPU

Reference:
- [const FLANN_INDEX_HIERARCHICAL](https://docs.opencv.org/3.4/dc/d8c/namespacecvflann.html)

## Run name

In [30]:
import time

project_name = 'Google_LandMark_Rec'
step_name = 'Demo_faiss_ORB_GPU'
time_str = time.strftime("%Y%m%d_%H%M%S", time.localtime())
run_name = project_name + '_' + step_name + '_' + time_str
print('run_name: ' + run_name)
t0 = time.time()

run_name: Google_LandMark_Rec_Demo_faiss_ORB_GPU_20180528_101259


## Important params

In [2]:
feature_run_name = 'Google_LandMark_Rec_FeatureExtraction_ORB_Parallel_20180519_141925'
# feature_run_name = 'Google_LandMark_Rec_FeatureExtraction_ORB_Parallel_20180520_125411'
print(feature_run_name)

Google_LandMark_Rec_FeatureExtraction_ORB_Parallel_20180519_141925


## Import PKGs

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
%matplotlib inline
from IPython.display import display

import os
import sys
import gc
import math
import shutil
import zipfile
import pickle
import h5py
import cv2
from PIL import Image

from tqdm import tqdm
import multiprocessing

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score

cpu_amount = multiprocessing.cpu_count()
print('cpu_amount: ', cpu_amount)

  from ._conv import register_converters as _register_converters


cpu_amount:  4


## Project folders

In [4]:
cwd = os.getcwd()
feature_folder = os.path.join(cwd, 'feature')
input_folder = os.path.join(cwd, 'input')
output_folder = os.path.join(cwd, 'output')
model_folder = os.path.join(cwd, 'model')

org_train_folder = os.path.join(input_folder, 'org_train')
org_test_folder = os.path.join(input_folder, 'org_test')
train_folder = os.path.join(input_folder, 'data_train')
val_folder = os.path.join(input_folder, 'data_val')
test_folder = os.path.join(input_folder, 'data_test')
test_sub_folder = os.path.join(test_folder, 'test')

train_csv_file = os.path.join(input_folder, 'train.csv')
test_csv_file = os.path.join(input_folder, 'test.csv')
sample_submission_folder = os.path.join(input_folder, 'sample_submission.csv')

## Preview csv

In [5]:
train_csv = pd.read_csv(train_csv_file)
print('train_csv.shape is {0}.'.format(train_csv.shape))
display(train_csv.head(2))

test_csv = pd.read_csv(test_csv_file)
print('test_csv.shape is {0}.'.format(test_csv.shape))
display(test_csv.head(2))

train_csv.shape is (1225029, 3).


Unnamed: 0,id,url,landmark_id
0,cacf8152e2d2ae60,http://static.panoramio.com/photos/original/70...,4676
1,0a58358a2afd3e4e,http://lh6.ggpht.com/-igpT6wu0mIA/ROV8HnUuABI/...,6651


test_csv.shape is (117703, 2).


Unnamed: 0,id,url
0,000088da12d664db,https://lh3.googleusercontent.com/-k45wfamuhT8...
1,0001623c6d808702,https://lh3.googleusercontent.com/-OQ0ywv8KVIA...


In [6]:
train_id = train_csv['id']
train_landmark_id = train_csv['landmark_id']
print('len(train_landmark_id) = \t%s' % len(list(set(train_landmark_id))))

id_2_landmark_id_dict = dict(zip(train_id, train_landmark_id))
print('len(id_2_landmark_id_dict) = \t%d' % len(id_2_landmark_id_dict))

index = 0
print('id: %s, \tlandmark_id:%s' % (train_id[index], id_2_landmark_id_dict[train_id[index]]))
index = 1
print('id: %s, \tlandmark_id:%s' % (train_id[index], id_2_landmark_id_dict[train_id[index]]))

len(train_landmark_id) = 	14951
len(id_2_landmark_id_dict) = 	1225029
id: cacf8152e2d2ae60, 	landmark_id:4676
id: 0a58358a2afd3e4e, 	landmark_id:6651


## Load feature

In [7]:
%%time
def dump_pickle_feature_batch(run_name, dataset_name, batch_num, image_features):
    run_name_folder = os.path.join(feature_folder, run_name)
    if not os.path.exists(run_name_folder):
        os.mkdir(run_name_folder)
    image_features_file = os.path.join(run_name_folder, 'feature_%s_%s_b%s.pickle' % (run_name, dataset_name, batch_num))
    print('Dump: ', image_features_file, end='  ')
    print(len(image_features.keys()))
    pickle.dump(image_features, open(image_features_file, "wb"), True)

def load_pickle_feature_batch(run_name, dataset_name, batch_num):
    run_name_folder = os.path.join(feature_folder, run_name)
    image_features_file = os.path.join(run_name_folder, 'feature_%s_%s_b%s.pickle' % (run_name, dataset_name, batch_num))
    image_features = pickle.load(open(image_features_file, "rb"))
    print('Load: ', image_features_file, end='  ')
    print(len(image_features.keys()))
    return image_features

# dump_pickle_feature_batch(run_name, image_features)
image_features = load_pickle_feature_batch(feature_run_name, 'train', 0)

Load:  /data1/kaggle/landmark-recognition-challenge/feature/Google_LandMark_Rec_FeatureExtraction_ORB_Parallel_20180519_141925/feature_Google_LandMark_Rec_FeatureExtraction_ORB_Parallel_20180519_141925_train_b0.pickle  19991
CPU times: user 2.18 s, sys: 440 ms, total: 2.62 s
Wall time: 2.63 s


In [8]:
print('*'*80)
print('len_image_features=', len(image_features.keys()))
for i, image_id in enumerate(list(image_features.keys())[:3]):
    print('image_id: %s, \tlandmark_id:%s,\t feature_shape: ' % (image_id, id_2_landmark_id_dict[image_id]), image_features[image_id].shape)


********************************************************************************
len_image_features= 19991
image_id: 69b846bd58c3f09a, 	landmark_id:6051,	 feature_shape:  (500, 32)
image_id: 19a1de4f08cd0305, 	landmark_id:9179,	 feature_shape:  (500, 32)
image_id: 4ee821754ef5fd83, 	landmark_id:11301,	 feature_shape:  (500, 32)


In [9]:
def load_pickle_file_batch(image_features_file):
    image_features = pickle.load(open(image_features_file, "rb"))
    print('Load: ', image_features_file, end='  ')
    print(len(image_features.keys()))
    return image_features

In [10]:
def load_pickle_feature_all(feature_run_name, dataset_name):
    file_names = os.listdir(os.path.join(feature_folder, feature_run_name))
#     file_names = list(filter(lambda x: dataset_name in x, file_names))[:3]
    file_names = list(filter(lambda x: dataset_name in x, file_names))
    
    all_image_fatures = {}
    for file_name in file_names:
        image_features_file = os.path.join(feature_folder, feature_run_name, file_name)
        image_features = load_pickle_file_batch(image_features_file)
        all_image_fatures.update(image_features)
    return all_image_fatures

In [11]:
def load_pickle_feature_all_parallel(feature_run_name, dataset_name):
    file_names = os.listdir(os.path.join(feature_folder, feature_run_name))
#     file_names = list(filter(lambda x: dataset_name in x, file_names))[:3]
    file_names = list(filter(lambda x: dataset_name in x, file_names))
    file_names.sort()
    file_names = file_names[:5]
    all_image_fatures = {}
#     for file_name in file_names:
#         image_features_file = os.path.join(feature_folder, feature_run_name, file_name)
#         image_features = load_pickle_file_batch(image_features_file)
#         all_image_fatures.update(image_features)
#     return all_image_fatures

    image_features_files = [os.path.join(feature_folder, feature_run_name, file_name) for file_name in file_names]
    pool = multiprocessing.Pool(processes=cpu_amount)
    for image_features in tqdm(pool.imap(load_pickle_file_batch, image_features_files), total=len(image_features_files)):
        all_image_fatures.update(image_features)
    pool.close()
    pool.terminate()
    return all_image_fatures

In [12]:
%%time
all_image_fatures_train = load_pickle_feature_all_parallel(feature_run_name, 'train')

  0%|          | 0/5 [00:00<?, ?it/s]

Load:  /data1/kaggle/landmark-recognition-challenge/feature/Google_LandMark_Rec_FeatureExtraction_ORB_Parallel_20180519_141925/feature_Google_LandMark_Rec_FeatureExtraction_ORB_Parallel_20180519_141925_train_b11.pickle  19988
Load:  /data1/kaggle/landmark-recognition-challenge/feature/Google_LandMark_Rec_FeatureExtraction_ORB_Parallel_20180519_141925/feature_Google_LandMark_Rec_FeatureExtraction_ORB_Parallel_20180519_141925_train_b10.pickle  19991
Load:  /data1/kaggle/landmark-recognition-challenge/feature/Google_LandMark_Rec_FeatureExtraction_ORB_Parallel_20180519_141925/feature_Google_LandMark_Rec_FeatureExtraction_ORB_Parallel_20180519_141925_train_b1.pickle  19990
Load:  /data1/kaggle/landmark-recognition-challenge/feature/Google_LandMark_Rec_FeatureExtraction_ORB_Parallel_20180519_141925/feature_Google_LandMark_Rec_FeatureExtraction_ORB_Parallel_20180519_141925_train_b0.pickle  19991
Load:  /data1/kaggle/landmark-recognition-challenge/feature/Google_LandMark_Rec_FeatureExtraction_

100%|██████████| 5/5 [00:09<00:00,  1.83s/it]


CPU times: user 1.43 s, sys: 2.01 s, total: 3.44 s
Wall time: 9.51 s


In [13]:
print('*'*80)
print('len_image_features=', len(all_image_fatures_train.keys()))
for i, image_id in enumerate(list(all_image_fatures_train.keys())[:3]):
    print('image_id: %s, \tlandmark_id:%s,\t feature_shape: ' % (image_id, id_2_landmark_id_dict[image_id]), all_image_fatures_train[image_id].shape)


********************************************************************************
len_image_features= 99953
image_id: 69b846bd58c3f09a, 	landmark_id:6051,	 feature_shape:  (500, 32)
image_id: 19a1de4f08cd0305, 	landmark_id:9179,	 feature_shape:  (500, 32)
image_id: 4ee821754ef5fd83, 	landmark_id:11301,	 feature_shape:  (500, 32)


In [14]:
key_train, key_val = train_test_split(list(all_image_fatures_train.keys()), test_size=0.5, random_state=2017)
print('len(key_train)=', len(key_train))
print('len(key_val)=', len(key_val))

len(key_train)= 49976
len(key_val)= 49977


## faiss Match

In [24]:
# import faiss
# dim = 32
# indx = faiss.IndexFlatL2(dim)
# print(indx.is_trained)

In [16]:
# %%time

# labels = []
# count = 0
# for image_id in tqdm(key_train, total=len(key_train)):
#     feature = all_image_fatures_train[image_id].astype('float32')
#     landmark_id = id_2_landmark_id_dict[image_id]
# #     print('image_id: %s, \tlandmark_id:%s,\t feature_shape: ' % (image_id, landmark_id), feature.shape)
# #     print(feature[:100].shape)
#     indx.add(feature[:100])
#     labels.append(np.ones((feature.shape[0], 1)) * landmark_id)
#     del all_image_fatures_train[image_id]

#     count += 1
#     if count % 20000 == 0:
# #         print(count)
#         gc.collect()

In [18]:
# labels = np.concatenate(labels, axis=0)
# print(labels.shape)
# print(indx.ntotal)

In [19]:
# %%time
# k = 10
# feats = all_image_fatures_train[key_val[0]].astype('float32')
# D, I = indx.search(feats[:100], k)
# print(D.shape)
# print(I.shape)

In [20]:
# minlength = 50000

# y = np.array(list(map(lambda n: n if n <= minlength else 0, D.reshape(-1)))).astype('int32')
# y = np.bincount(y)
# print(y)
# print(ii)

In [25]:
import faiss
dim = 32

res = faiss.StandardGpuResources()  # use a single GPU
# build a flat (CPU) index
index_flat = faiss.IndexFlatL2(dim)
# make it into a gpu index
gpu_index_flat = faiss.index_cpu_to_gpu(res, 0, index_flat)

In [None]:
# gpu_index_flat.add(xb)         # add vectors to the index
# print(gpu_index_flat.ntotal)

# k = 4                          # we want to see 4 nearest neighbors
# D, I = gpu_index_flat.search(xq, k)  # actual search
# print(I[:5])                   # neighbors of the 5 first queries
# print(I[-5:])                  # neighbors of the 5 last queries

In [26]:
%%time

labels = []
count = 0
for image_id in tqdm(key_train, total=len(key_train)):
    feature = all_image_fatures_train[image_id].astype('float32')
    landmark_id = id_2_landmark_id_dict[image_id]
#     print('image_id: %s, \tlandmark_id:%s,\t feature_shape: ' % (image_id, landmark_id), feature.shape)
#     print(feature[:100].shape)
    gpu_index_flat.add(feature[:100])
    labels.append(np.ones((feature.shape[0], 1)) * landmark_id)
    del all_image_fatures_train[image_id]

    count += 1
    if count % 20000 == 0:
#         print(count)
        gc.collect()

100%|██████████| 49976/49976 [30:22<00:00, 27.42it/s] 

CPU times: user 5min 27s, sys: 25min 1s, total: 30min 28s
Wall time: 30min 22s





In [27]:
labels = np.concatenate(labels, axis=0)
print(labels.shape)
print(indx.ntotal)

(24636101, 1)
0


In [29]:
%%time
k = 10
feats = all_image_fatures_train[key_val[0]].astype('float32')
D, I = indx.search(feats[:100], k)
print(D.shape)
print(I.shape)

(100, 10)
(100, 10)
CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 540 µs
