In [1]:
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.models import Model
import tensorflow as tf
import numpy as np

from pathlib import Path
from PIL import Image
import pickle
import pandas as pd
import os
import time
import ray
import psutil



In [70]:
# tf.debugging.set_log_device_placement(True)

# Feature extractor

In [48]:
class FeatureExtractor:
    """Turn an image into the L2-normalised `fc1` activation of an ImageNet-trained VGG16."""

    def __init__(self):
        # Load the complete VGG16 and re-wire it so that the `fc1` layer is the model output.
        base_model = VGG16(weights='imagenet')
        self.model = Model(inputs=base_model.input, outputs=base_model.get_layer('fc1').output)

    def extract(self, img):
        """
        Extract a deep feature from an input image

        Args:
            img: from PIL.Image.open(path) or tensorflow.keras.preprocessing.image.load_img(path)

        Returns:
            feature (np.ndarray): deep feature with the shape=(4096, ), scaled to unit L2 norm.
                An all-zero activation is returned unscaled, because dividing it by its
                zero norm would fill the vector with NaNs.
        """
        img = img.resize((224, 224))  # VGG must take a 224x224 img as an input
        img = img.convert('RGB')  # Make sure img is color
        x = image.img_to_array(img)  # To np.array. Height x Width x Channel. dtype=float32
        x = np.expand_dims(x, axis=0)  # (H, W, C)->(1, H, W, C), where the first elem is the number of img
        x = preprocess_input(x)  # VGG16-specific input preprocessing from keras.applications.vgg16
        feature = self.model.predict(x)[0]  # (1, 4096) -> (4096, )
        norm = np.linalg.norm(feature)
        if norm == 0:
            # Nothing to normalise; avoid the 0/0 division.
            return feature
        return feature / norm  # Normalize
# Extractor instance used by the cells that call `fe.extract(...)` directly in this kernel.
fe = FeatureExtractor()        

# Feature extraction

In [16]:
### Collect every .jpg file found under the data directory
MAX_IMAGES = 50000  # upper bound on the number of images handled in one run

img_paths = []
# for path, dirs, files in os.walk("../../data/sa+p/train"):
for path, dirs, files in os.walk("../../data"):
    # os.walk yields entries in an arbitrary order; sorting the directory list in place
    # (and the file names below) makes the capped subset identical on every run.
    dirs.sort()
    for filename in sorted(files):
        ext = os.path.splitext(filename)[-1]
        if ext.lower() == '.jpg':  # also accept upper-case extensions such as ".JPG"
            img_paths.append(os.path.join(path, filename))
img_paths = img_paths[:MAX_IMAGES]   ### keep at most MAX_IMAGES images

In [17]:
# Sanity check: how many image paths were collected.
len(img_paths)

50000

In [49]:
# Number of physical cores. psutil may return None when it cannot determine it,
# so fall back to the logical count and finally to a single worker.
num_cpus = psutil.cpu_count(logical=False) or psutil.cpu_count() or 1

# Re-running this cell while Ray is still up makes ray.init() fail, so stop the
# running instance explicitly instead of hiding every possible error behind a bare `except:`.
if ray.is_initialized():
    ray.shutdown()
# ray.init(num_cpus=num_cpus)
ray.init(num_cpus=num_cpus, num_gpus=2)

2021-07-14 13:50:56,538	INFO services.py:1272 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


In [50]:
@ray.remote
def getFeatures(img):
    """
    Extract the features of a chunk of images inside a Ray task.

    Args:
        img: sequence of image file paths; each one is opened with PIL.Image.open.

    Returns:
        list of (path, feature) tuples in the same order as `img`.
    """
    # NOTE(review): this task does not request a GPU (`@ray.remote` has no `num_gpus`)
    # and the recorded worker log shows a failed cuInit call — confirm whether the
    # workers are expected to run on GPU.
    fe = FeatureExtractor()  # every task builds its own extractor
    rst = []
    for ip in img:
        # The context manager closes the image file as soon as its feature is computed.
        with Image.open(ip) as im:
            rst.append((ip, fe.extract(im)))
    return rst

In [51]:
def parallelize_job(img_paths, func, num_cpus):
    """
    Split `img_paths` into `num_cpus` contiguous chunks and process them as parallel Ray tasks.

    Args:
        img_paths: sequence of image paths (must support len() and slicing).
        func: Ray remote function; it is started as func.remote(chunk).
        num_cpus: number of chunks, i.e. number of tasks that are launched.

    Returns:
        list with `num_cpus` entries; entry i is the result of `func` for chunk i.
        Trailing chunks are empty when there are fewer paths than `num_cpus`.
    """
    # Ceiling division: the chunk size has to be rounded up whenever the paths do not
    # divide evenly, otherwise the last few paths fall outside every chunk.
    div = (len(img_paths) + num_cpus - 1) // num_cpus
    rst = [func.remote(img_paths[i*div:i*div+div]) for i in range(num_cpus)]
    # Block until every task has finished and collect the results in chunk order.
    return ray.get(rst)

In [21]:
# Run the feature extraction over all collected paths through Ray and time it.
# NOTE(review): the recorded worker log reports "failed call to cuInit", so this run may not have used a GPU — confirm.
%time rst = parallelize_job(img_paths, getFeatures, num_cpus)

[2m[36m(pid=177924)[0m 2021-07-10 23:56:49.030529: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
[2m[36m(pid=177925)[0m 2021-07-10 23:56:49.007259: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
[2m[36m(pid=177926)[0m 2021-07-10 23:56:49.007259: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
[2m[36m(pid=177923)[0m 2021-07-10 23:56:49.033849: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
[2m[36m(pid=177924)[0m 2021-07-10 23:56:49.858932: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1
[2m[36m(pid=177924)[0m 2021-07-10 23:56:49.889013: E tensorflow/stream_executor/cuda/cuda_driver.cc:328] failed call to cuInit: CUDA_ERROR_NO_

CPU times: user 1min 55s, sys: 22.2 s, total: 2min 17s
Wall time: 1h 41min 14s


In [22]:
# Merge the per-task result lists into one flat list of (path, feature) tuples.
flist = [pair for chunk in rst for pair in chunk]

# NOTE(review): another cell of this notebook also writes 'source_feature.p', using a
# different layout (path list first, then feature arrays) — whichever runs last wins.
# `with` closes the file even if pickling fails.
with open('source_feature.p', 'wb') as f:
    pickle.dump(flist, f)

In [None]:
##############################################

In [194]:
### Extract and save the features in chunks of `div` images
div = 600
num = (len(img_paths) // div)
num += 1 if len(img_paths) % div > 0 else 0  # one extra chunk for the remainder

# File layout: the list of paths first, then one (chunk_size, feature_dim) array per chunk.
# `with` closes the file even if an extraction or a dump fails half-way.
with open('source_feature.p', 'wb') as f:
    pickle.dump(img_paths, f)

    for i in range(num):
        feats = []
        for ip in img_paths[i*div:(i+1)*div]:
            # Close every image file as soon as its feature is computed.
            with Image.open(ip) as im:
                feats.append(fe.extract(im))
        flist = np.array(feats)
        pickle.dump(flist, f)

# Search

In [2]:
### Read the saved file back
# NOTE: pickle.load can execute arbitrary code — only load files produced by this notebook.
# Only the first pickled object of the file is read; `with` closes the handle afterwards.
with open('source_feature.p', 'rb') as f:
    data = pickle.load(f)
len(data)

50000

In [7]:
# Split the loaded (path, feature) pairs into a list of file names and a feature matrix.
# The file name is whatever follows the last separator; both '\\' and '/' are accepted,
# because splitting on '\\' alone leaves POSIX-style paths untouched.
source_file = [p.replace('\\', '/').rsplit('/', 1)[-1] for p, _ in data]
# Author's open question: is a reshape(64, 64) needed? The vectors are stacked as they are.
source = np.array([d for _, d in data])
source.shape

(50000, 4096)

## full search

In [11]:
# %%timeit
### Search among the images of one group:
### every image is compared only with the images stored after it.
bound = 0.5   # keep only the matches whose L2 distance is below this value
result = []

last_query = source.shape[0] - 1
for q in range(last_query):
    offset = q + 1  # row of `source` that corresponds to index 0 of `dists`
    dists = np.linalg.norm(source[offset:] - source[q], axis=1)  # L2 distances to features
    dist_df = pd.DataFrame(dists, columns=['dist'])
    within = dist_df[dist_df.dist < bound]
    nearest = within.sort_values(by='dist')[:3]  # at most the 3 closest matches
    result.extend(
        (nearest.loc[idx].dist, source_file[q], source_file[idx + offset])
        for idx in nearest.index
    )

df = pd.DataFrame(result, columns = ['dist', 'query_img', 'source_img'])
df = df.sort_values(by=['query_img', 'dist'])

3.76 s ± 33.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [222]:
# ### Read the saved file back (chunked layout: path list first, then the feature arrays)
# f=open('source_feature.p', 'rb')
# source_path = pickle.load(f)
# source = pickle.load(f)
# while True:
#     try:
#         source = np.concatenate((source, pickle.load(f)))
#     except:
#         break
# print(len(source_path))
# source.shape    

1000


(999, 4096)

In [39]:
### Case where the query images differ from the stored images
# NOTE(review): absolute, machine-specific directory — it has to be edited before running elsewhere.
p = r"C:\Users\hyosun87.you\Downloads\4월실험1-자체 영수증 위조\위조"
# p = r"C:\Users\hyosun87.you\Downloads\Low Quality"
# Every *.jpg directly inside that directory, as a sorted list of Path objects.
query_path = sorted(Path(p).glob("*.jpg"))

In [68]:
%%timeit
### Search between two different image sets: every query image against the stored features
result = []
bound = 0.5  # keep only the matches whose L2 distance is below this value

# Extract the query features, closing every image file right after use.
query_feats = []
for qpath in query_path:
    with Image.open(qpath) as im:
        query_feats.append(fe.extract(im))
querys = np.array(query_feats)

for i, v in enumerate(querys):
    dists = np.linalg.norm(source-v, axis=1)  # L2 distances to features
    dist_df = pd.DataFrame(dists, columns=['dist'])
    dist_df = dist_df[dist_df.dist < bound].sort_values(by='dist')[:3]  # at most the 3 closest
    for idx in dist_df.index:
        # `source_file` is the name list built row-by-row together with `source`;
        # `source_path` is not defined by any active cell, and the stored paths are
        # plain strings without `.parts`.
        result.append((dist_df.loc[idx].dist, query_path[i].name, source_file[idx]))
df = pd.DataFrame(result, columns = ['dist', 'query_img', 'source_img']).sort_values(by=['query_img', 'dist'])

3.91 s ± 60.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


12.3 ms ± 118 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


## nmslib

In [8]:
import nmslib

In [10]:
# File that the nmslib index is saved to and loaded from.
idx_path = 'model_index.idx'

In [45]:
%%timeit
# Build an nmslib index over the stored features in the 'cosinesimil' space and save it.
index_time_params = dict(M=15, indexThreadQty=4, efConstruction=100)

index = nmslib.init(space='cosinesimil')
index.addDataPointBatch(source)          # register every stored feature vector
index.createIndex(index_time_params)     # build with the parameters above
index.saveIndex(idx_path)                # write the index to idx_path

24.1 s ± 176 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [11]:
# Create an index object in the same space as at build time and restore the saved index into it.
m_idx = nmslib.init(space='cosinesimil')
m_idx.loadIndex(idx_path)

In [18]:
%%timeit
# neighbors, distances = m_idx.knnQueryBatch(np.expand_dims(source[0], axis = 0), k = 3, num_threads = 4)[0] ### variant for a single image
# Query the 3 nearest neighbours of every stored feature vector in one batch.
s_rst = m_idx.knnQueryBatch(source, k = 3, num_threads = 10)
print(len(s_rst))

50000
50000
50000
50000
50000
50000
50000
50000
3.44 s ± 25.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [16]:
# Peek at the first five query results.
s_rst[:5]

[(array([   60,    90, 36036], dtype=int32),
  array([1.1920929e-07, 1.1920929e-07, 1.1920929e-07], dtype=float32)),
 (array([   31,    61, 36079], dtype=int32),
  array([0., 0., 0.], dtype=float32)),
 (array([   32,    92, 36038], dtype=int32),
  array([5.9604645e-08, 5.9604645e-08, 5.9604645e-08], dtype=float32)),
 (array([ 3, 63, 93], dtype=int32), array([0., 0., 0.], dtype=float32)),
 (array([    4,    64, 36040], dtype=int32),
  array([3.5762787e-07, 3.5762787e-07, 3.5762787e-07], dtype=float32))]

In [66]:
s_rst[0] # each result is a tuple ([indices of the retrieved images], [distances to those images])

(array([   60,     0, 36078], dtype=int32),
 array([1.1920929e-07, 1.1920929e-07, 1.1920929e-07], dtype=float32))

Unnamed: 0,0,1
0,60.0,1.192093e-07
1,0.0,1.192093e-07
2,36078.0,1.192093e-07
