In [None]:
import tensorflow_hub as hub
from keras.preprocessing import image
import numpy as np
import os
from numpy.linalg import norm
from tqdm import tqdm_notebook
import pickle

#### Get embeddings of the train-mini images

In [None]:

# Load the TF Hub iNaturalist Inception V3 feature-vector module that produces our
# embeddings. trainable=False: it is used as a frozen feature extractor.
model = hub.KerasLayer("https://tfhub.dev/google/inaturalist/inception_v3/feature_vector/5", trainable=False) # extract_features resizes images to 299x299 for this model

def extract_features(img_path, model, target_size=(299, 299)):
    """Return the L2-normalised embedding of a single image.

    The image is loaded and resized to ``target_size``, converted to an array,
    scaled to [0, 1], given a leading batch axis and passed through ``model``.

    Args:
        img_path: Path of the image file to embed.
        model: Callable applied to the batched image; its result must expose
            ``.numpy()`` and carry the batch on axis 0.
        target_size: ``(height, width)`` the image is resized to on load.

    Returns:
        The embedding of the image (first batch entry) divided by its L2 norm.
        An all-zero embedding is returned unchanged instead of being turned
        into NaNs by a division by zero.
    """
    img = image.load_img(img_path, target_size=target_size)
    img = image.img_to_array(img)

    # scale pixel values to [0, 1]
    img = img / 255

    # add the batch axis: (H, W, C) -> (1, H, W, C)
    img = np.expand_dims(img, axis=0)

    # run the model and keep the single embedding of the batch
    img_embeddings = model(img).numpy()[0]

    # L2-normalise the embedding; skip when the norm is zero to avoid NaNs
    length = norm(img_embeddings)
    if length > 0:
        img_embeddings /= length

    return img_embeddings

In [None]:
# get list of images in directory, including nested directories within dir
extensions = ['.jpg', '.JPG', '.jpeg', '.JPEG', '.png', '.PNG']
def get_file_list(root_dir):
    """Recursively collect the image files found under ``root_dir``.

    A file is kept when its final suffix (compared case-insensitively) is one
    of ``extensions``. The suffix is matched exactly, so a name that merely
    contains an extension somewhere (``photo.jpg.txt``) is not picked up.
    Any path containing ``.ipynb`` (e.g. notebook checkpoint folders) is skipped.

    Args:
        root_dir: Directory to walk.

    Returns:
        List of image paths in ``os.walk`` order (unsorted); empty when
        nothing matches or ``root_dir`` does not exist.
    """
    allowed = {ext.lower() for ext in extensions}
    file_list = []
    for root, _directories, names in os.walk(root_dir):
        for name in names:
            suffix = os.path.splitext(name)[1].lower()
            if suffix not in allowed:
                continue
            img_path = os.path.join(root, name)
            # skip anything living in notebook checkpoint folders
            if '.ipynb' in img_path:
                continue
            file_list.append(img_path)
    return file_list

In [None]:
# Root folder of the dataset to embed (the original note says the birds
# subset is used because it is smaller).
root_dir = './data/train_mini_supercategory/'

# Collect every image path below the root and put them in a stable, sorted order.
filenames = get_file_list(root_dir)
filenames.sort()

num_images = len(filenames)
print(num_images, 'files found')

In [None]:
# Embed every image, in the same order as `filenames`, with a progress bar.
feature_list = [
    extract_features(img_path, model)
    for img_path in tqdm_notebook(filenames)
]

In [None]:
# Pickle the embeddings and the matching file paths.
# Make sure the output folder exists, and use context managers so both files
# are flushed and closed even if pickling fails part-way.
os.makedirs('./models', exist_ok=True)
with open('./models/trainmini-features-inceptionv3.pickle', 'wb') as features_file:
    pickle.dump(feature_list, features_file)
with open('./models/trainmini-filenames.pickle', 'wb') as names_file:
    pickle.dump(filenames, names_file)

#### Load the pickled features, build a FAISS index and save it

In [None]:
import faiss
import pickle
import sys
import numpy as np
from tqdm import tqdm_notebook as tqdm

In [None]:
def get_top_1_topn(indexes, n=5):
    '''Ranking metric: count how often the correct class is ranked first and
    how often it appears anywhere in the top-n results.

    Column 0 of every row is taken as the query image itself; the remaining
    columns are its retrieved neighbours. Class names are derived from the
    module-level `filenames` list through `classname_filename`.

    Args:
        indexes: Iterable of rows of integer positions into `filenames`
            (e.g. the `I` array returned by `index.search`).
        n: Number of leading columns of each row to consider, the query in
            column 0 included.

    Returns:
        Tuple `(times_top1, times_topn)`. A first-rank hit is counted in both.
        Rows with no neighbour column inside the first `n` are skipped.
    '''
    times_top1 = 0
    times_topn = 0

    # iterate over the argument, not over whatever global `I` happens to hold
    for row in tqdm(indexes):
        actual_name = classname_filename(filenames[row[0]])
        neighbour_names = [classname_filename(filenames[j]) for j in row[1:n]]

        if not neighbour_names:
            continue

        if neighbour_names[0] == actual_name:
            times_top1 += 1
            times_topn += 1
        elif actual_name in neighbour_names:
            times_topn += 1

    return times_top1, times_topn

def classname_filename(str):
    """Return the class name of an image path, i.e. its parent folder name.

    Both ``/`` and ``\\`` are treated as separators, so paths produced by
    ``os.path.join`` on Windows resolve to the right folder as well.

    Args:
        str: Image path such as ``./data/<class name>/<file>``. The parameter
            name shadows the builtin; it is kept for backward compatibility.

    Returns:
        The second-to-last path component.

    Raises:
        IndexError: If the path has no parent folder component.
    """
    path = str
    return path.replace('\\', '/').split('/')[-2]

In [None]:
# Reload the embeddings and their file paths from the pickles written earlier.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open('./models/trainmini-features-inceptionv3.pickle', 'rb') as features_file:
    # stack into one float32 matrix (one row per image)
    feature_list = np.asarray(pickle.load(features_file), dtype=np.float32)

with open('./models/trainmini-filenames.pickle', 'rb') as names_file:
    filenames = pickle.load(names_file)

num_images = len(filenames)

In [None]:
# Display the shape of the feature matrix.
# presumably (num_images, 2048), matching the 2048-d index built below -- TODO confirm
feature_list.shape

In [None]:
# Index description string handed to faiss.index_factory when the indexes are built.
INDEX_KEY = "Flat"
# When True, the index is moved to GPU 0 before vectors are added and copied
# back to the CPU before it is written to disk.
use_gpu = True # NOTE(review): the original comment said "false in this notebook" but the value is True -- confirm which is intended

In [None]:
# build faiss index
# Size the index from the data instead of hard-coding 2048, so it always
# matches the dimensionality of the vectors added to it.
index = faiss.index_factory(feature_list.shape[1], INDEX_KEY)

if use_gpu:
    print('Using GPU')
    # hasattr() lets the assertion message show up on a CPU-only build; a bare
    # attribute access would raise AttributeError before the assert is evaluated
    assert hasattr(faiss, 'StandardGpuResources'), \
        "FAISS was not compiled with GPU support, or loading _swigfaiss_gpu.so failed"
    res = faiss.StandardGpuResources()
    dev_no = 0

    # transfer to GPU (may be partial)
    index = faiss.index_cpu_to_gpu(res, dev_no, index)
    # not used in the cells shown here; kept in case other cells rely on it
    params = faiss.GpuParameterSpace()

print(index.is_trained)

In [None]:
%%timeit -r 1 -n 1
# Add every feature vector to the index and time it (-r 1 -n 1: one run, one loop).
# NOTE(review): re-running this cell presumably adds the same vectors a second
# time -- rebuild the index first if that is not wanted.
index.add(feature_list)

In [None]:
%%timeit -r 1 -n 1
# Sanity check: query the index with the first 5 vectors and ask for 5 results each.
# NOTE(review): names assigned under %%timeit may not persist in the notebook
# namespace; the next cell assigns D and I again before they are used.
D, I = index.search(feature_list[:5], 5) # sanity check, 5 Nearest-Neighbours
print(I) # positions of the returned vectors
print(D) # distances to the returned vectors

In [None]:
#%%timeit -r 1 -n 1
# Query the index with the first 100,000 feature vectors (all of them if there
# are fewer), 5 results per query; these results feed the top-1 / top-5 count below.
D, I = index.search(feature_list[:100_000], 5)

In [None]:
# Count how many queries have the correct class at rank 1 / within the returned
# results, using the search output I computed above.
times_top1, times_top5 = get_top_1_topn(I, 5)
print('Times top1:', times_top1, '\nTimes top5:', times_top5)

In [None]:
# If the index is on the GPU, convert it to a CPU index before saving.
# (original note: about 4 GB when the entire index is saved)
# The former bare `sys.getsizeof(index)` statement was dropped: its result was
# discarded, so it neither displayed nor checked anything.
faiss.write_index(
    faiss.index_gpu_to_cpu(index) if use_gpu else index,
    './models/trainmini-faiss.index',
)

#### Test FAISS with PCA

In [None]:
from sklearn.decomposition import PCA

# Reduce the embeddings to 128 dimensions.
num_feature_dimensions = 128
# Fixed random_state so the fit is reproducible when scikit-learn picks a
# randomized SVD solver.
pca = PCA(n_components=num_feature_dimensions, random_state=0)
pca.fit(feature_list)  # train PCA
# FAISS expects C-contiguous float32 input, so enforce that on the PCA output.
feature_list_compressed = np.ascontiguousarray(
    pca.transform(feature_list), dtype=np.float32
)

In [None]:
# Save the fitted PCA so the same projection can be applied to new embeddings.
# The context manager makes sure the file is flushed and closed.
with open("./models/pca-trainmini.pkl", "wb") as pca_file:
    pickle.dump(pca, pca_file)

In [None]:
# build faiss index for the PCA-compressed vectors
index = faiss.index_factory(num_feature_dimensions, INDEX_KEY)

if use_gpu:
    print('Using GPU')
    # hasattr() lets the assertion message show up on a CPU-only build; a bare
    # attribute access would raise AttributeError before the assert is evaluated
    assert hasattr(faiss, 'StandardGpuResources'), \
        "FAISS was not compiled with GPU support, or loading _swigfaiss_gpu.so failed"
    res = faiss.StandardGpuResources()
    dev_no = 0

    # transfer to GPU (may be partial)
    index = faiss.index_cpu_to_gpu(res, dev_no, index)
    # not used in the cells shown here; kept in case other cells rely on it
    params = faiss.GpuParameterSpace()

print(index.is_trained)

In [None]:
%%timeit -r 1 -n 1
# Add the PCA-compressed vectors to the index and time it (-r 1 -n 1: one run, one loop).
# NOTE(review): re-running this cell presumably adds the same vectors a second time.
index.add(feature_list_compressed)

In [None]:
# Query the PCA index with the first 100,000 compressed vectors (all of them if
# there are fewer), 5 results per query.
D, I = index.search(feature_list_compressed[:100_000], 5)

In [None]:
# Same top-1 / top-5 count as before, now on the results of the PCA index.
times_top1, times_top5 = get_top_1_topn(I, 5)
print('Times top1:', times_top1, '\nTimes top5:', times_top5)

In [None]:
# Write the PCA index to disk; a GPU index is converted to a CPU index first.
faiss.write_index(
    faiss.index_gpu_to_cpu(index) if use_gpu else index,
    './models/pca-trainmini-faiss.index',
)