# # Feature extraction and reverse image search

In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
import random
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.imagenet_utils import decode_predictions, preprocess_input
from tensorflow.keras.models import Model
from sklearn.decomposition import PCA
from scipy.spatial import distance

In [None]:
# Download the VGG19 network with ImageNet weights, keeping the fully-connected
# classification head (include_top=True), then print its layer summary.
model = tf.keras.applications.VGG19(
    include_top=True,
    weights="imagenet",
)
model.summary()

In [None]:
# Load and prepare img

def load_image(path):
    """Load an image file and turn it into a batch of one for the network.

    The file is resized to the spatial size taken from ``model.input_shape``,
    converted to an array, given a leading batch axis and passed through
    ``preprocess_input``.

    Returns:
        A ``(img, x)`` tuple: the resized image as returned by
        ``image.load_img`` and the preprocessed array to feed to the model.
    """
    target_size = model.input_shape[1:3]
    img = image.load_img(path, target_size=target_size)
    batch = np.expand_dims(image.img_to_array(img), axis=0)
    return img, preprocess_input(batch)

In [None]:
# Sanity check: load one sample watch image, print the dtype of the
# preprocessed array and show the resized picture.
img, x = load_image('../input/wf-panerai-feat-extc/WF_panerai/109645_PAM00332_Luminor Marina.jpeg')
print("data type: ", x.dtype)
plt.imshow(img)

In [None]:
# Build `feat_extractor`: a model that shares the input of `model` but outputs
# the activations of the layer named "fc2" (the second 4096-neuron
# fully-connected layer), so the final classification layer is dropped.
# Its predictions are used below as the feature vector of an image.

feat_extractor = Model(inputs=model.input, outputs=model.get_layer("fc2").output)

In [None]:
# Display the model's input shape; load_image uses elements 1:3 of it as the
# target size when resizing images.
model.input_shape

In [None]:
# Plot the feature vector of the sample image: run it through feat_extractor
# and draw the activations of the single batch entry as a line plot.
img, x = load_image('../input/wf-panerai-feat-extc/WF_panerai/109645_PAM00332_Luminor Marina.jpeg')
feat = feat_extractor.predict(x)
plt.figure(figsize=(16,4))
plt.plot(feat[0])

In [None]:
# Collect every image file under the stock directory (searched recursively).
# These are the watches in stock whose feature vectors will be extracted.

images_path = '../input/wf-panerai-feat-extc/WF_panerai'
image_extensions = ['.jpg', '.png', '.jpeg']

# Keep a file when its extension, compared case-insensitively, is a known
# image type.
images = [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk(images_path)
    for name in names
    if os.path.splitext(name)[1].lower() in image_extensions
]

print(f'Keeping {len(images)} watches on stock to analyze.')

In [None]:
# Extract one feature vector per stock image, in the same order as `images`,
# so features[k] always describes images[k].

features = []

for image_path in images:
    # Only the preprocessed batch is needed here; the loaded picture is unused.
    _img, x = load_image(image_path)
    # predict returns a batch of one; keep its single row.
    feat = feat_extractor.predict(x)[0]
    features.append(feat)

print(f'Finished extracting features for {len(images)} images.')

In [None]:
# Display the shape of the stacked feature matrix (one row per image).
np.array(features).shape

In [None]:
# Apply PCA to reduce the dimensionality of the feature vectors, for two reasons:
# - The 4096-element feature vector may carry redundant information.
# - Working with 4096 elements per image is inefficient in both space/memory
#   and computation time.

features = np.array(features)
# PCA cannot produce more components than min(n_samples, n_features), so cap
# the requested 200 components to avoid a ValueError on small image collections.
pca = PCA(n_components=min(200, *features.shape))
pca.fit(features)

In [None]:
# Project every image's feature vector onto the fitted PCA components.
pca_features = pca.transform(features)

In [None]:
# Display the shape of the original (pre-PCA) feature matrix.
features.shape

In [None]:
# Pick a random query image. randrange draws a uniform integer index in
# [0, len(images)) directly, without scaling and truncating a random float.
query_image_idx = random.randrange(len(images))

# Display the query image (loaded without resizing).
img = image.load_img(images[query_image_idx])
plt.imshow(img)

In [None]:
# Cosine distance between the query image's PCA vector and every image's PCA
# vector (the query itself included). Despite its name, `similar_idx` holds
# distances, so smaller values mean more similar images.
similar_idx = [
    distance.cosine(pca_features[query_image_idx], candidate)
    for candidate in pca_features
]

In [None]:
# `similar_idx` holds the query image's cosine distance to every image, the
# query itself included. Rank all image indexes from smallest to largest
# distance and keep the 5 closest ones.
# The query is removed by index rather than by dropping the first ranked entry:
# when another image ties with the query at distance 0 (e.g. a duplicate
# picture), the query is not guaranteed to be ranked first, and slicing [1:6]
# would then keep the query and discard a genuine match.

idx_closest = [
    k
    for k in sorted(range(len(similar_idx)), key=lambda k: similar_idx[k])
    if k != query_image_idx
][:5]

In [None]:
# Open the images specified by idx_closest and concatenate them into a single image (resizing each to a height of 224 pixels).

In [None]:
# Load each result image, scale it to a common height of 224 pixels (the width
# is scaled by the same factor) and collect the thumbnails.
thumbs = []
for idx in idx_closest:
    print(images[idx])
    img = image.load_img(images[idx])
    img = img.resize(
        (int(img.width * 224 / img.height), 224)
    )
    thumbs.append(img)

# Join the thumbnails side by side (along the width axis) into one array.
concat_image = np.concatenate(list(map(np.asarray, thumbs)), axis=1)

# Display the combined strip.
plt.figure(figsize=(16, 12))
plt.imshow(concat_image)

In [None]:
# Refactoring 

def get_closest_images(query_image_idx, num_results=5, features=None):
    """Return the indexes of the images most similar to a query image.

    Similarity is measured with the cosine distance between feature vectors;
    the result is ordered from most to least similar.

    Args:
        query_image_idx: Index of the query image's vector in ``features``.
        num_results: Maximum number of indexes to return.
        features: Sequence of feature vectors, one per image. Defaults to
            the module-level ``pca_features``.

    Returns:
        A list of at most ``num_results`` indexes. ``query_image_idx`` itself
        is never part of the result.
    """
    if features is None:
        features = pca_features
    query = features[query_image_idx]
    distances = [distance.cosine(query, feat) for feat in features]
    ranked = sorted(range(len(distances)), key=distances.__getitem__)
    # Drop the query by index instead of slicing off the first ranked entry:
    # if another vector ties with the query at distance 0 (e.g. a duplicate
    # image), the query is not guaranteed to be ranked first.
    closest = [idx for idx in ranked if idx != query_image_idx]
    return closest[:max(num_results, 0)]

def get_concatenated_images(indexes, thumb_height):
    """Build one horizontal strip from the images at the given indexes.

    Each image is loaded from the module-level ``images`` list and resized to
    ``thumb_height`` pixels high, with its width scaled by the same factor.

    Returns:
        The thumbnails joined side by side (along axis 1) as a single array.
    """
    def make_thumb(idx):
        picture = image.load_img(images[idx])
        scaled_width = int(picture.width * thumb_height / picture.height)
        return picture.resize((scaled_width, thumb_height))

    strip = [np.asarray(make_thumb(idx)) for idx in indexes]
    return np.concatenate(strip, axis=1)

In [None]:
# Query with a new image file: load it and extract its feature vector with
# the same extractor used for the stock images.
new_image, x = load_image('../input/panerai-models/Panerai_Models/Ferrari/Ferrari_129.jpeg')
new_features = feat_extractor.predict(x)

# Map the new feature vector into the PCA space fitted on the stock images.
new_pca_features = pca.transform(new_features)[0]

# Cosine distance from the new image to every stock image's PCA vector.
distances = [
    distance.cosine(new_pca_features, candidate)
    for candidate in pca_features
]
# Nothing is skipped here: the 5 smallest distances are kept (the query comes
# from a different directory, so it is presumably not among the stock images).
idx_closest = sorted(range(len(distances)), key=distances.__getitem__)[:5]
results_image = get_concatenated_images(idx_closest, 200)

# Show the query image.
plt.figure(figsize=(5, 5))
plt.imshow(new_image)
plt.title("query image")

# Show the closest stock images side by side.
plt.figure(figsize=(16, 12))
plt.imshow(results_image)
plt.title("result images")

In [None]:
# Export the VGG19 model to an HDF5 (.h5) file.
# NOTE(review): this saves the full `model`, not `feat_extractor`, although the
# file name ("ft_ext") suggests the feature extractor — confirm which one is
# intended.
model.save('../models/VGG19_ft_ext.h5')