In [1]:
import cv2
import numpy as np
import matplotlib.pyplot as plt

from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale

In [None]:
# Number of k-means clusters, i.e. the number of visual words in the vocabulary
num_clusters = 200

# Set to True to project the descriptors with PCA before clustering
perform_pca = False

In [None]:
# Compute features in all images from database

# The detector is stateless between calls, so build it once and reuse it.
surf = cv2.xfeatures2d.SURF_create()

des_list = []
for path in image_paths:
    image = cv2.imread(path)
    if image is None:
        # cv2.imread signals failure by returning None instead of raising
        raise IOError("Could not read image: %s" % path)
    kp, des = surf.detectAndCompute(image, None)
    if des is None:
        # No keypoints found: keep the image (so indices stay aligned with
        # image_paths) but with an empty descriptor matrix.
        des = np.zeros((0, surf.descriptorSize()), dtype=np.float32)
    else:
        # L2-normalise every descriptor; guard against zero-norm rows so
        # they stay zero instead of becoming NaN.
        norms = np.linalg.norm(des, axis=1, keepdims=True)
        norms[norms == 0] = 1
        des = des / norms
    des_list.append((path, des))

data_size = len(des_list)

# Stack every image's descriptors exactly once (a single vstack call avoids
# both the duplicated first image and the quadratic re-copying).
descriptors = np.vstack([des for _, des in des_list])
    

In [None]:
# Optional dimensionality reduction of the stacked descriptors
if not perform_pca:
    # PCA disabled: cluster the descriptors as they are
    reduced_descriptors = descriptors
else:
    # Fit a 40-component PCA on the descriptors and project them with it
    pca = PCA(n_components=40).fit(descriptors)
    reduced_descriptors = pca.transform(descriptors)

In [None]:
# Cluster the descriptors from the images in the database

# A fixed random_state makes the visual vocabulary, and therefore every
# histogram and index derived from it, reproducible across kernel restarts.
kmeans = KMeans(init='k-means++', n_clusters=num_clusters, n_init=10,
                random_state=0)
kmeans.fit(reduced_descriptors)

# The code book (one row per visual word) is kmeans.cluster_centers_

In [None]:
# Assign each database descriptor to its closest cluster (visual word) and
# count, per image, how often every word occurs.
image_features = np.zeros((data_size, num_clusters))
for i in range(data_size):
    image_des = des_list[i][1]
    if image_des is None or len(image_des) == 0:
        # No descriptors for this image: its histogram stays all-zero
        continue
    if perform_pca:
        # kmeans was fitted on PCA-projected descriptors, so project here too
        image_des = pca.transform(image_des)
    pred = kmeans.predict(image_des)
    # minlength keeps one bin per visual word even if high labels are unused
    image_features[i] = np.bincount(pred, minlength=num_clusters)

In [None]:
# Inverted file index: one posting list per visual word, holding the indices
# (in ascending order) of the database images in which that word occurs more
# than twice.
inverted_file_index = [
    [img for img in range(data_size) if image_features[img][word] > 2]
    for word in range(num_clusters)
]

In [None]:
# Weight the database bag-of-words (BoW) vectors with tf-idf.
# image_features already holds, per image, the number of occurrences of each
# visual word (one dimension per word); here each word is scaled by its
# inverse document frequency and every row is L2-normalised.

# Document frequency: number of database images containing each word
total_counts = np.sum((image_features > 0) * 1, axis=0)
# Smoothed idf (the +1 terms avoid dividing by zero for unused words)
idf = np.log((1.0 * data_size + 1) / (1.0 * total_counts + 1))

weighted_features = image_features * idf.reshape(1, -1)

# Normalise each row to unit length; all-zero rows (no descriptors, or only
# words with idf == 0) are left at zero instead of turning into NaN.
row_norms = np.linalg.norm(weighted_features, axis=1, keepdims=True)
row_norms[row_norms == 0] = 1.0
weighted_features = weighted_features / row_norms

In [2]:
# For a query image, look up its visual words in the inverted file index to
# get the list of database images that share at least one frequent visual word
# with the query (a word counts only if it occurs more than twice in the image)
def get_candidates(query_image):
    """Build the query's visual-word histogram and look up candidate images.

    The query is described with SURF (default parameters) and its descriptors
    are L2-normalised, and PCA-projected when ``perform_pca`` is set, before
    being quantised with ``kmeans``.

    Args:
        query_image: image array passed to the SURF ``detectAndCompute`` call.

    Returns:
        A tuple ``(query_features, candidates)`` where ``query_features`` is a
        ``(1, num_clusters)`` array of raw word counts and ``candidates`` is a
        sorted list of unique database image indices taken from
        ``inverted_file_index`` for every word occurring more than twice in
        the query. ``candidates`` is empty when no descriptors are found.
    """
    surf = cv2.xfeatures2d.SURF_create()
    kp, des = surf.detectAndCompute(query_image, None)

    query_features = np.zeros((1, num_clusters))
    if des is None or len(des) == 0:
        # No keypoints in the query: nothing to quantise or look up
        return query_features, []

    # Same per-descriptor L2 normalisation as applied to the database
    norms = np.linalg.norm(des, axis=1, keepdims=True)
    norms[norms == 0] = 1
    des = des / norms
    if perform_pca:
        # kmeans lives in PCA space when PCA is enabled
        des = pca.transform(des)

    pred = kmeans.predict(des)
    query_features[0] = np.bincount(pred, minlength=num_clusters)

    # Union of the posting lists of every sufficiently frequent query word
    candidates = set()
    for word in np.flatnonzero(query_features[0] > 2):
        candidates.update(inverted_file_index[word])

    # Sorted so the result does not depend on set iteration order
    return query_features, sorted(candidates)

In [None]:
# Compute similarity between query BoW vector and all retrieved image BoW
# vectors. Sort (highest to lowest). Take top K most similar images
def getTopCandidates(top_k, query_features, candidates):
    """Rank candidate database images by similarity to the query.

    The query counts are tf-idf weighted with the global ``idf`` vector and
    L2-normalised; each candidate's score is the dot product between that
    vector and its row of the global ``weighted_features`` matrix.

    Args:
        top_k: maximum number of results to return.
        query_features: raw visual-word counts of the query; any shape that
            flattens to one value per visual word (e.g. ``(1, num_clusters)``).
        candidates: sequence of row indices into ``weighted_features``.

    Returns:
        A numpy array with at most ``top_k`` entries of ``candidates``,
        ordered from highest to lowest similarity. Empty when ``candidates``
        is empty.
    """
    if len(candidates) == 0:
        return np.array([], dtype=int)

    # Element-wise tf-idf weighting of the query, then unit-length scaling
    queryIdf = np.ravel(query_features) * idf
    norm = np.linalg.norm(queryIdf)
    if norm > 0:
        queryIdf = queryIdf / norm

    # Select whole rows (one BoW vector per candidate image)
    candidate_features = weighted_features[np.asarray(candidates, dtype=int)]
    # One similarity score per candidate
    similarity = candidate_features.dot(queryIdf)

    # get top k (large to small); stable sort keeps ties in candidate order
    indices = np.argsort(-similarity, kind="stable")[:top_k]
    return np.array([candidates[idx] for idx in indices])

In [None]:
# How many of the best-matching images to keep
top_k = 10

# Word histogram of the query and the image indices of its candidates
query_features, candidates = get_candidates(query_image)

# Image indices of the top_k candidates
top_k_candidates = getTopCandidates(top_k, query_features, candidates)