In [262]:
import cv2
import numpy as np
import matplotlib.pyplot as plt
from skimage import io, color
from skimage.transform import resize
from sklearn.cluster import KMeans
import math
import argparse
from numpy.linalg import norm
import os
import random
import pickle
from joblib import dump, load
import pandas as pd
import time

In [42]:
train_path = "./coco128/images/train2017/"
dir_list = os.listdir(train_path)

In [43]:
train_image_paths = dir_list[:] 

In [48]:
train_descriptors = []

sift = cv2.SIFT_create()

for current_path in train_image_paths:
    current_image = cv2.imread(train_path + current_path, 0)
    
    keypoints, descriptors = sift.detectAndCompute(current_image, None)

    for i in descriptors:
        train_descriptors.append(i)


In [36]:
random.shuffle(train_descriptors)

In [50]:
model = KMeans(n_clusters=512)
model.fit(train_descriptors)

  super()._check_params_vs_input(X, default_n_init=10)


In [55]:
dump(model, 'search_trained.joblib') 

['search_trained.joblib']

In [57]:
loaded_model = load('search_trained.joblib') 

In [317]:
def tokenize(image):
    keypoints, descriptors = sift.detectAndCompute(image, None)
    tokens = []

    if (descriptors is not None):
        tokens = loaded_model.predict(descriptors.tolist())

    return tokens

def normalize_tokens(tokens):
    hist = [0 for i in range(512)]
    for i in tokens:
        hist[i] += 1

    return [float(i)/(max(hist) + 0.000001) for i in hist]


In [119]:
# plt.hist(normalized_tokens, 512)
# plt.grid(True)
# plt.xlim(0, 512)
# plt.ylim(0, 1)
# plt.show()

In [319]:
data = []

database = "./JPEGImages/"
database_image_paths = os.listdir(database)

for current_path in database_image_paths[:]:
    current_image = cv2.imread(database + current_path, 0)
    
    tokenized_image = tokenize(current_image)
    normalized_tokens = normalize_tokens(tokenized_image)

    data.append([database + current_path, normalized_tokens])

data = pd.DataFrame(data, columns=["pathfile", "vectorized_image"])
data

Unnamed: 0,pathfile,vectorized_image
0,./JPEGImages/2007_000027.jpg,"[0.21052630470914185, 0.05263157617728546, 0.0..."
1,./JPEGImages/2007_000032.jpg,"[0.0, 0.0, 0.23076921301775286, 0.0, 0.0, 0.07..."
2,./JPEGImages/2007_000033.jpg,"[0.0, 0.14285713265306196, 0.14285713265306196..."
3,./JPEGImages/2007_000039.jpg,"[0.0, 0.08333332638888948, 0.08333332638888948..."
4,./JPEGImages/2007_000042.jpg,"[0.03846153698224858, 0.0, 0.03846153698224858..."
...,...,...
17120,./JPEGImages/2012_004326.jpg,"[0.07142856632653098, 0.0, 0.07142856632653098..."
17121,./JPEGImages/2012_004328.jpg,"[0.1666666574074079, 0.0, 0.1666666574074079, ..."
17122,./JPEGImages/2012_004329.jpg,"[0.0, 0.0, 0.19999998000000202, 0.0, 0.0999999..."
17123,./JPEGImages/2012_004330.jpg,"[0.19999998000000202, 0.09999999000000101, 0.1..."


In [320]:
df_images = pd.DataFrame(data, columns=["pathfile", "vectorized_image"])

In [321]:
df_images.to_csv("images_base.csv", sep='|', encoding='utf-8')

In [322]:
df_images = pd.read_csv("images_base.csv", sep='|')[["pathfile", "vectorized_image"]]

In [323]:
import json
df_images["vectorized_image"] = df_images.vectorized_image.apply(lambda t: json.loads(t))

In [324]:
df_images["vectorized_image"] = [np.array(i).tolist() for i in df_images["vectorized_image"]]

In [325]:
df_images.loc[:, "vectorized_image"].values

array([list([0.21052630470914185, 0.05263157617728546, 0.05263157617728546, 0.5789473379501401, 0.21052630470914185, 0.05263157617728546, 0.21052630470914185, 0.26315788088642733, 0.15789472853185638, 0.05263157617728546, 0.05263157617728546, 0.36842103324099823, 0.15789472853185638, 0.26315788088642733, 0.36842103324099823, 0.10526315235457093, 0.15789472853185638, 0.5789473379501401, 0.05263157617728546, 0.15789472853185638, 0.0, 0.10526315235457093, 0.5789473379501401, 0.15789472853185638, 0.10526315235457093, 0.31578945706371275, 0.21052630470914185, 0.26315788088642733, 0.05263157617728546, 0.05263157617728546, 0.36842103324099823, 0.26315788088642733, 0.05263157617728546, 0.15789472853185638, 0.10526315235457093, 0.31578945706371275, 0.21052630470914185, 0.10526315235457093, 0.21052630470914185, 0.0, 0.31578945706371275, 0.26315788088642733, 0.21052630470914185, 0.31578945706371275, 0.26315788088642733, 0.4210526094182837, 0.4210526094182837, 0.15789472853185638, 0.26315788088642

In [326]:
x = df_images.loc[:, "vectorized_image"].values.tolist()

In [330]:
from sklearn.neighbors import NearestNeighbors

nbrs = NearestNeighbors(n_neighbors=11).fit(x)

In [331]:
distances, indices = nbrs.kneighbors(x[:5])

In [332]:
for i in x[:5]:
    distances, indices = nbrs.kneighbors([i])
    k = 0
    for j in indices[0]:
        current_image = cv2.imread(df_images.loc[j, "pathfile"])
        print(df_images.loc[j, "pathfile"])
        print(current_image)
        cv2.imshow(str(k), current_image) 
        k+=1
    cv2.waitKey(0) 
    cv2.destroyAllWindows()

./JPEGImages/2007_000027.jpg
[[[208 220 208]
  [206 218 206]
  [205 215 203]
  ...
  [ 72  58  82]
  [ 56  43  65]
  [175 164 184]]

 [[210 222 210]
  [209 221 209]
  [210 220 208]
  ...
  [ 72  68  93]
  [ 79  77  97]
  [175 173 192]]

 [[209 220 210]
  [211 222 212]
  [210 221 211]
  ...
  [ 91 112 133]
  [101 119 136]
  [173 189 205]]

 ...

 [[ 89  97 156]
  [ 86  95 152]
  [ 80  87 144]
  ...
  [ 51  32  47]
  [ 69  53  64]
  [245 232 240]]

 [[ 86  97 155]
  [ 87  96 153]
  [ 84  91 148]
  ...
  [ 56  37  52]
  [ 71  55  66]
  [243 230 238]]

 [[ 85  96 154]
  [ 87  98 155]
  [ 85  92 149]
  ...
  [ 69  50  65]
  [ 81  65  76]
  [250 237 245]]]
./JPEGImages/2009_001450.jpg
[[[146 137 140]
  [238 238 238]
  [253 255 255]
  ...
  [174 165 162]
  [188 177 173]
  [183 173 166]]

 [[139 124 128]
  [234 229 231]
  [254 254 254]
  ...
  [192 183 180]
  [226 215 211]
  [211 199 195]]

 [[147 124 132]
  [212 201 204]
  [255 254 255]
  ...
  [157 150 147]
  [175 163 161]
  [224 211 209]]

