In [2]:
!pip install annoy

Collecting annoy
[?25l  Downloading https://files.pythonhosted.org/packages/00/15/5a9db225ebda93a235aebd5e42bbf83ab7035e7e4783c6cb528c635c9afb/annoy-1.16.3.tar.gz (644kB)
[K     |▌                               | 10kB 16.3MB/s eta 0:00:01[K     |█                               | 20kB 3.5MB/s eta 0:00:01[K     |█▌                              | 30kB 4.2MB/s eta 0:00:01[K     |██                              | 40kB 4.7MB/s eta 0:00:01[K     |██▌                             | 51kB 4.1MB/s eta 0:00:01[K     |███                             | 61kB 4.4MB/s eta 0:00:01[K     |███▋                            | 71kB 4.9MB/s eta 0:00:01[K     |████                            | 81kB 5.3MB/s eta 0:00:01[K     |████▋                           | 92kB 5.7MB/s eta 0:00:01[K     |█████                           | 102kB 5.3MB/s eta 0:00:01[K     |█████▋                          | 112kB 5.3MB/s eta 0:00:01[K     |██████                          | 122kB 5.3MB/s eta 0:00:01[K    

In [3]:
#################################################
# This script reads image feature vectors from a folder
# and saves the image similarity scores in json file
# by Erdem Isbilen - December/2019
#################################################

#################################################
# Imports and function definitions
#################################################

# Numpy for loading image feature vectors from file
import numpy as np

# Time for measuring the process time
import time

# Glob for reading file names in a folder
import glob
import os.path

# json for storing data in json file
import json

# Annoy and Scipy for similarity calculation
from annoy import AnnoyIndex
from scipy import spatial
#################################################


In [5]:
#################################################
# This function reads from 'image_data.json' file
# Looks for a specific 'filename' value
# Returns the product id when product image names are matched 
# So it is used to find product id based on the product image name
#################################################
def match_id(filename):
  with open("/content/drive/My Drive/Algorithms and Digital Marketing/Assignment3/fin.json") as json_file:
    
    for file in json_file:
        seen = json.loads(file)

        for line in seen:
          
          if filename==line['imageName']:
            
            print(line)
            return line['productId']
            break
#################################################


In [6]:
#################################################
# This function; 
# Reads all image feature vectores stored in /feature-vectors/*.npz
# Adds them all in Annoy Index
# Builds ANNOY index
# Calculates the nearest neighbors and image similarity metrics
# Stores image similarity scores with productID in a json file
#################################################
def cluster():

  start_time = time.time()
  
  print("---------------------------------")
  print ("Step.1 - ANNOY index generation - Started at %s" %time.ctime())
  print("---------------------------------")

In [7]:
# Defining data structures as empty dict
file_index_to_file_name = {}
file_index_to_file_vector = {}
file_index_to_product_id = {}

In [8]:
  # Configuring annoy parameters
  dims = 1792
  n_nearest_neighbors = 20
  trees = 10000

In [9]:
  # Reads all file names which stores feature vectors 
  allfiles = glob.glob('/content/drive/My Drive/Algorithms and Digital Marketing/Assignment3/Vectors/*.npz')

In [10]:
t = AnnoyIndex(dims, metric='angular')

In [11]:
for file_index, i in enumerate(allfiles):
    
    start_time = time.time()
    # Reads feature vectors and assigns them into the file_vector 
    file_vector = np.loadtxt(i)

    # Assigns file_name, feature_vectors and corresponding product_id
    file_name = os.path.basename(i).split('.')[0]
    file_index_to_file_name[file_index] = file_name
    file_index_to_file_vector[file_index] = file_vector
    file_index_to_product_id[file_index] = match_id(file_name)

    # Adds image feature vectors into annoy index   
    t.add_item(file_index, file_vector)

    print("---------------------------------")
    print("Annoy index     : %s" %file_index)
    print("Image file name : %s" %file_name)
    print("Product id      : %s" %file_index_to_product_id[file_index])
    print("--- %.2f minutes passed ---------" % ((time.time() - start_time)/60))

In [None]:
  # Builds annoy index
  t.build(trees)

  print ("Step.1 - ANNOY index generation - Finished")
  print ("Step.2 - Similarity score calculation - Started ") 
  
  named_nearest_neighbors = []

  # Loops through all indexed items
  for i in file_index_to_file_name.keys():

    # Assigns master file_name, image feature vectors and product id values
    master_file_name = file_index_to_file_name[i]
    master_vector = file_index_to_file_vector[i]
    master_product_id = file_index_to_product_id[i]

    # Calculates the nearest neighbors of the master item
    nearest_neighbors = t.get_nns_by_item(i, n_nearest_neighbors)

    # Loops through the nearest neighbors of the master item
    for j in nearest_neighbors:

      print(j)

      # Assigns file_name, image feature vectors and product id values of the similar item
      neighbor_file_name = file_index_to_file_name[j]
      neighbor_file_vector = file_index_to_file_vector[j]
      neighbor_product_id = file_index_to_product_id[j]

      # Calculates the similarity score of the similar item
      similarity = 1 - spatial.distance.cosine(master_vector, neighbor_file_vector)
      rounded_similarity = int((similarity * 10000)) / 10000.0

      # Appends master product id with the similarity score 
      # and the product id of the similar items
      named_nearest_neighbors.append({
        'similarity': rounded_similarity,
        'master_pi': master_product_id,
        'similar_pi': neighbor_product_id})

    print("---------------------------------") 
    print("Similarity index       : %s" %i)
    print("Master Image file name : %s" %file_index_to_file_name[i]) 
    print("Nearest Neighbors.     : %s" %nearest_neighbors) 
    print("--- %.2f minutes passed ---------" % ((time.time() - start_time)/60))

  
  print ("Step.2 - Similarity score calculation - Finished ") 

  # Writes the 'named_nearest_neighbors' to a json file
  with open('nearest_neighbors.json', 'w') as out:
    json.dump(named_nearest_neighbors, out)

  print ("Step.3 - Data stored in 'nearest_neighbors.json' file ") 
  print("--- Prosess completed in %.2f minutes ---------" % ((time.time() - start_time)/60))

cluster()


Step.1 - ANNOY index generation - Finished
Step.2 - Similarity score calculation - Started 
0
84
87
40
75
17
22
26
54
60
47
56
96
18
1
104
86
102
31
27
---------------------------------
Similarity index       : 0
Master Image file name : 0_0
Nearest Neighbors.     : [0, 84, 87, 40, 75, 17, 22, 26, 54, 60, 47, 56, 96, 18, 1, 104, 86, 102, 31, 27]
--- 0.78 minutes passed ---------
1
87
95
84
75
102
0
96
47
36
14
41
26
22
18
109
100
107
54
60
---------------------------------
Similarity index       : 1
Master Image file name : 48_0
Nearest Neighbors.     : [1, 87, 95, 84, 75, 102, 0, 96, 47, 36, 14, 41, 26, 22, 18, 109, 100, 107, 54, 60]
--- 0.78 minutes passed ---------
2
6
31
86
89
21
28
81
48
16
0
66
67
104
12
34
17
53
56
79
---------------------------------
Similarity index       : 2
Master Image file name : 43_1
Nearest Neighbors.     : [2, 6, 31, 86, 89, 21, 28, 81, 48, 16, 0, 66, 67, 104, 12, 34, 17, 53, 56, 79]
--- 0.78 minutes passed ---------
3
24
25
59
92
44
45
21
28
19
86
71
9