-
Notifications
You must be signed in to change notification settings - Fork 0
/
image detection.py
158 lines (117 loc) · 5.37 KB
/
image detection.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
#################################################
# This script reads image feature vectors from a folder
# and saves the image similarity scores in a JSON file
# by Erdem Isbilen - December/2019
#################################################
#################################################
# Imports and function definitions
#################################################
# Numpy for loading image feature vectors from file
import numpy as np
# Time for measuring the process time
import time
# Glob for reading file names in a folder
import glob
import os.path
# json for storing data in json file
import json
# Annoy and Scipy for similarity calculation
from annoy import AnnoyIndex
from scipy import spatial
#################################################
#################################################
# This function reads from 'image_data.json' file
# Looks for a specific 'filename' value
# Returns the product id when product image names are matched
# So it is used to find product id based on the product image name
#################################################
def match_id(filename):
    """Return the product id recorded for the image called *filename*.

    Reads 'image_data.json' from the current working directory. The file may
    hold either a single JSON document (compact or pretty-printed over
    several lines) or one JSON document per line; blank lines are ignored.
    Each document is iterated and every element is expected to provide the
    'imageName' and 'productId' keys.

    The first matching record is printed and its 'productId' is returned.
    Returns None when no record matches.

    Raises:
        FileNotFoundError: if 'image_data.json' does not exist.
        json.JSONDecodeError: if the content is valid neither as a whole
            document nor line by line (raised when the bad line is reached).
        KeyError: if an inspected record lacks 'imageName' or 'productId'.
    """
    with open('image_data.json') as json_file:
        content = json_file.read()

    try:
        # Parsing the whole file at once also accepts pretty-printed JSON,
        # which a per-line parse cannot handle.
        documents = [json.loads(content)]
    except json.JSONDecodeError:
        # Fall back to one document per line. The generator is lazy, so a
        # malformed line located after the match never gets parsed.
        documents = (
            json.loads(row) for row in content.split('\n') if row.strip()
        )

    for records in documents:
        for record in records:
            if filename == record['imageName']:
                print(record)
                return record['productId']
    return None
#################################################
#################################################
# This function;
# Reads all image feature vectors stored in the *.npz files of the feature-vector folder
# Adds them all in Annoy Index
# Builds ANNOY index
# Calculates the nearest neighbors and image similarity metrics
# Stores image similarity scores with productID in a json file
#################################################
def cluster(
    feature_glob='/Users/sophiekieftenbelt/Documents/school/Data Driven Design/Semester 3/State of the Art Technology/Facial detection/Test/*.npz',
    dims=1792,
    n_nearest_neighbors=20,
    trees=10000,
    output_path='nearest_neighbors.json',
):
    """Index stored feature vectors with Annoy and export similarity scores.

    Every file matched by *feature_glob* is loaded as one feature vector,
    added to an angular-metric Annoy index and associated with the product
    id that ``match_id`` returns for the file's base name. For each indexed
    item the nearest neighbours are looked up and one record per
    (item, neighbour) pair is written to *output_path* as a JSON list of
    ``{'similarity', 'master_pi', 'similar_pi'}`` dicts. Progress is printed
    to stdout throughout.

    All parameters default to the values that used to be hard-coded, so
    calling ``cluster()`` without arguments behaves as before.

    Args:
        feature_glob: glob pattern selecting the feature-vector files.
        dims: length of each feature vector; passed to ``AnnoyIndex``.
        n_nearest_neighbors: number of neighbours requested per item.
        trees: number of trees passed to ``AnnoyIndex.build``.
        output_path: destination of the JSON result file.

    Returns:
        None. The result is the JSON file written to *output_path*.
    """
    start_time = time.time()

    def minutes_passed():
        # Wall-clock minutes since this run started, for the progress lines.
        return (time.time() - start_time) / 60

    print("---------------------------------")
    print("Step.1 - ANNOY index generation - Started at %s" % time.ctime())
    print("---------------------------------")

    # Lookup tables keyed by the integer id each file receives in the index
    file_index_to_file_name = {}
    file_index_to_file_vector = {}
    file_index_to_product_id = {}

    # Reads all file names which store feature vectors
    allfiles = glob.glob(feature_glob)

    t = AnnoyIndex(dims, metric='angular')

    for file_index, vector_path in enumerate(allfiles):
        # NOTE(review): np.loadtxt parses text, so the matched files are
        # presumably plain-text vectors despite the .npz extension - confirm
        # against the script that produces them.
        file_vector = np.loadtxt(vector_path)

        # Everything before the first dot of the base name is the image name
        file_name = os.path.basename(vector_path).split('.')[0]
        file_index_to_file_name[file_index] = file_name
        file_index_to_file_vector[file_index] = file_vector
        # match_id returns the product id for this image name
        file_index_to_product_id[file_index] = match_id(file_name)

        # Adds image feature vectors into annoy index
        t.add_item(file_index, file_vector)

        print("---------------------------------")
        print("Annoy index : %s" % file_index)
        print("Image file name : %s" % file_name)
        print("Product id : %s" % file_index_to_product_id[file_index])
        print("--- %.2f minutes passed ---------" % minutes_passed())

    # Builds annoy index
    t.build(trees)

    print("Step.1 - ANNOY index generation - Finished")
    print("Step.2 - Similarity score calculation - Started ")

    named_nearest_neighbors = []

    # Loops through all indexed items
    for i, master_file_name in file_index_to_file_name.items():
        master_vector = file_index_to_file_vector[i]
        master_product_id = file_index_to_product_id[i]

        # Calculates the nearest neighbors of the master item
        nearest_neighbors = t.get_nns_by_item(i, n_nearest_neighbors)

        # Loops through the nearest neighbors of the master item
        for j in nearest_neighbors:
            print(j)

            neighbor_file_vector = file_index_to_file_vector[j]
            neighbor_product_id = file_index_to_product_id[j]

            # Cosine similarity = 1 - cosine distance; the int() cast then
            # truncates (does not round) the score to four decimal places.
            similarity = 1 - spatial.distance.cosine(master_vector, neighbor_file_vector)
            rounded_similarity = int(similarity * 10000) / 10000.0

            # Appends master product id with the similarity score
            # and the product id of the similar item
            named_nearest_neighbors.append({
                'similarity': rounded_similarity,
                'master_pi': master_product_id,
                'similar_pi': neighbor_product_id})

        print("---------------------------------")
        print("Similarity index : %s" % i)
        print("Master Image file name : %s" % master_file_name)
        print("Nearest Neighbors. : %s" % nearest_neighbors)
        print("--- %.2f minutes passed ---------" % minutes_passed())

    print("Step.2 - Similarity score calculation - Finished ")

    # Writes the 'named_nearest_neighbors' to a json file
    with open(output_path, 'w') as out:
        json.dump(named_nearest_neighbors, out)

    print("Step.3 - Data stored in '%s' file " % output_path)
    print("--- Prosess completed in %.2f minutes ---------" % minutes_passed())
# Module-level call: the whole indexing and similarity export runs as soon as
# this file is executed (or imported).
cluster()