# Calculate Cosine Similarity

### 0. Make necessary imports

In [None]:
from tensorflow.keras.losses import cosine_similarity
import numpy as np
import time
import json

In [None]:
%matplotlib inline

### 1. Load cached encoding(s)

In [None]:
# Read the previously cached encodings array back from disk.
encodings_path = 'encodings.npy'
encodings = np.load(encodings_path)

### 2. Calculate cosine similarity of all the encoding(s) with each other

**NOTE:** *This cell can take more than an hour to execute*

In [None]:
# Build the full pairwise similarity matrix of the encodings.
#
# The Keras `cosine_similarity` loss returns the NEGATIVE cosine similarity,
# i.e. -sum(l2_normalize(a) * l2_normalize(b)), so -1 means "same direction".
# The same quantity for every (i, j) pair is obtained in one shot by
# L2-normalising each row and taking a single matrix product, which avoids
# N*N separate Python-level calls. Values may differ from the per-pair calls
# only by floating-point rounding.
if encodings.ndim != 2:
    raise ValueError(
        f"Expected encodings of shape (num_items, dim), got {encodings.shape}"
    )

start = time.time()

# Squared L2 norm of every row, kept as a column so it broadcasts over the row.
squared_norms = np.sum(np.square(encodings), axis=1, keepdims=True)

# Clamp before the square root so an all-zero encoding yields 0 similarity
# instead of a division by zero (same guard l2_normalize uses, epsilon=1e-12).
unit_encodings = encodings / np.sqrt(np.maximum(squared_norms, 1e-12))

# Negated to keep the sign convention of the Keras loss used previously.
similarity_matrix = -(unit_encodings @ unit_encodings.T)

stop = time.time()

# `similarity_matrix` is now an ndarray, so `.shape` is available here
# (the previous list-of-lists raised AttributeError on this line).
assert similarity_matrix.shape == (encodings.shape[0], encodings.shape[0]), f"Incorrect shape {similarity_matrix.shape}"

print(f'It took {round((stop-start)/60, 2)} minutes to calculate similarity matrix.')

### 3. Save the similarity matrix 

In [None]:
# Persist the pairwise similarity matrix in NumPy's binary .npy format.
np.save(file='cosine_similarity_matrix.npy', arr=similarity_matrix)

### 4. Create a human-readable format of the above info (sorted)

In [None]:
# Pair every score in a row with its column index (stored as a string;
# presumably the image number) and cast the score to a plain Python float.
# Each row becomes a list of [index, score] pairs.
tagged_sim_mat = []
for scores in similarity_matrix:
    tagged_row = []
    for image_idx, score in enumerate(scores):
        tagged_row.append([str(image_idx), float(score)])
    tagged_sim_mat.append(tagged_row)

# Order each row by ascending score. The scores come from the negated cosine
# similarity, so the lowest value (-1) is the most similar entry and comes
# first. The sort is stable, so ties keep their original index order.
sorted_tagged_sim_mat = []
for tagged_row in tagged_sim_mat:
    ordered_row = list(tagged_row)
    ordered_row.sort(key=lambda pair: pair[1])
    sorted_tagged_sim_mat.append(ordered_row)

### 5. Save the sorted similarity matrix in JSON format

In [None]:
# Write the sorted [index, score] rows out as a single JSON document.
with open('sim_mat_sorted.json', mode='w') as json_out:
    json.dump(obj=sorted_tagged_sim_mat, fp=json_out)