# Color feature inspection
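Check how often the correct match for a query image appears among the top-ranked candidates (the top 1% of stored images by Jaccard score over color-cluster labels).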

In [18]:
import os
import cv2
import pickle
import numpy as np
from glob import glob
from sklearn.cluster import MiniBatchKMeans, KMeans
from tqdm.notebook import tqdm
from IPython.display import clear_output

In [4]:
# Load the precomputed artifacts from the '20 Jun' saved_data directory.
# NOTE(review): pickle.load can execute arbitrary code from a malicious file;
# only load these files from a trusted source.

# File color map: its keys are used later as image paths and its values are
# compared against the color labels predicted for a query image.
with open('../../saved_data/20 Jun/file_color_map.pkl', 'rb') as f:
    file_color_map = pickle.load(f)
    
# Clustering model; only its .predict() method is used later.
# NOTE(review): presumably a fitted KMeans over color centers -- confirm.
with open('../../saved_data/20 Jun/kmeans_clt.pkl', 'rb') as f:
    kmeans_clt = pickle.load(f) 

In [19]:
class ProductFinder(object):
    """Rank stored images against a query image by shared color-cluster labels.

    A query image is reduced to ``n_colors`` dominant colors with
    ``MiniBatchKMeans``; those colors are mapped to labels with
    ``kmeans_clt.predict`` and compared, via the Jaccard score, with the
    label collection stored for every entry of ``file_color_map``.

    Args:
        n_colors: Number of clusters fitted on the pixels of a query image.
        kmeans_clt: Object exposing ``predict``; maps color centers to labels.
        file_color_map: Mapping from image path to an iterable of labels.
        top_k: Stored on the instance; not consulted by any method here.
        candidate_fraction: Fraction of ``file_color_map`` entries returned
            as candidates (at least one entry is always kept).
    """

    def __init__(self, n_colors, kmeans_clt, file_color_map, top_k=3,
                 candidate_fraction=0.01):
        self.n_colors = n_colors
        self.kmeans_clt = kmeans_clt
        self.file_color_map = file_color_map
        self.top_k = top_k
        self.candidate_fraction = candidate_fraction

    def jaccard_score(self, list_1, list_2):
        """Return |A & B| / |A | B| for the two label collections.

        Duplicates are ignored. Two empty collections score ``0.0`` instead
        of raising ``ZeroDivisionError``.
        """
        set_1 = set(list_1)
        union = set_1.union(list_2)
        if not union:
            return 0.0
        return len(set_1.intersection(list_2)) / len(union)

    def _select_top(self, scores):
        """Return indices of the best-scoring entries, in ascending score order.

        Keeps ``candidate_fraction`` of the entries but never fewer than one:
        a count of zero would turn the ``[-n:]`` slice into ``[-0:]``, which
        selects the whole list instead of nothing.
        """
        n_keep = max(1, int(self.candidate_fraction * len(scores)))
        order = sorted(range(len(scores)), key=scores.__getitem__)
        return order[-n_keep:]

    def find_candidates(self, img):
        """Return ``(indices, paths)`` of the stored images closest to ``img``.

        Args:
            img: Array whose first two axes are flattened into one pixel axis
                before clustering.

        Returns:
            A tuple ``(candidates, candidate_paths)``: positions within
            ``file_color_map`` (iteration order) and the matching keys, both
            ordered from lowest to highest Jaccard score.
        """
        pixels = img.reshape(img.shape[0] * img.shape[1], -1)
        mb_kmeans = MiniBatchKMeans(n_clusters=self.n_colors)
        mb_kmeans.fit(pixels)
        labels = self.kmeans_clt.predict(mb_kmeans.cluster_centers_)

        paths = list(self.file_color_map)
        scores = [
            self.jaccard_score(labels, self.file_color_map[p]) for p in paths
        ]
        candidates = self._select_top(scores)
        candidate_paths = [paths[i] for i in candidates]
        return candidates, candidate_paths

    def check_presence(self, root_dir):
        """Print the percentage of images that appear in their own candidates.

        Every ``*.jpg`` in each sub-folder of ``root_dir`` (except
        ``test_images``) is used as a query; it counts as found when its path
        is among the paths returned by ``find_candidates``. Files that cannot
        be decoded are skipped and reported.
        """
        found = 0
        num_files = 0
        unreadable = 0
        # Filter instead of list.remove(): remove() raises ValueError when
        # the 'test_images' folder is absent.
        folders = [f for f in os.listdir(root_dir) if f != 'test_images']
        for fol in folders:
            # Plain concatenation is kept on purpose: the resulting path
            # string is compared verbatim with the keys of file_color_map.
            for path in tqdm(glob(root_dir+'/'+fol+'/*.jpg')):
                img = cv2.imread(path)
                if img is None:
                    # imread signals an unreadable file by returning None.
                    unreadable += 1
                    continue
                num_files += 1
                img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
                _, cnd_paths = self.find_candidates(img)
                if path in cnd_paths:
                    found += 1

        clear_output()
        if unreadable:
            print("Skipped unreadable files: {}".format(unreadable))
        if num_files == 0:
            print("No readable .jpg files found under {!r}".format(root_dir))
            return
        found_percent = 100.*found/num_files
        print("Found percent: {:.2f}".format(found_percent))

In [20]:
# Evaluate retrieval: each query image is reduced to 10 color clusters, then
# every .jpg under the sub-folders of ../../data is checked against its own
# candidate list and the hit percentage is printed.
pf = ProductFinder(n_colors=10, kmeans_clt=kmeans_clt, file_color_map=file_color_map)
pf.check_presence('../../data')

Found percent: 31.25
