# Поиск похожих изображений по картинке

In [51]:
import cv2
import torch
import torchvision.models as models
from torchvision.models import VGG16_Weights, Inception_V3_Weights
from transformers import AutoImageProcessor, AutoModel

# Run the models on CUDA when torch reports it as available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else "cpu")

# Directory that is scanned for images; "<IMAGES_FOLDER>_vectors" holds the pickled embeddings.
IMAGES_FOLDER = "images_19206"

# Labels identifying each embedding method. They are offered in the Gradio dropdown
# and compared against the `method` argument of find_similar_images.
INCEPTION_OPTION = "INCEPTION MODEL EMBEDDINGS"
DINO_OPTION = "DINO MODEL EMBEDDINGS"
VGG_OPTION = "VGG-16 MODEL EMBEDDINGS"
HOG_OPTION = "HISTOGRAM OF GRADIENTS EMBEDDINGS"
COLOR_HIST_OPTION = "COLOR HISTOGRAM EMBEDDINGS"
SIFT_OPTION = "SIFT EMBEDDINGS"

In [52]:
# Load the pretrained 'facebook/dinov2-small' preprocessor and model; the model is moved to `device`.
dino_processor = AutoImageProcessor.from_pretrained('facebook/dinov2-small')
dino_model = AutoModel.from_pretrained('facebook/dinov2-small').to(device)

In [53]:
# VGG-16 with its default pretrained weights, used as a feature extractor.
vgg_model = models.vgg16(weights=VGG16_Weights.DEFAULT)
# Keep only the first layer of the classifier head, so the forward pass returns that
# layer's output instead of class scores (presumably a 4096-d vector - TODO confirm).
vgg_model.classifier = vgg_model.classifier[0]
vgg_model = vgg_model.to(device)

In [None]:
# Inception v3 with its default pretrained weights, used as a feature extractor.
inception_model = models.inception_v3(weights=Inception_V3_Weights.DEFAULT)
# Replace the final fully connected layer with a pass-through, so the model
# returns the features that would otherwise be fed into `fc`.
inception_model.fc = torch.nn.Identity()
inception_model = inception_model.to(device)

In [55]:
from PIL import Image
import os
import numpy as np
import pandas as pd
import tqdm as tqdm
from torchvision import transforms


def calculate_embeddings(calculate_embedding_method, output_file_name, images_path=IMAGES_FOLDER):
    """Embed every image in a directory and pickle the result as a DataFrame.

    Args:
        calculate_embedding_method: Callable that takes an image as loaded by
            ``cv2.imread`` and returns its embedding.
        output_file_name: Path of the pickle file to write.
        images_path: Directory whose entries are loaded as images.

    The pickled DataFrame has the columns ``img_path`` and ``vector`` and one
    row per image that could be read. Entries that ``cv2.imread`` cannot
    decode are reported and skipped instead of being passed on as ``None``.
    """
    img_paths = []
    vectors = []

    # Iterating through tqdm keeps the progress bar and closes it when the loop ends.
    for file_name in tqdm.tqdm(os.listdir(images_path)):
        img_path = os.path.join(images_path, file_name)
        image = cv2.imread(img_path)
        if image is None:
            # cv2.imread returns None rather than raising for unreadable files.
            print(f"Skipping unreadable image: {img_path}")
            continue

        img_paths.append(img_path)
        vectors.append(calculate_embedding_method(image))

    # Build the frame once; an explicit object Series keeps each embedding
    # (array or None) as a single cell regardless of its shape.
    df = pd.DataFrame({
        "img_path": img_paths,
        "vector": pd.Series(vectors, dtype=object),
    })
    df.to_pickle(output_file_name)

        
def sift_descriptors(img):
    """Return the SIFT descriptors of a BGR image.

    The image is resized to 512x512 and converted to grayscale before
    detection. The return value is the descriptor output of
    ``detectAndCompute`` unchanged (presumably ``None`` when no keypoints
    are found - verify against the OpenCV version in use).
    """
    resized = cv2.resize(img, (512, 512))
    grayscale = cv2.cvtColor(resized, cv2.COLOR_BGR2GRAY)

    detector = cv2.SIFT_create()
    # detectAndCompute returns (keypoints, descriptors); only descriptors are kept.
    descriptors = detector.detectAndCompute(grayscale, None)[1]
    return descriptors


def color_histogram(img):
    """Return concatenated, L2-normalised per-channel histograms of a BGR image.

    The image is resized to 512x512 and converted to RGB. For each of the
    three channels a 256-bin histogram is computed and divided by its L2
    norm; the three histograms are concatenated in R, G, B order.

    The bin range is fixed to [0, 256) so that bin ``k`` covers the same
    intensities for every image. Without an explicit range ``np.histogram``
    spans each channel's own min/max, which makes histograms of different
    images incomparable. Vectors stored before this fix must be recomputed.
    Assumes 8-bit images as returned by ``cv2.imread`` - TODO confirm.
    """
    img = cv2.resize(img, (512, 512))
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    channel_histograms = []
    for channel_id in range(3):
        counts, _ = np.histogram(img[:, :, channel_id], bins=256, range=(0, 256))
        # The resized image always has pixels in range, so the norm is non-zero.
        channel_histograms.append(counts / np.linalg.norm(counts))
    return np.concatenate(channel_histograms)


def hog(img):
    """Return a 256-bin, L2-normalised histogram of gradient orientations.

    The BGR image is resized to 512x512 and converted to grayscale. Sobel
    derivatives (kernel size 5) in x and y are converted to magnitude and
    angle, and the angles are binned over [0, 2*pi) weighted by magnitude.

    An image without any gradient (e.g. a uniform image) produces an
    all-zero histogram; it is returned as-is instead of being divided by a
    zero norm, which would yield a vector of NaNs.
    """
    img = cv2.resize(img, (512, 512))
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    sobel_x = cv2.Sobel(gray, cv2.CV_64F, 1, 0, ksize=5)
    sobel_y = cv2.Sobel(gray, cv2.CV_64F, 0, 1, ksize=5)
    magnitude, theta = cv2.cartToPolar(sobel_x, sobel_y)
    hist, _ = np.histogram(
        theta.flatten(), bins=256, range=(0, 2 * np.pi), weights=magnitude.flatten()
    )

    norm = np.linalg.norm(hist)
    if norm == 0:
        return hist
    return hist / norm


def dino_embedding(img):
    """Return an L2-normalised embedding of a BGR image from the DINO model.

    The image is resized to 512x512, converted to RGB, run through
    ``dino_processor`` and ``dino_model``, and the model's
    ``last_hidden_state`` is averaged over dimension 1.
    """
    rgb = cv2.cvtColor(cv2.resize(img, (512, 512)), cv2.COLOR_BGR2RGB)

    with torch.no_grad():
        model_inputs = dino_processor(images=rgb, return_tensors="pt").to(device)
        hidden_states = dino_model(**model_inputs).last_hidden_state

    # Mean-pool over dim 1, then move the result to a NumPy array on the CPU.
    pooled = hidden_states.mean(dim=1).squeeze().cpu().detach().numpy()
    norm = np.linalg.norm(pooled)
    return np.float32(pooled) / norm


def vgg_16_embedding(img):
    """Return an L2-normalised embedding of a BGR image from ``vgg_model``.

    The image is converted to an RGB PIL image, resized and centre-cropped
    to 214 pixels, turned into a tensor, normalised with the given
    per-channel mean/std, and passed through the model in eval mode.
    """
    pil_image = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))

    # NOTE(review): 214 is kept as-is; torchvision's VGG-16 presets usually
    # crop to 224 - confirm whether 214 is intentional before changing it,
    # since stored vectors depend on this value.
    to_model_input = transforms.Compose([
        transforms.Resize(214),
        transforms.CenterCrop(214),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])
    # Add a leading batch dimension of size 1 and move to the model's device.
    batch = to_model_input(pil_image).unsqueeze(0).to(device)

    vgg_model.eval()
    with torch.no_grad():
        features = vgg_model(batch)

    vector = features.squeeze().cpu().detach().numpy()
    norm = np.linalg.norm(vector)
    return np.float32(vector) / norm


def inception_embedding(img):
    """Return an L2-normalised embedding of a BGR image from ``inception_model``.

    The image is converted to an RGB PIL image, resized and centre-cropped
    to 299 pixels, turned into a tensor, normalised with the given
    per-channel mean/std, and passed through the model in eval mode.
    """
    pil_image = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))

    to_model_input = transforms.Compose([
        transforms.Resize(299),
        transforms.CenterCrop(299),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])
    # Add a leading batch dimension of size 1 and move to the model's device.
    batch = to_model_input(pil_image).unsqueeze(0).to(device)

    inception_model.eval()
    with torch.no_grad():
        features = inception_model(batch)

    vector = features.squeeze().cpu().detach().numpy()
    norm = np.linalg.norm(vector)
    return np.float32(vector) / norm

In [56]:
# RUN ONLY 1 TIME:
# Builds the pickled embedding files that find_similar_images later reads from
# "<IMAGES_FOLDER>_vectors". Each call walks the whole image folder.
if not os.path.isdir(f"{IMAGES_FOLDER}_vectors"):
    os.makedirs(f"{IMAGES_FOLDER}_vectors")

# Classic computer vision methods (currently disabled; uncomment to (re)build them):
# calculate_embeddings(sift_descriptors, f"{IMAGES_FOLDER}_vectors/sift_vectors.pkl")
# calculate_embeddings(color_histogram, f"{IMAGES_FOLDER}_vectors/color_histogram_vectors.pkl")
# calculate_embeddings(hog, f"{IMAGES_FOLDER}_vectors/hog_vectors.pkl")

# Deep learning methods:
calculate_embeddings(dino_embedding, f"{IMAGES_FOLDER}_vectors/dino_vectors.pkl")
calculate_embeddings(vgg_16_embedding, f"{IMAGES_FOLDER}_vectors/vgg_16_vectors.pkl")
calculate_embeddings(inception_embedding, f"{IMAGES_FOLDER}_vectors/inception_vectors.pkl")


  0%|          | 0/29553 [00:00<?, ?it/s][A
  0%|          | 3/29553 [00:00<17:04, 28.85it/s][A
  0%|          | 6/29553 [00:00<19:41, 25.00it/s][A
  0%|          | 10/29553 [00:00<17:17, 28.48it/s][A
  0%|          | 14/29553 [00:00<16:20, 30.12it/s][A
  0%|          | 18/29553 [00:00<15:16, 32.23it/s][A
  0%|          | 22/29553 [00:00<14:18, 34.39it/s][A
  0%|          | 26/29553 [00:00<14:27, 34.03it/s][A
  0%|          | 30/29553 [00:00<14:45, 33.34it/s][A
  0%|          | 34/29553 [00:01<14:07, 34.84it/s][A
  0%|          | 38/29553 [00:01<13:51, 35.50it/s][A
  0%|          | 42/29553 [00:01<13:57, 35.25it/s][A
  0%|          | 46/29553 [00:01<14:05, 34.92it/s][A
  0%|          | 50/29553 [00:01<14:02, 35.00it/s][A
  0%|          | 54/29553 [00:01<14:15, 34.48it/s][A
  0%|          | 59/29553 [00:01<13:31, 36.36it/s][A
  0%|          | 63/29553 [00:01<13:22, 36.74it/s][A
  0%|          | 67/29553 [00:01<13:05, 37.53it/s][A
  0%|          | 71/29553 [00:02<12:54

In [79]:
def find_similar_images(image, method=SIFT_OPTION):
    """Return up to three stored images most similar to ``image``.

    Args:
        image: Query image in BGR channel order (the embedding helpers
            convert from BGR, matching what ``cv2.imread`` produces).
        method: One of the ``*_OPTION`` constants selecting the embedding.

    Returns:
        A list of RGB images, best match first. The list has fewer than
        three entries when fewer candidates qualify or a stored path can no
        longer be read.

    Raises:
        ValueError: If ``method`` is not a known option.

    Ranking rules:
        * SIFT: number of cross-checked brute-force matches, more is better;
          images with zero matches are never returned.
        * DINO / VGG / Inception: ``1 - dot(query, vector)`` on the stored
          normalised vectors; distances below a per-method threshold are
          treated as duplicates of the query and demoted to distance 1.
        * Other methods: Euclidean distance.
        Ties keep the order of the stored index.
    """
    # Method label -> (pickle file inside the vectors directory, embedding function).
    sources = {
        SIFT_OPTION: ("sift_vectors.pkl", sift_descriptors),
        COLOR_HIST_OPTION: ("color_histogram_vectors.pkl", color_histogram),
        HOG_OPTION: ("hog_vectors.pkl", hog),
        DINO_OPTION: ("dino_vectors.pkl", dino_embedding),
        VGG_OPTION: ("vgg_16_vectors.pkl", vgg_16_embedding),
        INCEPTION_OPTION: ("inception_vectors.pkl", inception_embedding),
    }
    if method not in sources:
        raise ValueError("Unknown method")
    file_name, embed = sources[method]

    # NOTE: pickle files are only safe to load when they were produced locally.
    images_embeddings = pd.read_pickle(f"{IMAGES_FOLDER}_vectors/{file_name}")
    query_embedding = embed(image)

    all_img_paths = images_embeddings["img_path"]
    all_embeddings = images_embeddings["vector"]

    # (sort_key, path) pairs; a smaller key always means a better match.
    scored = []

    if method == SIFT_OPTION:
        # Keypoint matching. A query without descriptors cannot match anything.
        if query_embedding is None:
            return []
        bf = cv2.BFMatcher(cv2.NORM_L2, crossCheck=True)
        for img_path, vector in zip(all_img_paths, all_embeddings):
            # Rows stored without descriptors cannot be matched.
            if not isinstance(vector, np.ndarray) or len(vector) == 0:
                continue
            num_matches = len(bf.match(query_embedding, vector))
            if num_matches > 0:
                scored.append((-num_matches, img_path))
    else:
        # Distance comparison.
        uses_cosine = method in (DINO_OPTION, VGG_OPTION, INCEPTION_OPTION)
        # Below this distance the stored image is considered the same as the
        # query and therefore not interesting.
        duplicate_threshold = 0.03 if method == DINO_OPTION else 0.1
        for img_path, vector in zip(all_img_paths, all_embeddings):
            if uses_cosine:
                distance = 1 - np.dot(query_embedding, vector)
                if distance < duplicate_threshold:
                    distance = 1
            else:
                distance = np.linalg.norm(query_embedding - vector)
            # NaN/inf distances (e.g. from degenerate vectors) never qualify.
            if np.isfinite(distance):
                scored.append((distance, img_path))

    # list.sort is stable, so earlier index entries win ties.
    scored.sort(key=lambda pair: pair[0])

    similar_images = []
    for _, img_path in scored[:3]:
        bgr = cv2.imread(img_path)
        if bgr is None:
            # The indexed file is missing or unreadable; leave it out.
            continue
        similar_images.append(cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB))
    return similar_images

In [None]:
import gradio as gr


def get_similar_images(text, image, method):
    """Gradio callback: return images similar to the uploaded ``image``.

    Args:
        text: Value of the text input; not used.
        image: Uploaded image as delivered by Gradio's image input (an RGB
            array per the Gradio documentation), or ``None`` if nothing was
            uploaded.
        method: Selected embedding method label.

    Returns:
        The list of RGB images produced by ``find_similar_images``; an empty
        list when no image was supplied.
    """
    if image is None:
        return []

    # The embedding helpers convert from BGR (the cv2.imread layout used when
    # the index was built), so the RGB upload is reordered to BGR first.
    bgr_query = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)

    # find_similar_images already returns RGB images, which is what the
    # gallery displays; no further colour conversion is needed.
    return find_similar_images(bgr_query, method)

# Build and start the Gradio UI: a text box, an image upload and a dropdown of
# embedding methods as inputs, and a three-column gallery as output.
# The inputs are passed to get_similar_images in this order.
(gr.Interface(get_similar_images,
              inputs = [
                  "text",
                  "image",
                  gr.Dropdown(choices=
                              [DINO_OPTION, VGG_OPTION, INCEPTION_OPTION, HOG_OPTION, COLOR_HIST_OPTION, SIFT_OPTION])],
              outputs = gr.Gallery(label="Similar Images", columns=3))
 .launch(debug=True))

* Running on local URL:  http://127.0.0.1:7861

To create a public link, set `share=True` in `launch()`.


0.03821629285812378
0.04520446062088013
0.24627482891082764
0.06634372472763062
0.16980427503585815
0.2153591513633728
0.16022074222564697
0.16980773210525513
0.18505454063415527


  return F.conv2d(input, weight, bias, self.stride,


0.19394457340240479
0.20968914031982422
0.21604537963867188
