# Creating vector representations of images

## Install and import the required libraries

In [None]:
!pip install sentence_transformers elasticsearch

In [None]:
import getpass
import torch
import os
import torchvision.transforms as transforms
import json
from PIL import Image
from sentence_transformers import SentenceTransformer
from elasticsearch import Elasticsearch, helpers

## Download sample photos

In [None]:
!curl -LJO https://raw.githubusercontent.com/PacktPublishing/Vector-Search-with-Elastic/main/chapter5/images/images.tar
!tar xvf /content/images.tar

In [None]:
# Name of the Elasticsearch index that will hold the image vectors
index_name = 'images_book_demo'

# Folder containing the images to embed and index
image_dir = '/content/images/index'

# Ask for the Elastic Cloud credentials interactively (input is not echoed)
es_cloud_id = getpass.getpass('Enter Elastic Cloud ID:  ')
es_api_key = getpass.getpass('Enter cluster API key:  ')

# Open the client connection to the cluster
es = Elasticsearch(cloud_id=es_cloud_id, api_key=es_api_key)

# Sanity check: should return cluster info
es.info()


In [None]:
# Fetch (on first use) and load the pretrained embedding model.
# NOTE(review): the sentence-transformers documentation appears to describe
# 'clip-ViT-B-32-multilingual-v1' as the multilingual *text* encoder that is
# paired with 'clip-ViT-B-32' for images - confirm that feeding image tensors
# to this checkpoint is intended.
model = SentenceTransformer('clip-ViT-B-32-multilingual-v1')

# Preprocessing parameters shared by every image
_IMAGE_SIZE = 224
_CHANNEL_MEAN = (0.5, 0.5, 0.5)
_CHANNEL_STD = (0.5, 0.5, 0.5)


def _to_rgb(image):
    """Return *image* converted to RGB mode."""
    return image.convert("RGB")


# Preprocessing pipeline applied to each PIL image, in order:
# resize, centre-crop to a square, force RGB, convert to a tensor and
# normalise each channel with the mean/std above.
transform = transforms.Compose([
    transforms.Resize(_IMAGE_SIZE),
    transforms.CenterCrop(_IMAGE_SIZE),
    _to_rgb,
    transforms.ToTensor(),
    transforms.Normalize(_CHANNEL_MEAN, _CHANNEL_STD),
])

In [None]:
def create_mapping_if_new(index_name, es):
    """Create ``index_name`` with the image-vector mapping unless it exists.

    The mapping declares two fields:

    * ``image_vector`` - an indexed, 512-dimension ``dense_vector`` using
      cosine similarity.
    * ``filename`` - a ``keyword`` field.

    Args:
        index_name: Name of the index to look for and, if missing, create.
        es: Elasticsearch client. Only ``es.indices.exists`` and
            ``es.indices.create`` are called on it.

    Returns:
        None. An index that already exists is left untouched; its mapping
        is neither compared with nor updated to the one defined here.
    """
    mappings = {
        "properties": {
            "image_vector": {
                "type": "dense_vector",
                "dims": 512,
                "index": True,
                "similarity": "cosine",
            },
            "filename": {
                "type": "keyword",
            },
        },
    }

    # Nothing to do when the index is already there
    if es.indices.exists(index=index_name):
        return

    # Use the dedicated ``mappings`` keyword, consistent with the keyword-style
    # client calls used elsewhere in this notebook; the catch-all ``body``
    # argument is deprecated in the 8.x Python client.
    es.indices.create(index=index_name, mappings=mappings)

def embed_image(image_path):
    """Embed the image stored at ``image_path`` with the notebook's model.

    The file is opened with PIL, run through the module-level ``transform``
    pipeline, given a leading batch axis and handed to ``model.encode``.
    When CUDA is available, both the input tensor and the model are moved
    to the GPU first.

    Args:
        image_path: Path of the image file to embed.

    Returns:
        The output of ``model.encode`` converted to plain Python lists via
        ``tolist()``. Callers in this notebook use element ``[0]`` of it.
    """
    with Image.open(image_path) as picture:
        # Preprocess and add the batch dimension expected for a single image
        batch = transform(picture).unsqueeze(0)

        use_gpu = torch.cuda.is_available()
        if use_gpu:
            batch = batch.to('cuda')
            model.to('cuda')

        embedding = model.encode(batch)

    # encode may hand back a torch tensor; bring it to the CPU as an array
    # so that tolist() below works the same way in both cases
    if isinstance(embedding, torch.Tensor):
        embedding = embedding.cpu().numpy()

    return embedding.tolist()


In [None]:
# Make sure the target index exists (created with the vector mapping if not)
create_mapping_if_new(index_name, es)

# Map every file name in the image directory to its embedding, keeping the
# first entry of what embed_image returns
data = {
    image_file: embed_image(os.path.join(image_dir, image_file))[0]
    for image_file in os.listdir(image_dir)
}

# Turn each (file name, vector) pair into an action for the bulk helper
documents = [
    {
        '_index': index_name,
        '_source': {
            "filename": filename,
            "image_vector": vector,
        },
    }
    for filename, vector in data.items()
]


In [None]:
from elasticsearch.helpers import BulkIndexError

# Send all prepared documents in one bulk request. If some of them are
# rejected, print each reported error instead of aborting the notebook.
try:
    helpers.bulk(es, documents)
except BulkIndexError as bulk_failure:
    for item_error in bulk_failure.errors:
        print(item_error)

# kNN Search

Generate a vector for the search image

In [None]:
# Path of the query image (taken from the "search" folder, not the indexed one)
search_image = '/content/images/search/patrice-bouchard-Yu_ejF2s_dI-unsplash.jpg'
# Embed it with the same function that produced the indexed vectors
search_image_vector = embed_image(search_image)

Perform a kNN vector search

In [None]:
# kNN clause handed to es.search: compare the query embedding against the
# "image_vector" field defined in the index mapping
knn = {
    "field": "image_vector",
    # First entry of the embed_image result, mirroring how the indexed
    # vectors were stored
    "query_vector": search_image_vector[0],
    # Neighbour count and candidate pool size, passed to Elasticsearch as-is
    "k": 1,
    "num_candidates": 10
  }
# Ask only for the filename field of a single hit; _source is switched off,
# so the value is later read from the hit's "fields" section
fields = ["filename"]
size = 1
source = False

In [None]:
# Run the kNN search against the image index using the parameters
# prepared in the previous cell
results = es.search(index=index_name,
                    knn=knn,
                    source=source,
                    fields=fields,
                    size=size
                  )


In [None]:
# File name of the first hit; requested fields come back as lists, hence the
# trailing [0]. Raises IndexError when the search returned no hits.
result_filename = results['hits']['hits'][0]['fields']['filename'][0]

## Display top hit

In [None]:
# Import IPython's image display class under an alias. A plain
# `from IPython.display import Image` would rebind the global `Image` that
# embed_image uses as PIL's module, so re-running any embedding cell after
# this one would fail.
from IPython.display import Image as DisplayImage

# Show the best match, resolved against the indexed image directory instead
# of a second hard-coded copy of that path
DisplayImage(os.path.join(image_dir, result_filename), width=400)
