## 1. Python librairies

In [2]:
import datetime
import io
import json
import math
import os
import requests
import sys
import time

from dotenv import load_dotenv
from io import BytesIO
from PIL import Image

from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes import SearchIndexerClient
from azure.search.documents.indexes.models import (
    PrioritizedFields,
    SearchableField,
    SearchField,
    SearchFieldDataType,
    SearchIndex,
    SearchIndexerDataContainer,
    SearchIndexerDataSourceConnection,
    SemanticConfiguration,
    SemanticField,
    SemanticSettings,
    SimpleField,
    VectorSearch,
    VectorSearchAlgorithmConfiguration,
)
from azure.storage.blob import BlobServiceClient

## 2. Azure AI Services

In [4]:
load_dotenv("azure.env")

# Azure Computer Vision 4
acv_key = os.getenv("acv_key")
acv_endpoint = os.getenv("acv_endpoint")

# Azure Cognitive Search
acs_endpoint = os.getenv("acs_endpoint")
acs_key = os.getenv("acs_key")

blob_connection_string = os.getenv("blob_connection_string")
container_name = os.getenv("container_name")

In [5]:
# Ensure that the azure endpoints should not finished a /
if acv_endpoint.endswith("/"):
    acv_endpoint = acv_endpoint[:-1]

if acs_endpoint.endswith("/"):
    acs_endpoint = acv_endpoint[:-1]

In [6]:
index_name = "fashion-demo"

api_version = "2023-02-01-preview"

## 3. Connect to Blob Storage


In [8]:
blob_service_client = BlobServiceClient.from_connection_string(blob_connection_string)
container_client = blob_service_client.get_container_client(container_name)
blobs = container_client.list_blobs()

first_blob = next(blobs)
blob_url = container_client.get_blob_client(first_blob).url

## 4. Connect your Blob Storage to a data source in Azure Cognitive Search

In [10]:
ds_client = SearchIndexerClient(acs_endpoint, AzureKeyCredential(acs_key))
container = SearchIndexerDataContainer(name=container_name)
data_source_connection = SearchIndexerDataSourceConnection(
    name=f"{index_name}-blob",
    type="azureblob",
    connection_string=blob_connection_string,
    container=container,
)
data_source = ds_client.create_or_update_data_source_connection(data_source_connection)

## 5. Our fashion images

In [11]:
blob_service_client = BlobServiceClient.from_connection_string(blob_connection_string)
container_client = blob_service_client.get_container_client(container_name)
number_images = len(list(container_client.list_blobs()))

In [12]:
def describe_image(image_file):
    options = "&features=tags,caption"
    model = "?api-version=" + api_version + "&modelVersion=latest"
    url = acv_endpoint + "/computervision/imageanalysis:analyze" + model + options

    headers_cv = {
        "Content-type": "application/octet-stream",
        "Ocp-Apim-Subscription-Key": acv_key,
    }

    blob_client = container_client.get_blob_client(image_file)
    blob_image = blob_client.download_blob().readall()

    image_stream = io.BytesIO(blob_image)
    image = Image.open(image_stream)

    image_bytes = io.BytesIO()
    image.save(image_bytes, format="JPEG")
    image_bytes.seek(0)

    response = requests.post(url, data=image_bytes, headers=headers_cv)

    if response.status_code == 200:
        results = response.json()
        print("Automatic analysis of the image using Azure Computer Vision 4.0:")
        print("\033[1;31;34m")
        print("Main caption:")
        caption = results["captionResult"]["text"]
        confidence = results["captionResult"]["confidence"]
        print(f"{caption} = {confidence:.3f}")

        print("\033[1;31;32m")
        print("Detected tags:")
        tags = results["tagsResult"]["values"]

        for tag in tags:
            name = tag["name"]
            confidence = tag["confidence"]
            print(f"{name} = {confidence:.3f}")

    else:
        print(f"Error: {response.status_code} - {response.text}")
        return None

In [13]:
def view_image(image_file):
    blob_client = container_client.get_blob_client(image_file)
    blob_image = container_client.get_blob_client(image_file).download_blob().readall()

    image_stream = io.BytesIO(blob_image)

    print("Image:", image_file)
    image = Image.open(image_stream)
    image.thumbnail((640, 640), Image.LANCZOS)
    display(image)

## 6. Azure Computer Vision 4 Florence embeddings functions

### Text Embedding

In [14]:
def text_embedding(prompt):
    version = "?api-version=" + api_version + "&modelVersion=latest"
    vec_txt_url = f"{acv_endpoint}/computervision/retrieval:vectorizeText{version}"
    headers = {"Content-type": "application/json", "Ocp-Apim-Subscription-Key": acv_key}

    payload = {"text": prompt}
    response = requests.post(vec_txt_url, json=payload, headers=headers)

    if response.status_code == 200:
        text_emb = response.json().get("vector")
        return text_emb

    else:
        print(f"Error: {response.status_code} - {response.text}")
        return None

### Quick test

In [15]:
query = "Siyah gömlek"

text_emb = text_embedding(query)
print("Size of the vector embeddings =", len(text_emb))

Size of the vector embeddings = 1024


### Image embedding

In [16]:
session = requests.Session()


def image_embedding(imagefile):
    version = "?api-version=" + api_version + "&modelVersion=latest"
    vec_img_url = acv_endpoint + "/computervision/retrieval:vectorizeImage" + version
    headers = {
        "Content-type": "application/octet-stream",
        "Ocp-Apim-Subscription-Key": acv_key,
    }

    try:
        blob_service_client = BlobServiceClient.from_connection_string(
            blob_connection_string
        )
        container_client = blob_service_client.get_container_client(container_name)

        blob_client = container_client.get_blob_client(imagefile)
        stream = BytesIO()
        blob_data = blob_client.download_blob()
        blob_data.readinto(stream)

        stream.seek(0)  

        response = session.post(vec_img_url, data=stream, headers=headers)
        response.raise_for_status()  

        image_emb = response.json()["vector"]
        return image_emb

    except requests.exceptions.RequestException as e:
        print(f"Request Exception: {e}")
    except Exception as ex:
        print(f"Error: {ex}")

    return None

### Quick test

In [17]:
image_file = "3600530941278.jpg"

image_emb1 = image_embedding(image_file)
print("Size of the vector embeddings =", len(image_emb1))

Size of the vector embeddings = 1024


In [18]:
def get_cosine_similarity(vector1, vector2):
    """
    Get cosine similarity value between two embedded vectors
    """
    dot_product = sum(x * y for x, y in zip(vector1, vector2))
    magnitude1 = math.sqrt(sum(x * x for x in vector1))
    magnitude2 = math.sqrt(sum(x * x for x in vector2))
    cosine_similarity = dot_product / (magnitude1 * magnitude2)

    return cosine_similarity

In [19]:
image_file = "3600531396855.jpg"
image_emb2 = image_embedding(image_file)

In [20]:
similarity_score = get_cosine_similarity(image_emb1, image_emb2)
print(f"Cosine similarity = {similarity_score}")

Cosine similarity = 0.5962623697035899


## 7. Generating the vectors embeddings to our catalog images

In [21]:
EMBEDDINGS_DIR = "embeddings"

os.makedirs(EMBEDDINGS_DIR, exist_ok=True)

In [22]:
list_of_images = container_client.list_blobs()

images_list = []

for image in list_of_images:
    imagefile = image["name"]
    images_list.append(imagefile)

In [23]:
print("Number of catalog images =", len(images_list))

Number of catalog images = 25880


In [26]:
data = [
    {"idfile": str(i + 1), "imagefile": image} for i, image in enumerate(images_list)
]

with open(os.path.join(EMBEDDINGS_DIR, "list_of_images.json"), "w") as f:
    json.dump(data, f)
    
!ls $EMBEDDINGS_DIR/list_of_images.json -lh

ls: -lh: No such file or directory
embeddings/list_of_images.json


### Running the vectors embeddings for all our catalog images

In [27]:
batch_size = 500

start = time.time()
print("Running the image files embeddings...")
print("Total number of images to embed =", len(images_list), "\n")

with open(
    os.path.join(EMBEDDINGS_DIR, "list_of_images.json"), "r", encoding="utf-8"
) as file:
    input_data = json.load(file)

image_count = len(input_data)
processed_count = 0

for batch_start in range(0, image_count, batch_size):
    batch_end = min(batch_start + batch_size, image_count)
    batch_data = input_data[batch_start:batch_end]

    for idx, item in enumerate(batch_data, start=batch_start + 1):
        imgindex = item["idfile"]
        imgfile = item["imagefile"]
        item["imagevector"] = image_embedding(imgfile)

        if idx % batch_size == 1:
            pctdone = round(idx / image_count * 100)
            dt = datetime.datetime.today().strftime("%d-%b-%Y %H:%M:%S")
            print(
                dt,
                f"Number of processed image files = {idx:06} of {image_count:06} | Done: {pctdone}%",
            )

    processed_count += len(batch_data)

elapsed = time.time() - start
print("\nDone")
print(
    "\nElapsed time: "
    + time.strftime(
        "%H:%M:%S.{}".format(str(elapsed % 1)[2:])[:15], time.gmtime(elapsed)
    )
)
print("Time per image =", round(elapsed / processed_count, 5), "seconds")

Running the image files embeddings...
Total number of images to embed = 25880 

05-Oct-2023 03:44:51 Number of processed image files = 000001 of 025880 | Done: 0%
05-Oct-2023 03:51:23 Number of processed image files = 000501 of 025880 | Done: 2%
05-Oct-2023 03:58:11 Number of processed image files = 001001 of 025880 | Done: 4%
05-Oct-2023 04:04:41 Number of processed image files = 001501 of 025880 | Done: 6%
05-Oct-2023 04:11:12 Number of processed image files = 002001 of 025880 | Done: 8%
05-Oct-2023 04:17:51 Number of processed image files = 002501 of 025880 | Done: 10%
05-Oct-2023 04:24:25 Number of processed image files = 003001 of 025880 | Done: 12%
05-Oct-2023 04:30:53 Number of processed image files = 003501 of 025880 | Done: 14%
05-Oct-2023 04:37:15 Number of processed image files = 004001 of 025880 | Done: 15%
05-Oct-2023 04:43:30 Number of processed image files = 004501 of 025880 | Done: 17%
05-Oct-2023 04:49:49 Number of processed image files = 005001 of 025880 | Done: 19%
0

In [28]:
start = time.time()

print("Saving the results into a json file...")
with open(os.path.join(EMBEDDINGS_DIR, "documents.json"), "w") as f:
    json.dump(input_data, f)

print("Done. Elapsed time:", round(time.time() - start, 2), "secs")

Saving the results into a json file...
Done. Elapsed time: 29.39 secs


In [29]:
!ls $EMBEDDINGS_DIR/documents.json -lh

ls: -lh: No such file or directory
embeddings/documents.json


## 8. Creating the Azure Cognitive Search index and uploading the embeddings

In [30]:
try:
    print("Setting the Azure Cognitive Search client")
    search_client = SearchIndexClient(
        endpoint=acs_endpoint, credential=AzureKeyCredential(acs_key)
    )
    print("Done")
    print(search_client)

except:
    print("Request failed. Cannot create Azure Cognitive Search client:",
          acs_endpoint)

Setting the Azure Cognitive Search client
Done
<azure.search.documents.indexes._search_index_client.SearchIndexClient object at 0x11b1ad850>


In [31]:
def delete_index(index_name):
    """
    Deleting an Azure Cognitive Search index
    """
    start = time.time()
    search_client = SearchIndexClient(
        endpoint=acs_endpoint, credential=AzureKeyCredential(acs_key)
    )
    print("Deleting the Azure Cognitive Search index:", index_name)
    search_client.delete_index(index_name)
    print("Done. Elapsed time:", round(time.time() - start, 2), "secs")

In [32]:
delete_index(index_name)

Deleting the Azure Cognitive Search index: fashion-demo
Done. Elapsed time: 0.66 secs


In [33]:
start = time.time()

# Fields definition
fields = [
    # Image ID
    SimpleField(name="idfile",
                type=SearchFieldDataType.String,
                key=True),
    # Image filename
    SearchableField(
        name="imagefile",
        type=SearchFieldDataType.String,
        searchable=True,
        retrievable=True,
    ),
    # Vector image embeddings
    SearchField(
        name="imagevector",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        dimensions=1024,  # Dimension of the Azure CV Florence vector embeddings
        vector_search_configuration="myconfig",
    ),
]


# Vector Search definition
vector_search = VectorSearch(
    algorithm_configurations=[
        VectorSearchAlgorithmConfiguration(
            name="myconfig",
            kind="hnsw",  # hnsw = Hierarchical Navigable Small Worlds
            hnsw_parameters={
                "m": 4,
                "efConstruction": 400,
                "efSearch": 1000,
                "metric": "cosine",  # Cosine similarity metric
            },
        )
    ]
)

# Semantic config
semantic_config = SemanticConfiguration(
    name="my-semantic-config",
    prioritized_fields=PrioritizedFields(
        title_field=SemanticField(field_name="idfile"),
    ),
)

# Create the semantic settings with the configuration
semantic_settings = SemanticSettings(configurations=[semantic_config])

# Create the search index with the semantic settings
index = SearchIndex(
    name=index_name,
    fields=fields,
    vector_search=vector_search,
    semantic_settings=semantic_settings,
)

# Let's process
response = search_client.create_or_update_index(index)

print(f"Done. Search index {response.name} has been created.")
print("Elapsed time:", round(time.time() - start, 2), "secs")

Done. Search index fashion-demo has been created.
Elapsed time: 1.44 secs


In [34]:
with open(os.path.join(EMBEDDINGS_DIR, "documents.json"), "r") as file:
    documents = json.load(file)

print("Size of the documents to load =", len(documents))

Size of the documents to load = 25880


In [35]:
def loading_documents(documents):
    """
    Loading documents into the Azure Cognitive Search index
    """
    # Upload some documents to the index
    print("Uploading the documents into the index", index_name, "...")

    # Setting the Azure Cognitive Search client
    search_client = SearchClient(
        endpoint=acs_endpoint,
        index_name=index_name,
        credential=AzureKeyCredential(acs_key),
    )
    response = search_client.upload_documents(documents)
    print(
        f"\nDone. Uploaded {len(documents)} documents into the Azure Cognitive Search index.\n"
    )
    return len(documents)

In [36]:
documents = [doc for doc in documents if doc.get('imagevector') is not None]

start = time.time()

idx = 0
total_nb = 0
step = 1
chunk_size = 1000

while idx < len(documents):
    begin = idx
    end = begin + chunk_size
    print(step, "Loading the embeddings from", begin, "to", end)
    # Loading the chunck
    chunk_docs = documents[begin:end]
    total_nb += loading_documents(chunk_docs)
    idx += chunk_size
    step += 1

print("Total number of loaded documents =", total_nb)
print("Elapsed time:", round(time.time() - start, 2), "secs")


1 Loading the embeddings from 0 to 1000
Uploading the documents into the index fashion-demo ...

Done. Uploaded 1000 documents into the Azure Cognitive Search index.

2 Loading the embeddings from 1000 to 2000
Uploading the documents into the index fashion-demo ...

Done. Uploaded 1000 documents into the Azure Cognitive Search index.

3 Loading the embeddings from 2000 to 3000
Uploading the documents into the index fashion-demo ...

Done. Uploaded 1000 documents into the Azure Cognitive Search index.

4 Loading the embeddings from 3000 to 4000
Uploading the documents into the index fashion-demo ...

Done. Uploaded 1000 documents into the Azure Cognitive Search index.

5 Loading the embeddings from 4000 to 5000
Uploading the documents into the index fashion-demo ...

Done. Uploaded 1000 documents into the Azure Cognitive Search index.

6 Loading the embeddings from 5000 to 6000
Uploading the documents into the index fashion-demo ...

Done. Uploaded 1000 documents into the Azure Cognitiv

### Azure Cognitive Search index status

In [37]:
def index_stats(index_name):
    """
    Get statistics about Azure Cognitive Search index
    """  
    url = acs_endpoint + "/indexes/" + index_name + "/stats?api-version=2021-04-30-Preview"
    headers = {
        "Content-Type": "application/json",
        "api-key": acs_key,
    }
    response = requests.get(url, headers=headers)
    print("Azure Cognitive Search index status for:", index_name, "\n")

    if response.status_code == 200:
        res = response.json()
        print(json.dumps(res, indent=2))
        document_count = res['documentCount']
        storage_size = res['storageSize']

    else:
        print("Request failed with status code:", response.status_code)
    
    return document_count, storage_size

In [38]:
document_count, storage_size = index_stats(index_name)

Azure Cognitive Search index status for: fashion-demo 

{
  "@odata.context": "https://mono-cognitive-search.search.windows.net/$metadata#Microsoft.Azure.Search.V2021_04_30_Preview.IndexStatistics",
  "documentCount": 14000,
  "storageSize": 213601740
}


> We have loaded the **10226 vectors embeddings into the Azure Cognitive Search index**

In [40]:
def index_status(index_name):
    """
    Azure Cognitive Search index status
    """
    print("Azure Cognitive Search Index:", index_name, "\n")

    headers = {"Content-Type": "application/json",
               "api-key": acs_key}
    params = {"api-version": "2021-04-30-Preview"}
    index_status = requests.get(
        acs_endpoint + "/indexes/" + index_name, headers=headers, params=params
    )
    try:
        print(json.dumps((index_status.json()), indent=5))
    except:
        print("Request failed")

In [41]:
index_status(index_name)

Azure Cognitive Search Index: fashion-demo 

{
     "@odata.context": "https://mono-cognitive-search.search.windows.net/$metadata#indexes/$entity",
     "@odata.etag": "\"0x8DBC56AE6B780AA\"",
     "name": "fashion-demo",
     "defaultScoringProfile": null,
     "fields": [
          {
               "name": "idfile",
               "type": "Edm.String",
               "searchable": false,
               "filterable": false,
               "retrievable": true,
               "sortable": false,
               "facetable": false,
               "key": true,
               "indexAnalyzer": null,
               "searchAnalyzer": null,
               "analyzer": null,
               "normalizer": null,
               "synonymMaps": []
          },
          {
               "name": "imagefile",
               "type": "Edm.String",
               "searchable": true,
               "filterable": false,
               "retrievable": true,
               "sortable": false,
               "facet

### Quick test on a basic text search

In [42]:
search_client = SearchClient(acs_endpoint,
                             index_name,
                             AzureKeyCredential(acs_key))

In [43]:
text = "8683522157586"
print("Let's query the index with text =", text, "\n")

response = search_client.search(search_text=text)

for result in response:
    print("Id file:", result["idfile"])
    print("\nFilename:", result["imagefile"])
    print("\nVector embedding size:", len(result["imagevector"]))
    print("\nVector embedding:", result["imagevector"])

Let's query the index with text = 8683522157586 

Id file: 7066

Filename: 8683522157586.jpg

Vector embedding size: 1024

Vector embedding: [-1.08984375, 0.2915039, 2.57421875, -1.12011719, 1.73339844, 0.818847656, 0.5517578, -2.2578125, -3.91992188, 2.51171875, -2.3046875, -1.27050781, 0.5698242, -0.3947754, -0.881347656, -0.292480469, 1.88183594, 0.08831787, -3.2421875, -1.58007813, -2.79296875, 1.72070313, 1.54785156, -3.421875, -1.2890625, -1.89355469, 1.38085938, 0.5136719, -0.5209961, 2.06835938, 0.0276489258, -2.45507813, 1.4453125, -3.2421875, 4.2265625, -1.50878906, -0.263427734, -1.33007813, 0.2019043, -1.46191406, 1.69140625, 0.0871582, 1.37695313, 3.60351563, 6.19140625, -0.227539063, -0.698730469, -4.28515625, -2.37890625, 0.8208008, 0.344482422, 0.392578125, 2.05078125, 0.6977539, -3.77929688, -0.7705078, 1.93164063, 2.51953125, -3.10742188, 1.1953125, 1.04492188, -4.85546875, -1.5390625, -1.08007813, -1.28125, -1.39550781, 4.3671875, -0.282226563, 0.4765625, 3.1875, -3.