In [24]:
import os

# from azure.cognitiveservices.vision.computervision import ComputerVisionClient
# from msrest.authentication import CognitiveServicesCredentials
import requests
import base64
from openai import AzureOpenAI
from azure.search.documents.models import (
    QueryAnswerType,
    QueryCaptionType,
    QueryCaptionResult,
    QueryAnswerResult,
    SemanticErrorMode,
    SemanticErrorReason,
    SemanticSearchResultsType,
    QueryType,
    VectorizedQuery,
    VectorQuery,
    VectorFilterMode,
)
from dotenv import load_dotenv
from tenacity import retry, wait_random_exponential, stop_after_attempt
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    ExhaustiveKnnAlgorithmConfiguration,
    ExhaustiveKnnParameters,
    SearchIndex,
    SearchField,
    SearchFieldDataType,
    SimpleField,
    SearchableField,
    SearchIndex,
    SemanticConfiguration,
    SemanticPrioritizedFields,
    SemanticField,
    SemanticSearch,
    VectorSearch,
    HnswAlgorithmConfiguration,
    HnswParameters,
    VectorSearchAlgorithmConfiguration,
    VectorSearchAlgorithmKind,
    VectorSearchProfile,
    VectorSearchAlgorithmMetric,
)


In [38]:
load_dotenv(override=True)

# Azure Storage account details
# storage_account_name = os.environ["storage_account_name"]
# storage_account_key = os.environ["storage_account_key"]
# storage_container_name = os.environ["storage_container_name"]
##########################################################################################################
# Azure AI VISION Embeddings
# ai_vision_subscription_key = os.environ["ai_vision_subscription_key"]
# ai_vision_endpoint = os.environ["ai_vision_endpoint"]
# ###########################################################################################################
# api_version = os.environ["api_version"]
# acv_endpoint = ai_vision_endpoint
# acv_key = ai_vision_subscription_key
#############################################################################################################
# Configure environment variables
AZURE_SEARCH_SERVICE_ENDPOINT = os.environ["AZURE_SEARCH_SERVICE_ENDPOINT"]
AZURE_SEARCH_INDEX_NAME = os.environ["AZURE_SEARCH_INDEX_NAME"]
AZURE_SEARCH_SERVICE_KEY = os.environ["AZURE_SEARCH_SERVICE_KEY"]
azure_search_credential = AzureKeyCredential(AZURE_SEARCH_SERVICE_KEY)
############################################################################################################
# Azure OpenAI configuration
AZURE_OPENAI_EMBEDDING_MODEL_DEPLOYMENT_NAME = os.environ[
    "AZURE_OPENAI_EMBEDDING_MODEL_DEPLOYMENT_NAME"
]
AZURE_OPENAI_GPT_MODEL_DEPLOYMENT_NAME = os.environ[
    "AZURE_OPENAI_GPT_MODEL_DEPLOYMENT_NAME"
]
AZURE_OPENAI_ENDPOINT = os.environ["AZURE_OPENAI_ENDPOINT"]
AZURE_OPENAI_API_VERSION = os.environ["AZURE_OPENAI_API_VERSION"]
AZURE_OPENAI_API_KEY = os.environ["AZURE_OPENAI_API_KEY"]

client = AzureOpenAI(
    api_key=AZURE_OPENAI_API_KEY,
    api_version=AZURE_OPENAI_API_VERSION,
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
)


# ###########################################################################################
# Create an SDK client
admin_client = SearchIndexClient(
    endpoint=AZURE_SEARCH_SERVICE_ENDPOINT,
    index_name=AZURE_SEARCH_INDEX_NAME,
    credential=azure_search_credential,
)

search_client = SearchClient(
    endpoint=AZURE_SEARCH_SERVICE_ENDPOINT,
    index_name=AZURE_SEARCH_INDEX_NAME,
    credential=azure_search_credential,
)


In [26]:
# openai text-embedding-3-small
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def generate_embeddings_openai(text, client):
    embeddings = (
        client.embeddings.create(
            input=[text], model=AZURE_OPENAI_EMBEDDING_MODEL_DEPLOYMENT_NAME
        )
        .data[0]
        .embedding
    )
    return embeddings

In [35]:
def create_search_index_in_azure_ai_search():
    fields = [
        SimpleField(name="Id", type=SearchFieldDataType.String, key=True),
        SearchableField(
            name="filename",
            type=SearchFieldDataType.String,
            searchable=True,
            filterable=True,
            retrievable=True,
        ),
        SimpleField(name="element_id", type=SearchFieldDataType.String),
        SearchableField(
            name="chunk",
            type=SearchFieldDataType.String,
            searchable=True,
            filterable=True,
            retrievable=True,
        ),
        SimpleField(name="page_number", type=SearchFieldDataType.String),
        SimpleField(
            name="images_info", type=SearchFieldDataType.String
        ),  # [{"image_associated_text","image_base64", "image_discription", "images_link"}]
        SearchField(
            name="chunk_vector",
            type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
            searchable=True,
            vector_search_dimensions=1536,
            vector_search_profile_name="myHnswProfile",
        ),
    ]

    # Configure the vector search configuration
    vector_search = VectorSearch(
        algorithms=[
            HnswAlgorithmConfiguration(
                name="myHnsw",
                kind=VectorSearchAlgorithmKind.HNSW,
                parameters=HnswParameters(
                    m=4,
                    ef_construction=400,
                    ef_search=500,
                    metric=VectorSearchAlgorithmMetric.COSINE,
                ),
            ),
            ExhaustiveKnnAlgorithmConfiguration(
                name="myExhaustiveKnn",
                kind=VectorSearchAlgorithmKind.EXHAUSTIVE_KNN,
                parameters=ExhaustiveKnnParameters(
                    metric=VectorSearchAlgorithmMetric.COSINE
                ),
            ),
        ],
        profiles=[
            VectorSearchProfile(
                name="myHnswProfile",
                algorithm_configuration_name="myHnsw",
            ),
            VectorSearchProfile(
                name="myExhaustiveKnnProfile",
                algorithm_configuration_name="myExhaustiveKnn",
            ),
        ],
    )

    semantic_config = SemanticConfiguration(
        name="my-semantic-config",
        prioritized_fields=SemanticPrioritizedFields(
            # title_field=SemanticField(field_name="Image_Name"),
            title_field=SemanticField(field_name="filename"),
            # keywords_fields=[SemanticField(field_name="Image_URL")],
            # content_fields=[SemanticField(field_name="text")]
            content_fields=[SemanticField(field_name="chunk")],
        ),
    )

    # Create the semantic settings with the configuration
    semantic_search = SemanticSearch(configurations=[semantic_config])

    # Create the search index with the semantic settings
    index = SearchIndex(
        name=AZURE_SEARCH_INDEX_NAME,
        fields=fields,
        vector_search=vector_search,
        semantic_search=semantic_search,
    )
    result = admin_client.create_or_update_index(index)
    print(f" {result.name} created")


###########################################################################################################

In [36]:
create_search_index_in_azure_ai_search()

 multimodal-rag created


## Extract the data

Extract the elements of the PDF that we will be able to use in the retrieval process. These elements can be: Text, Images, Tables, etc.

### Partition PDF tables, text, and images

In [3]:
from unstructured.partition.pdf import partition_pdf
# from unstructured.partition.auto import partition

output_path = "images"
file_path = "Attention.pdf"
# The Foundation of Foundation Models.pdf

# Reference: https://docs.unstructured.io/open-source/core-functionality/chunking
chunks = partition_pdf(
    filename=file_path,
    infer_table_structure=True,  # extract tables
    strategy="hi_res",  # mandatory to infer tables
    extract_image_block_types=[
        "Image"
    ],  # Add 'Table' to list to extract image of tables
    # image_output_dir_path=output_path,  # if None, images and tables will saved in base64
    extract_image_block_to_payload=True,  # if true, will extract base64 for API usage
    chunking_strategy="by_title",  # or 'basic', by_page - api
    max_characters=10000,  # defaults to 500
    combine_text_under_n_chars=2000,  # defaults to 0
    new_after_n_chars=6000,
    # extract_images_in_pdf=True,          # deprecated
)


unstructured_inference is not installed. Cannot use the hi_res partitioning strategy. Falling back to partitioning with another strategy.
Falling back to partitioning with ocr_only.


TesseractError: (1, 'Error opening data file C:\\Users\\sanket\\AppData\\Local\\Programs\\Tesseract-OCR/tessdata/\'eng\'.traineddata Please make sure the TESSDATA_PREFIX environment variable is set to your "tessdata" directory. Failed loading language \'\'eng\'\' Tesseract couldn\'t load any languages! Could not initialize tesseract.')

In [None]:
from unstructured.partition.auto import partition


filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
elements = partition(filename=filename, content_type="application/pdf")
print("\n\n".join([str(el) for el in elements][:10]))

In [2]:
len(chunks)

12

In [8]:
chunks[7].to_dict()

{'type': 'CompositeElement',
 'element_id': 'a24e443c8395ba0e939a6d54162afbba',
 'text': '5 Training\n\nThis section describes the training regime for our models.\n\n5.1 Training Data and Batching\n\nWe trained on the standard WMT 2014 English-German dataset consisting of about 4.5 million sentence pairs. Sentences were encoded using byte-pair encoding [3], which has a shared source- target vocabulary of about 37000 tokens. For English-French, we used the significantly larger WMT 2014 English-French dataset consisting of 36M sentences and split tokens into a 32000 word-piece vocabulary [38]. Sentence pairs were batched together by approximate sequence length. Each training batch contained a set of sentence pairs containing approximately 25000 source tokens and 25000 target tokens.\n\n5.2 Hardware and Schedule\n\nWe trained our models on one machine with 8 NVIDIA P100 GPUs. For our base models using the hyperparameters described throughout the paper, each training step took about 0.4 se

In [6]:
chunks[0].to_dict()["metadata"]["filename"]

'Attention.pdf'

In [3]:
# We get 2 types of elements from the partition_pdf function
set([str(type(el)) for el in chunks])

{"<class 'unstructured.documents.elements.CompositeElement'>"}

In [15]:
# Each CompositeElement containes a bunch of related elements.
# This makes it easy to use these elements together in a RAG pipeline.
chunks[3].metadata.orig_elements  # [-3].to_dict()

[<unstructured.documents.elements.Title at 0x1ff880617c0>,
 <unstructured.documents.elements.NarrativeText at 0x1ff88061c10>,
 <unstructured.documents.elements.Footer at 0x1ff88061c40>,
 <unstructured.documents.elements.Image at 0x1ff88061d90>,
 <unstructured.documents.elements.Image at 0x1ff88061e50>,
 <unstructured.documents.elements.NarrativeText at 0x1ff88061f10>,
 <unstructured.documents.elements.NarrativeText at 0x1ff88061b50>,
 <unstructured.documents.elements.Title at 0x1ff88061e20>,
 <unstructured.documents.elements.NarrativeText at 0x1ff88062210>,
 <unstructured.documents.elements.NarrativeText at 0x1ff88062240>,
 <unstructured.documents.elements.Formula at 0x1ff88062030>,
 <unstructured.documents.elements.NarrativeText at 0x1ff880623c0>,
 <unstructured.documents.elements.NarrativeText at 0x1ff880622a0>]

In [13]:
if "Table" in (str(chunks[7].metadata.orig_elements)):
    print("YES")

YES


In [43]:
# This is what an extracted image looks like.
# It contains the base64 representation only because we set the param extract_image_block_to_payload=True

elements = chunks[11].metadata.orig_elements
chunk_images = [el.to_dict() for el in elements if "Image" in str(type(el))]
print(len(chunk_images))
chunk_images[1]

4


{'type': 'Image',
 'element_id': '8cd47c72-2011-493f-98bd-ab7c79d75a71',
 'text': '<ped> <ped> UOIUIdO == Aw ul Bulssiw ale « aM = yeum = S| sy ysnf pinoys = uoluldo Aw ul Bulssiw ae ysnf 38q Pinoys uojeojdde Ss}! nq poped 38q JaAou Me] au <ped> <SOa> uojuido Aw ul Bulssiuw oe aM yeum S| SIU} ysnf 3q Pinoys uojeodde Ss}! ynq yoped 3q 4eAeuU meq auL <ped> <SOa> uo|uldo Aw ul Bulssiuw oe eM yeum S| Siu} ysnf 3q Pinoys uoyeoydde si! ynq yoped 3q aul',
 'metadata': {'detection_class_prob': 0.8002364039421082,
  'coordinates': {'points': ((339.7367858886719, 416.0281677246094),
    (339.7367858886719, 1635.7763671875),
    (1386.38525390625, 1635.7763671875),
    (1386.38525390625, 416.0281677246094)),
   'system': 'PixelSpace',
   'layout_width': 1700,
   'layout_height': 2200},
  'last_modified': '2024-12-12T16:33:06',
  'filetype': 'PPM',
  'languages': ['eng'],
  'page_number': 14,
  'image_base64': '/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBwcJCQgKDBQNDAsLDBkSEw8UHRofHh0aHBwgJC4nICI

In [None]:
[
    {
        "image_associated_text": i["text"],
        "image_base64": i["metadata"]["image_base64"],
        "images_link": "",
    }
    for i in chunk_images
]

[{'image_associated_text': 'Scaled Dot-Product Attention  Multi-Head Attention ',
  'image_base64': '/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBwcJCQgKDBQNDAsLDBkSEw8UHRofHh0aHBwgJC4nICIsIxwcKDcpLDAxNDQ0Hyc5PTgyPC4zNDL/2wBDAQkJCQwLDBgNDRgyIRwhMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjL/wAARCAHYA4UDASIAAhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/8QAHwEAAwEBAQEBAQEBAQAAAAAAAAECAwQFBgcICQoL/8QAtREAAgECBAQDBAcFBAQAAQJ3AAECAxEEBSExBhJBUQdhcRMiMoEIFEKRobHBCSMzUvAVYnLRChYkNOEl8RcYGRomJygpKjU2Nzg5OkNERUZHSElKU1RVVldYWVpjZGVmZ2hpanN0dXZ3eHl6goOEhYaHiImKkpOUlZaXmJmaoqOkpaanqKmqsrO0tba3uLm6wsPExcbHyMnK0tPU1dbX2Nna4uPk5ebn6Onq8vP09fb3+Pn6/9oADAMBAAIRAxEAPwD0KHxFq3ifxxrmhaVqMGmW2iiJZHMAlluJHBJwGOFRcY6ZOeorS8O3/iQeKdV0nxA1nJF

In [None]:
import base64
from IPython.display import Image, display


def display_base64_image(base64_code):
    # Decode the base64 string to binary
    image_data = base64.b64decode(base64_code)
    # Display the image
    display(Image(data=image_data))


display_base64_image(chunk_images[0].to_dict()["metadata"]["image_base64"])

In [40]:
for id, chunk in enumerate(chunks):
    chunk_dict = chunk.to_dict()
    elements = chunk.metadata.orig_elements
    chunk_images = [el.to_dict() for el in elements if "Image" in str(type(el))]
    item = {}
    item["Id"] = str(id)
    item["filename"] = chunk_dict["metadata"]["filename"]
    item["element_id"] = chunk_dict["element_id"]
    if "Table" in str(elements):
        item["chunk"] = (
            chunk_dict["text"]
            + "Table in Html format :"
            + chunk_dict["metadata"]["text_as_html"]
        )
    else:
        item["chunk"] = chunk_dict["text"]
    item["page_number"] = str(chunk_dict["metadata"]["page_number"])
    if chunk_images:
        item["images_info"] = str(
            [
                {
                    "image_associated_text": i["text"],
                    "image_base64": i["metadata"]["image_base64"],
                    "images_link": "",
                }
                for i in chunk_images
            ]
        )
    item["chunk_vector"] = generate_embeddings_openai(item["chunk"], client)

search_client.upload_documents([item])

[<azure.search.documents._generated.models._models_py3.IndexingResult at 0x1ff8e78acc0>]