In [33]:
# Import required libraries  
import os  
import json  
from openai import AzureOpenAI, DefaultHttpxClient
from dotenv import load_dotenv, find_dotenv
from azure.core.credentials import AzureKeyCredential  
from azure.search.documents import SearchClient  
from azure.search.documents.indexes import SearchIndexClient  
from azure.search.documents.indexes.models import (  
    SearchIndex,  
    SearchField,  
    SearchFieldDataType,  
    SimpleField,  
    SearchableField,  
    SearchIndex,  
    SemanticConfiguration,  
    PrioritizedFields,  
    SemanticField,  
    SearchField,  
    SemanticSettings,  
    VectorSearch,
    HnswVectorSearchAlgorithmConfiguration,
)
from pathlib import Path

In [38]:
# Configure environment variables
# Values are read from 'credential.env' (located via find_dotenv) and, because of
# override=True, replace any same-named variables already set in the process.
load_dotenv(find_dotenv('credential.env'), override=True)

# Azure AI Search: endpoint, admin/query key and target index name
service_endpoint = os.environ['AZURE_AI_SEARCH_ENDPOINT']
key = os.environ['AZURE_AI_SEARCH_KEY']
index_name = os.environ['AZURE_AI_SEARCH_INDEX_NAME']
credential = AzureKeyCredential(key)

# Azure OpenAI
# TLS certificate verification is ON by default. Turning it off lets anyone on the
# network path impersonate the endpoint and capture the API key and document text,
# so it must be an explicit, visible opt-in (e.g. behind a TLS-intercepting proxy):
# set AZURE_OPENAI_DISABLE_TLS_VERIFY=true in credential.env.
disable_tls_verify = (
    os.environ.get('AZURE_OPENAI_DISABLE_TLS_VERIFY', '').strip().lower()
    in ('1', 'true', 'yes')
)
if disable_tls_verify:
    print("Warning: TLS certificate verification is DISABLED for Azure OpenAI requests.")

client = AzureOpenAI(
  api_key = os.environ['AZURE_OPENAI_API_KEY'],  # this is also the default, it can be omitted
  azure_endpoint = os.environ['AZURE_OPENAI_API_ENDPOINT'],
  api_version = os.environ['AZURE_OPENAI_API_VERSION'],
  http_client = DefaultHttpxClient(verify=not disable_tls_verify)
)
# Deployment/model name passed as `model=` when requesting embeddings
embedding_model = os.environ['EMBEDDING_MODEL_NAME']

In [35]:
# Declare useful method
def check_and_create_folder(folder_name):
    """Make sure `folder_name` exists and print which case applied.

    If the path is already present nothing is created; otherwise the folder
    (including any missing parent folders) is created with os.makedirs.
    """
    if os.path.exists(folder_name):
        print(f"The folder '{folder_name}' already exists.")
        return

    os.makedirs(folder_name)
    print(f"The folder '{folder_name}' has been created.")

def print_error_message(message, prefix_message='Error: '):
    """Print `message` after `prefix_message`, with the prefix in bold red (ANSI codes)."""
    bold_red = "\033[1;31m"
    reset = "\033[0m"
    print(f"{bold_red}{prefix_message}{reset}{message}")

def print_warning_message(message, prefix_message='Warning: '):
    """Print `message` after `prefix_message`, with the prefix in bold yellow (ANSI codes)."""
    bold_yellow = "\033[1;33m"
    reset = "\033[0m"
    print(f"{bold_yellow}{prefix_message}{reset}{message}")
    
def print_success_message(message, prefix_message='Success: '):
    """Print `message` after `prefix_message`, with the prefix in bold green (ANSI codes)."""
    bold_green = "\033[1;32m"
    reset = "\033[0m"
    print(f"{bold_green}{prefix_message}{reset}{message}")

In [36]:
def generate_embeddings(text):
    """Request an embedding for `text` and return the first result's vector.

    Uses the module-level Azure OpenAI `client` with the model named by
    `embedding_model`; only element 0 of the response data is returned.
    """
    embedding_response = client.embeddings.create(model=embedding_model, input=text)
    first_item = embedding_response.data[0]
    return first_item.embedding

In [37]:
print_warning_message("Generate embeddings of chunked document", ">>>[STEP4] ")

# Make sure the destination folder for the vectorised chunks is present
check_and_create_folder("data/chunked_document_vector")

# For each chunk file: embed its "content" text with the Azure OpenAI embedding
# model, store the vector under "contentVector", and save the enriched chunk
# under the same file name in the vector folder.
for file in Path("data/chunked_document").glob("*.json"):
    with open(file, encoding='utf-8') as source:
        input_data = json.load(source)

    input_data['contentVector'] = generate_embeddings(input_data['content'])

    with open(Path("data/chunked_document_vector", file.name), "w", encoding='utf-8') as sink:
        json.dump(input_data, sink, ensure_ascii=False, indent=4)

    print_success_message(f'Embedding chunked document {file.name}')

[1;33m>>>[STEP4] [0mGenerate embeddings of chunked document
The folder 'data/chunked_document_vector' already exists.
[1;32mSuccess: [0mEmbedding chunked document mobil-super-moto-20w40_pdf-0.json
[1;32mSuccess: [0mEmbedding chunked document choosing-the-right-oil-for-your-car_pdf-0.json
[1;32mSuccess: [0mEmbedding chunked document mobil-1_pdf-0.json
[1;32mSuccess: [0mEmbedding chunked document suzuki_pdf-1.json
[1;32mSuccess: [0mEmbedding chunked document mobil-1-turbo-diesel-pickup-5w40_pdf-0.json
[1;32mSuccess: [0mEmbedding chunked document mobil-super_pdf-1.json
[1;32mSuccess: [0mEmbedding chunked document toyota_pdf-1.json
[1;32mSuccess: [0mEmbedding chunked document mobil-super-moto-scooter-gear-oil_pdf-1.json
[1;32mSuccess: [0mEmbedding chunked document mazda_pdf-1.json
[1;32mSuccess: [0mEmbedding chunked document mobil-super-moto-scooter-10w40_pdf-1.json
[1;32mSuccess: [0mEmbedding chunked document product-oem-bmw_pdf-1.json
[1;32mSuccess: [0mEmbedding

In [41]:
print_warning_message("Create Azure AI Search index", ">>>[STEP5] ")

# Build the index definition (fields + vector search + semantic settings) and
# submit it to the Azure AI Search service.
# NOTE(review): names such as HnswVectorSearchAlgorithmConfiguration, PrioritizedFields,
# SemanticSettings and vector_search_configuration look specific to a particular
# (pre-release) azure-search-documents version — confirm the pinned SDK version
# before upgrading.
index_client = SearchIndexClient(endpoint=service_endpoint, credential=credential)
# Index schema: "id" is the document key; "content", "category", "sourcepage" and
# "sourcefile" are text-searchable; "contentVector" holds the embedding.
fields = [
    SimpleField(name="id", type=SearchFieldDataType.String, key=True, sortable=True, filterable=True, facetable=True),
    SearchableField(name="content", type=SearchFieldDataType.String),
    SearchableField(name="category", type=SearchFieldDataType.String, filterable=True),
    SearchableField(name="sourcepage", type=SearchFieldDataType.String),
    SearchableField(name="sourcefile", type=SearchFieldDataType.String),
    # 1536 presumably matches the output size of the model in EMBEDDING_MODEL_NAME — TODO confirm.
    # "my-vector-config" must equal the algorithm configuration name declared below.
    SearchField(name="contentVector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                searchable=True, vector_search_dimensions=1536, vector_search_configuration="my-vector-config"),
]

# Vector search setup: a single HNSW algorithm configuration, referenced by name
# from the "contentVector" field above.
vector_search = VectorSearch(
    algorithm_configurations=[
        HnswVectorSearchAlgorithmConfiguration(
            name="my-vector-config",
            kind="hnsw",
        )
    ]
)

# Semantic configuration: "content" is the only prioritized content field.
semantic_config = SemanticConfiguration(
    name="my-semantic-config",
    prioritized_fields=PrioritizedFields(
        prioritized_content_fields=[SemanticField(field_name="content")]
    )
)

# Create the semantic settings with the configuration
semantic_settings = SemanticSettings(configurations=[semantic_config])

# Create the search index with the semantic settings
index = SearchIndex(name=index_name, fields=fields, vector_search=vector_search,
                    semantic_settings=semantic_settings)
# create_or_update_index is used, so the "is created" message below is printed
# whether the call created a new index or (as the method name indicates) updated one.
result = index_client.create_or_update_index(index)
print_success_message(f'Index name: "{result.name}" is created')

[1;33m>>>[STEP5] [0mCreate Azure AI Search index
[1;32mSuccess: [0mIndex name: "bootcathon-thanawat-index" is created


In [43]:
print_warning_message("Upload embedded chunk documents to Azure AI Index", ">>>[STEP6] ")

search_client = SearchClient(endpoint=service_endpoint, index_name=index_name, credential=credential)

# Upload every embedded chunk file and report the outcome per document.
for file in Path().glob("data/chunked_document_vector/*.json"):
    # Read as UTF-8 explicitly: these files are written as UTF-8 with
    # ensure_ascii=False, so relying on the platform default encoding can fail
    # or corrupt non-ASCII text (e.g. on Windows).
    input_data = json.loads(file.read_text(encoding='utf-8'))

    # upload_documents expects a list of documents; a chunk file holds one object.
    documents = input_data if isinstance(input_data, list) else [input_data]
    result = search_client.upload_documents(documents=documents)

    # The call can return normally while individual documents were rejected,
    # so inspect each indexing result instead of assuming success.
    failed_items = [item for item in result if not item.succeeded]
    if failed_items:
        for item in failed_items:
            print_error_message(
                f"Failed to upload embedded chunk: {file.name} "
                f"(key={item.key}, status={item.status_code}): {item.error_message}"
            )
    else:
        print_success_message(f"Uploaded embedded chunk: {file.name} to {index_name} index.")

[1;33m>>>[STEP6] [0mUpload embedded chunk documents to Azure AI Index
[1;32mSuccess: [0mUploaded embedded chunk: mobil-super-moto-20w40_pdf-0.json to bootcathon-thanawat-index index.
[1;32mSuccess: [0mUploaded embedded chunk: choosing-the-right-oil-for-your-car_pdf-0.json to bootcathon-thanawat-index index.
[1;32mSuccess: [0mUploaded embedded chunk: mobil-1_pdf-0.json to bootcathon-thanawat-index index.
[1;32mSuccess: [0mUploaded embedded chunk: suzuki_pdf-1.json to bootcathon-thanawat-index index.
[1;32mSuccess: [0mUploaded embedded chunk: mobil-1-turbo-diesel-pickup-5w40_pdf-0.json to bootcathon-thanawat-index index.
[1;32mSuccess: [0mUploaded embedded chunk: mobil-super_pdf-1.json to bootcathon-thanawat-index index.
[1;32mSuccess: [0mUploaded embedded chunk: toyota_pdf-1.json to bootcathon-thanawat-index index.
[1;32mSuccess: [0mUploaded embedded chunk: mobil-super-moto-scooter-gear-oil_pdf-1.json to bootcathon-thanawat-index index.
[1;32mSuccess: [0mUploaded emb