## Creating an index in Azure AI Search with embeddings

In [28]:
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    SimpleField,
    SearchableField,
    VectorSearch,
    SearchFieldDataType,
    ExhaustiveKnnParameters,
    HnswAlgorithmConfiguration,
    ExhaustiveKnnAlgorithmConfiguration,
    VectorSearchProfile,
    HnswParameters,
    VectorSearchAlgorithmMetric,
    SemanticConfiguration,
    SemanticField,
    SemanticPrioritizedFields,
    SemanticSearch,
    SearchIndex,
    ComplexField,
    SearchField,
    SearchableField
)
from dotenv import load_dotenv
import os
from openai import AzureOpenAI
import numpy as np
load_dotenv()


True

In [29]:
client = AzureOpenAI(
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
)

In [30]:
def generate_embeddings(text, client):
    return client.embeddings.create(input=[text], model=os.getenv("AZURE_ADA3_DEPLOYMENT_NAME")).data[0].embedding


In [31]:
documents = [
    {
        "id": "1",
        "content": 'This is the first dummy chunk of text.',
        "title": "chunk_1.txt",
        "security_groups": ['Owners', 'Members']
    },
    {
        "id": "2",
        "content": 'This is the second dummy chunk of text.',
        "title": "EmperorPenguins.txt",
        "security_groups": ['Owners', 'Members']
    },
    {
        "id": "3",
        "content": 'This is the third dummy chunk of text.',
        "title": "RedPanda.txt",
        "security_groups": ['Owners']
    }
]

In [32]:
for doc in documents:
    content = doc['content']
    content_embeddings = np.array(generate_embeddings(content,client))
    doc['content_embeddings'] = content_embeddings.tolist()

In [33]:
## Fields definition
fields = [
    SimpleField(
        name="id", type=SearchFieldDataType.String, key=True, sortable=True, filterable=False, facetable=False
    ),
    SearchableField(
        name="content",
        type=SearchFieldDataType.String,
        sortable=False,
        filterable=False,
        facetable=False,
        searchable=True,
        analyzer_name='en.lucene',
    ),
    SearchableField(
        name="title", type=SearchFieldDataType.String, searchable=False, filterable=False
    ),
    SimpleField(
        name = 'security_groups',
        type = "Collection(Edm.String)",
        filterable = True
    ),
    SearchField(
        name="content_embeddings",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        vector_search_dimensions=3072, #ada3 dimension space
        vector_search_profile_name="myHnswProfile",
    ),
]

# Vectors definition
vector_search = VectorSearch(
    algorithms=[
        HnswAlgorithmConfiguration(
            name="myHnsw",
            parameters=HnswParameters(
                m=4, ef_construction=400, ef_search=500, metric=VectorSearchAlgorithmMetric.COSINE
            ),
        ),
        ExhaustiveKnnAlgorithmConfiguration(
            name="myExhaustiveKnn", parameters=ExhaustiveKnnParameters(metric=VectorSearchAlgorithmMetric.COSINE)
        ),
    ],
    profiles=[VectorSearchProfile(name="myHnswProfile", algorithm_configuration_name="myHnsw")],
)

# Semantic Reranker definition
semantic_config = SemanticConfiguration(
    name="my-semantic-config",
    prioritized_fields=SemanticPrioritizedFields(
        content_fields=[SemanticField(field_name="content")],
    ),
)
semantic_search = SemanticSearch(configurations=[semantic_config])

In [34]:
# index name is the name of the index you want to create or update
index_name = "dummy-index"
index_client = SearchIndexClient(endpoint=os.getenv('AI_SEARCH_ENDPOINT'), credential=AzureKeyCredential(os.getenv('AI_SEARCH_KEY')))  
search_client = SearchClient(endpoint=os.getenv('AI_SEARCH_ENDPOINT'), index_name=index_name, credential=AzureKeyCredential(os.getenv('AI_SEARCH_KEY')))
index = SearchIndex(name=index_name, fields=fields, vector_search=vector_search, semantic_search=semantic_search) 
result = index_client.create_or_update_index(index)  
print(f'{result.name} created')
result = search_client.upload_documents(documents)
#upload might take a while depending on the number and size of documents

dummy-index created


## Fetching Documents from an index in Azure AI Search

In [35]:
def fetch_all_from_index(index_name):
    client = SearchClient(
        index_name=index_name,
        endpoint= os.getenv("AI_SEARCH_ENDPOINT"),
        credential=AzureKeyCredential(os.getenv("AI_SEARCH_KEY")),
    )
    results = client.search(search_text="*")

    return list(results)


In [37]:
index_name = "dummy-index"
_ = fetch_all_from_index(index_name)
_

[{'id': '3',
  'content': 'This is the third dummy chunk of text.',
  'title': 'RedPanda.txt',
  'security_groups': ['Owners'],
  'content_embeddings': [0.003313032,
   -0.016266707,
   -0.021729024,
   0.052561905,
   0.015682686,
   -0.020062845,
   -0.049435675,
   0.05792116,
   -0.007755458,
   0.0137502635,
   0.029269768,
   -0.0153649105,
   0.008047468,
   -0.023498263,
   -0.010890277,
   -0.0012292358,
   0.015098665,
   0.017881354,
   -0.016567307,
   -0.029682018,
   -0.007287382,
   0.00095547584,
   -0.050638072,
   0.07516696,
   0.04108761,
   0.0016114261,
   -0.021797732,
   0.021900794,
   0.004036617,
   0.029991206,
   0.025920235,
   0.015828691,
   -0.009035151,
   -0.04345805,
   0.018293604,
   -0.010967574,
   0.03346098,
   0.026693204,
   0.0035792768,
   -0.017658053,
   0.043595463,
   0.0063855844,
   -0.027053922,
   0.0047365837,
   0.01837949,
   -0.019667773,
   0.010512381,
   0.0042491835,
   -0.017177094,
   0.007995937,
   0.0067634806,
   0.015

## Deleting an index in Azure AI Search

In [26]:
def delete_index(index_name):
    index_client = SearchIndexClient(
        endpoint=os.getenv('AI_SEARCH_ENDPOINT'), credential=AzureKeyCredential(os.getenv('AI_SEARCH_KEY')))
    index_client.delete_index(index_name)
    print(f"{index_name} deleted")


In [27]:
index_name = "dummy-index"
delete_index(index_name)

dummy-index deleted
