# Azure AI Search integrated vectorization pipeline

In [2]:
from dotenv import load_dotenv
from azure.identity import DefaultAzureCredential
from azure.core.credentials import AzureKeyCredential
import os

In [3]:
# Configuration: every setting below is read from environment variables.

# Load variables from a .env file first; override=True lets values from the
# file replace variables that are already set in the process environment.
load_dotenv(override=True)

# Azure AI Search service and the blob storage that holds the source documents.
AZURE_AI_SEARCH_ENDPOINT = os.environ["AZURE_AI_SEARCH_ENDPOINT"]
AZURE_AI_SEARCH_API_KEY = os.environ["AZURE_AI_SEARCH_API_KEY"]
AZURE_STORAGE_ACC_CONNECTION_STRING = os.environ["AZURE_STORAGE_ACC_CONNECTION_STRING"]
BLOB_CONTAINER_NAME = os.environ["BLOB_CONTAINER_NAME"]

# Azure OpenAI resource and its deployments.
AZURE_OPENAI_API_KEY = os.environ["AZURE_OPENAI_API_KEY"]
AZURE_OPENAI_ENDPOINT = os.environ["AZURE_OPENAI_ENDPOINT"]
AZURE_OPENAI_VERSION = os.environ["AZURE_OPENAI_VERSION"]
AZURE_OPENAI_DEPLOYMENT_NAME = os.environ["AZURE_OPENAI_DEPLOYMENT_NAME"]
AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME = os.environ["AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME"]

# Document Intelligence resource.
DOC_INTELLIGENCE_ENDPOINT = os.environ["DOC_INTELLIGENCE_ENDPOINT"]
DOC_INTELLIGENCE_KEY = os.environ["DOC_INTELLIGENCE_KEY"]

# Name of the search index. Read with os.environ[...] like every other setting
# so that a missing variable raises KeyError here; os.getenv would return None
# and let the derived names built from this value become "None-...".
SEARCH_TARGET_INDEX_NAME = os.environ["AZURE_AI_FISHING_INDEX"]

## Connect to Blob Storage and load documents

In [5]:
from azure.storage.blob import BlobServiceClient  
import os

# Build a client for the storage account, then scope it to the container
# named in the configuration.
blob_service_client = BlobServiceClient.from_connection_string(
    AZURE_STORAGE_ACC_CONNECTION_STRING
)
container_client = blob_service_client.get_container_client(BLOB_CONTAINER_NAME)

# Create the container when it is not there yet.
container_missing = not container_client.exists()
if container_missing:
    container_client.create_container()

# Print the name of every blob currently stored in the container.
for blob_name in container_client.list_blob_names():
    print(blob_name)

fishingguide1.pdf
fishingguide2.pdf


## Create a blob data source connector on Azure AI Search

In [8]:
from azure.search.documents.indexes import SearchIndexerClient
from azure.search.documents.indexes.models import (
    SearchIndexerDataContainer,
    SearchIndexerDataSourceConnection
)

# Client for indexer-side resources (data sources, skillsets, indexers).
indexer_client = SearchIndexerClient(
    endpoint=AZURE_AI_SEARCH_ENDPOINT,
    credential=AzureKeyCredential(AZURE_AI_SEARCH_API_KEY),
)

# Describe the blob container as a data source named "<container>-blob".
container = SearchIndexerDataContainer(name=BLOB_CONTAINER_NAME)
data_source_connection = SearchIndexerDataSourceConnection(
    name=f"{BLOB_CONTAINER_NAME}-blob",
    type="azureblob",
    connection_string=AZURE_STORAGE_ACC_CONNECTION_STRING,
    container=container,
)

# Send the definition to the search service and keep the returned object;
# its name is printed below.
data_source = indexer_client.create_or_update_data_source_connection(
    data_source_connection
)

print(f"Data source '{data_source.name}' created or updated")

Data source 'data-blob' created or updated


## Create a search index

In [9]:
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    SearchField,
    SearchFieldDataType,
    VectorSearch,
    HnswAlgorithmConfiguration,
    HnswParameters,
    VectorSearchAlgorithmMetric,
    ExhaustiveKnnAlgorithmConfiguration,
    ExhaustiveKnnParameters,
    VectorSearchProfile,
    AzureOpenAIVectorizer,
    AzureOpenAIParameters,
    SemanticConfiguration,
    SemanticSearch,
    SemanticPrioritizedFields,
    SemanticField,
    SearchIndex
)

# Client used to manage index definitions on the search service.
index_client = SearchIndexClient(
    endpoint=AZURE_AI_SEARCH_ENDPOINT,
    credential=AzureKeyCredential(AZURE_AI_SEARCH_API_KEY),
)

# Index schema. chunk_id is the document key; "vector" holds the embedding and
# is bound to the HNSW profile defined further down.
string_type = SearchFieldDataType.String
fields = [
    SearchField(
        name="parent_id",
        type=string_type,
        sortable=True,
        filterable=True,
        facetable=True,
    ),
    SearchField(name="title", type=string_type),
    SearchField(
        name="chunk_id",
        type=string_type,
        key=True,
        sortable=True,
        filterable=True,
        facetable=True,
        analyzer_name="keyword",
    ),
    SearchField(
        name="chunk",
        type=string_type,
        sortable=False,
        filterable=False,
        facetable=False,
    ),
    SearchField(
        name="vector",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        # NOTE(review): presumably the output size of the embedding deployment - confirm.
        vector_search_dimensions=1536,
        vector_search_profile_name="myHnswProfile",
    ),
]

# Two algorithms are registered, both using cosine similarity: HNSW and
# exhaustive KNN.
cosine = VectorSearchAlgorithmMetric.COSINE
hnsw_algorithm = HnswAlgorithmConfiguration(
    name="myHnsw",
    parameters=HnswParameters(m=4, ef_construction=400, ef_search=500, metric=cosine),
)
exhaustive_knn_algorithm = ExhaustiveKnnAlgorithmConfiguration(
    name="myExhaustiveKnn",
    parameters=ExhaustiveKnnParameters(metric=cosine),
)

# One profile per algorithm; each profile references the same vectorizer.
vector_profiles = [
    VectorSearchProfile(
        name=profile_name,
        algorithm_configuration_name=algorithm_name,
        vectorizer="myOpenAI",
    )
    for profile_name, algorithm_name in (
        ("myHnswProfile", "myHnsw"),
        ("myExhaustiveKnnProfile", "myExhaustiveKnn"),
    )
]

# Vectorizer backed by the Azure OpenAI embedding deployment from the config.
openai_vectorizer = AzureOpenAIVectorizer(
    name="myOpenAI",
    kind="azureOpenAI",
    azure_open_ai_parameters=AzureOpenAIParameters(
        resource_uri=AZURE_OPENAI_ENDPOINT,
        deployment_id=AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME,
        api_key=AZURE_OPENAI_API_KEY,
    ),
)

vector_search = VectorSearch(
    algorithms=[hnsw_algorithm, exhaustive_knn_algorithm],
    profiles=vector_profiles,
    vectorizers=[openai_vectorizer],
)

# Semantic configuration: the chunk text is the only prioritized content field.
chunk_priority = SemanticPrioritizedFields(
    content_fields=[SemanticField(field_name="chunk")]
)
semantic_config = SemanticConfiguration(
    name="my-semantic-config",
    prioritized_fields=chunk_priority,
)
semantic_search = SemanticSearch(configurations=[semantic_config])

# Assemble the index definition and send it to the service.
index = SearchIndex(
    name=SEARCH_TARGET_INDEX_NAME,
    fields=fields,
    vector_search=vector_search,
    semantic_search=semantic_search,
)
result = index_client.create_or_update_index(index)
print(f"INDEX SUCCESSFULLY CREATED: {result.name}")

fishing-index-d created


## Create a skillset

In [10]:
from azure.search.documents.indexes.models import (
    SplitSkill,
    InputFieldMappingEntry,
    OutputFieldMappingEntry,
    AzureOpenAIEmbeddingSkill,
    SearchIndexerIndexProjections,
    SearchIndexerIndexProjectionSelector,
    SearchIndexerIndexProjectionsParameters,
    IndexProjectionMode,
    SearchIndexerSkillset
)

# The skillset is named after the target index.
skillset_name = f"{SEARCH_TARGET_INDEX_NAME}-skillset"

# Skill 1: split /document/content into "pages" with a maximum length of 2000
# and an overlap of 500 (units as defined by SplitSkill).
chunk_input = InputFieldMappingEntry(name="text", source="/document/content")
chunk_output = OutputFieldMappingEntry(name="textItems", target_name="pages")
split_skill = SplitSkill(
    description="Split skill to chunk documents",
    text_split_mode="pages",
    context="/document",
    maximum_page_length=2000,
    page_overlap_length=500,
    inputs=[chunk_input],
    outputs=[chunk_output],
)

# Skill 2: runs once per page (context /document/pages/*) and stores the
# embedding under the name "vector".
page_text_input = InputFieldMappingEntry(name="text", source="/document/pages/*")
embedding_output = OutputFieldMappingEntry(name="embedding", target_name="vector")
embedding_skill = AzureOpenAIEmbeddingSkill(
    description="Skill to generate embeddings via Azure OpenAI",
    context="/document/pages/*",
    resource_uri=AZURE_OPENAI_ENDPOINT,
    deployment_id=AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME,
    api_key=AZURE_OPENAI_API_KEY,
    inputs=[page_text_input],
    outputs=[embedding_output],
)

# Projection: each page is written to the target index with its text, its
# vector and the source blob's name as title; parent_id receives the parent key.
chunk_selector = SearchIndexerIndexProjectionSelector(
    target_index_name=SEARCH_TARGET_INDEX_NAME,
    parent_key_field_name="parent_id",
    source_context="/document/pages/*",
    mappings=[
        InputFieldMappingEntry(name="chunk", source="/document/pages/*"),
        InputFieldMappingEntry(name="vector", source="/document/pages/*/vector"),
        InputFieldMappingEntry(name="title", source="/document/metadata_storage_name"),
    ],
)
# NOTE(review): going by the enum name, parent documents themselves are not
# indexed, only the projected pages - confirm against the SDK documentation.
projection_parameters = SearchIndexerIndexProjectionsParameters(
    projection_mode=IndexProjectionMode.SKIP_INDEXING_PARENT_DOCUMENTS
)
index_projections = SearchIndexerIndexProjections(
    selectors=[chunk_selector],
    parameters=projection_parameters,
)

# Bundle both skills and the projection into one skillset definition.
skillset = SearchIndexerSkillset(
    name=skillset_name,
    description="Skillset to chunk documents and generating embeddings",
    skills=[split_skill, embedding_skill],
    index_projections=index_projections,
)

# Send the skillset definition to the search service.
client = SearchIndexerClient(
    endpoint=AZURE_AI_SEARCH_ENDPOINT,
    credential=AzureKeyCredential(AZURE_AI_SEARCH_API_KEY),
)
client.create_or_update_skillset(skillset)
print(f"SKILLSET SUCCESSFULLY CREATED: {skillset.name}")


SKILLSET SUCCESSFULLY CREATED: fishing-index-d-skillset


## Create an indexer

In [11]:
from azure.search.documents.indexes.models import (
    SearchIndexer,
    FieldMapping
)

# The indexer is named after the target index.
indexer_name = f"{SEARCH_TARGET_INDEX_NAME}-indexer"

# Copy the blob's metadata_storage_name into the index's title field so the
# PDF title can be displayed in search results.
title_mapping = FieldMapping(
    source_field_name="metadata_storage_name",
    target_field_name="title",
)

# The indexer ties together the data source, the skillset and the target index.
indexer = SearchIndexer(
    name=indexer_name,
    description="Indexer to index documents and generate embeddings",
    skillset_name=skillset_name,
    target_index_name=SEARCH_TARGET_INDEX_NAME,
    data_source_name=data_source.name,
    field_mappings=[title_mapping],
)

# Send the indexer definition to the search service.
indexer_client = SearchIndexerClient(
    endpoint=AZURE_AI_SEARCH_ENDPOINT,
    credential=AzureKeyCredential(AZURE_AI_SEARCH_API_KEY),
)
indexer_result = indexer_client.create_or_update_indexer(indexer)

# Request a run. The message below warns that queries may return nothing
# until the run has had time to populate the index.
indexer_client.run_indexer(indexer_name)
print(f' {indexer_name} is created and running. If queries return no results, please wait a bit and try again.')


 fishing-index-d-indexer is created and running. If queries return no results, please wait a bit and try again.


## Perform a vector similarity search

In [12]:
from azure.search.documents import SearchClient
from azure.search.documents.models import VectorizableTextQuery

# Pure vector search: no keyword text is sent, only the vectorized query.
query = "Which is more comprehensive, Northwind Health Plus vs Northwind Standard?"

search_client = SearchClient(
    endpoint=AZURE_AI_SEARCH_ENDPOINT,
    index_name=SEARCH_TARGET_INDEX_NAME,
    credential=AzureKeyCredential(AZURE_AI_SEARCH_API_KEY),
)

# The query is passed as text and matched against the "vector" field.
vector_query = VectorizableTextQuery(
    text=query,
    k_nearest_neighbors=1,
    fields="vector",
    exhaustive=True,
)
# Alternative: pass a precomputed embedding instead of text. This needs
# RawVectorQuery and a generate_embeddings helper, neither of which is set up in this cell:
# vector_query = RawVectorQuery(vector=generate_embeddings(query), k_nearest_neighbors=3, fields="vector")

returned_fields = ["parent_id", "chunk_id", "chunk"]
results = search_client.search(
    search_text=None,
    vector_queries=[vector_query],
    select=returned_fields,
    top=1,
)

# Print the ids, score and chunk text of each hit.
for hit in results:
    print(f"parent_id: {hit['parent_id']}")
    print(f"chunk_id: {hit['chunk_id']}")
    print(f"Score: {hit['@search.score']}")
    print(f"Content: {hit['chunk']}")


parent_id: aHR0cHM6Ly9zdG9yYWdlYWNjb3VudHZsYWR5ODE0MS5ibG9iLmNvcmUud2luZG93cy5uZXQvZGF0YS9maXNoaW5nZ3VpZGUyLnBkZg2
chunk_id: c8d41191c3e1_aHR0cHM6Ly9zdG9yYWdlYWNjb3VudHZsYWR5ODE0MS5ibG9iLmNvcmUud2luZG93cy5uZXQvZGF0YS9maXNoaW5nZ3VpZGUyLnBkZg2_pages_2
Score: 0.7718277
Content: Regulations ...........................................................................  42

Invasive Species .................................................................................  45

More Information ...............................................................................  46

Map of State Parks That Have Fishing.........................................  48



2

Learn to Fish
BASIC FISHING TACKLE

Hooks 

Hooks come in an assortment of sizes and styles. Circle hooks are 
great for beginners and safe for fish. If you plan to release your catch, 
use barbless hooks or bend down the barb to make it easier to remove 
the hook. Choose the size of hook for the species of fish you are trying 
to catc

## Perform a hybrid search

In [14]:
# Hybrid search: the same query string is sent both as keyword text and as a
# vectorizable text query.
query = "What is the population of Norway?"

search_client = SearchClient(
    endpoint=AZURE_AI_SEARCH_ENDPOINT,
    index_name=SEARCH_TARGET_INDEX_NAME,
    credential=AzureKeyCredential(AZURE_AI_SEARCH_API_KEY),
)
vector_query = VectorizableTextQuery(
    text=query,
    k_nearest_neighbors=1,
    fields="vector",
    exhaustive=True,
)

returned_fields = ["parent_id", "chunk_id", "chunk"]
results = search_client.search(
    search_text=query,
    vector_queries=[vector_query],
    select=returned_fields,
    top=2,
)

# Print the ids, score and chunk text of each hit.
for hit in results:
    print(f"parent_id: {hit['parent_id']}")
    print(f"chunk_id: {hit['chunk_id']}")
    print(f"Score: {hit['@search.score']}")
    print(f"Content: {hit['chunk']}")


parent_id: aHR0cHM6Ly9zdG9yYWdlYWNjb3VudHZsYWR5ODE0MS5ibG9iLmNvcmUud2luZG93cy5uZXQvZGF0YS9maXNoaW5nZ3VpZGU0LnBkZg2
chunk_id: b748f0d7f7e0_aHR0cHM6Ly9zdG9yYWdlYWNjb3VudHZsYWR5ODE0MS5ibG9iLmNvcmUud2luZG93cy5uZXQvZGF0YS9maXNoaW5nZ3VpZGU0LnBkZg2_pages_12
Score: 0.03055555745959282
Content: vessels and processing plants are largely privately owned and run, with most 
services and infrastructure being public.

Supply and demand

Norwegians have an annual supply of 54.7 kg of fish and fish products per caput. Over the 
last decade there has been an increase in consumption by 30 to 50 year olds, while other 
age groups have started to eat less fish and fish any products. There is a long tradition of 
fish consumption in Norway, with about 30 per cent originating from household 
recreational fishing in the coastal zone.

Fish accounts for approximately 16 per cent of the average daily protein intake of 
Norwegians (17.1 g per day per capita). This is less than provided by meat (20.7 g per day 
