In [None]:
from datetime import timedelta
from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes import SearchIndexerClient, SearchIndexClient
from azure.search.documents.indexes.models import (
    SearchIndexerDataSourceConnection,
    SearchIndexerDataContainer,
    SearchIndexerSkillset,
    WebApiSkill,
    AzureOpenAIEmbeddingSkill,
    InputFieldMappingEntry,
    OutputFieldMappingEntry,
    SearchIndexer,
    SearchIndex,
    SearchField,
    SearchFieldDataType,
    VectorSearch,
    VectorSearchProfile,
    HnswAlgorithmConfiguration,
    AzureOpenAIVectorizer,
    AzureOpenAIVectorizerParameters,
    SemanticConfiguration,
    SemanticPrioritizedFields,
    SemanticField,
    SemanticSearch,
    IndexingParameters,
    SearchIndexerIndexProjection,
    SearchIndexerIndexProjectionsParameters,
    IndexProjectionMode,
    SearchIndexerIndexProjectionSelector,
    IndexingSchedule,
)

In [None]:
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Azure AI Search service endpoint and the API key used to authenticate the clients.
search_endpoint = os.environ.get("SEARCH_ENDPOINT")
search_client_key = os.environ.get("SEARCH_CLIENT_KEY")

# Blob storage connection and container used for the indexer data source.
storage_connection_string = os.environ.get("STORAGE_CONNECTION_STRING")
blob_container_name = os.environ.get("BLOB_CONTAINER_NAME")

# Endpoint of the custom Web API skill.
azure_function_url = os.environ.get("AZURE_FUNCTION_URL")

# Azure OpenAI resource and embedding deployment used by both the skillset
# (indexing time) and the index vectorizer definition.
azure_openai_resource_url = os.environ.get("AZURE_OPENAI_RESOURCE_URL")
azure_openai_api_key = os.environ.get("AZURE_OPENAI_API_KEY")
azure_openai_embedding_model_name = os.environ.get("AZURE_OPENAI_EMBEDDING_MODEL_NAME")
azure_openai_embedding_deployment_name = os.environ.get("AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME")

# Embedding size; falls back to 3072 when the variable is not set.
azure_openai_embedding_dimensions = int(
    os.environ.get("AZURE_OPENAI_EMBEDDING_DIMENSIONS", 3072)
)

In [12]:
# Names of the four search-service resources this notebook creates or updates.
# Later cells refer to these variables so the resources stay wired together.

# Where the data comes from and how it is enriched.
data_source_name = "video-blob-ds"
skillset_name = "video-processing-skillset"

# Where the enriched chunks land and the indexer that drives the pipeline.
index_name = "video-index"
indexer_name = "video-indexer"

In [None]:
# Both management clients talk to the same service with the same API key,
# so build the key credential once and share it.
search_credential = AzureKeyCredential(search_client_key)

# Used below to create/update the index definition.
index_client = SearchIndexClient(endpoint=search_endpoint, credential=search_credential)

# Used below to create/update the data source, skillset and indexer.
indexer_client = SearchIndexerClient(endpoint=search_endpoint, credential=search_credential)

### Create index

In [14]:
# Index schema: one search document per transcript segment ("chunk") of a video.
fields = [
    # Key of the parent (blob) document each chunk was projected from.
    SearchField(name="parent_id", type=SearchFieldDataType.String, filterable=True, sortable=True),
    # Document key. Index projections expect the key field to use the keyword
    # analyzer (see the index projections documentation).
    SearchField(name="chunk_id", type=SearchFieldDataType.String, key=True, filterable=True, analyzer_name="keyword"),
    SearchField(name="id", type=SearchFieldDataType.String, searchable=True, sortable=True, filterable=True),
    SearchField(name="video_name", type=SearchFieldDataType.String, searchable=True, filterable=True),
    SearchField(name="text", type=SearchFieldDataType.String, searchable=True),
    # NOTE(review): times are stored as strings, so sorting/filtering on them is
    # presumably lexicographic - confirm the format produced by the function sorts correctly.
    SearchField(name="start_time", type=SearchFieldDataType.String, filterable=True, sortable=True),
    SearchField(name="end_time", type=SearchFieldDataType.String, filterable=True, sortable=True),
    # Embedding of `text`; its size must match the embedding skill's `dimensions`.
    SearchField(
        name="vector",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        vector_search_dimensions=azure_openai_embedding_dimensions,
        vector_search_profile_name="vector-profile"
    )
]

vector_search = VectorSearch(
    profiles=[
        VectorSearchProfile(
            name="vector-profile",
            algorithm_configuration_name="hnsw-config",
            # Attach the vectorizer declared below. Without this reference the
            # vectorizer is defined but never used, and text-to-vector queries
            # against the `vector` field cannot be vectorized by the service.
            vectorizer_name="azure-openai-vectorizer",
        )
    ],
    algorithms=[
        HnswAlgorithmConfiguration(name="hnsw-config")
    ],
    vectorizers=[
        # Query-time vectorizer; uses the same Azure OpenAI deployment as the
        # embedding skill so query and document vectors are comparable.
        AzureOpenAIVectorizer(
            vectorizer_name="azure-openai-vectorizer",
            parameters=AzureOpenAIVectorizerParameters(
                resource_url=azure_openai_resource_url,
                deployment_name=azure_openai_embedding_deployment_name,
                model_name=azure_openai_embedding_model_name,
                api_key=azure_openai_api_key,
            )
        )
    ]
)

# Semantic ranking reads the chunk text as its content field.
semantic_config = SemanticConfiguration(
    name="semantic-config",
    prioritized_fields=SemanticPrioritizedFields(
        content_fields=[SemanticField(field_name="text")]
    )
)

semantic_search = SemanticSearch(
    default_configuration_name="semantic-config",
    configurations=[semantic_config]
)

# Create the index, or update it in place if it already exists.
index = SearchIndex(name=index_name, fields=fields, vector_search=vector_search, semantic_search=semantic_search)
index_client.create_or_update_index(index)
print("✅ Index with vector field created")

✅ Index with vector field created


### Create data source

In [15]:
# The blob container the data source reads from.
blob_container = SearchIndexerDataContainer(name=blob_container_name)

# Blob-storage data source connection; the indexer refers to it by name.
data_source = SearchIndexerDataSourceConnection(
    name=data_source_name,
    type="azureblob",
    container=blob_container,
    connection_string=storage_connection_string,
)

# Create the data source, or update it in place if it already exists.
indexer_client.create_or_update_data_source_connection(data_source)
print("✅ Data source created")

✅ Data source created


### Create skillset

In [16]:
# Custom skill: sends each blob's metadata to the Azure Function and receives
# a `segments` collection back, stored at /document/segments.
web_skill = WebApiSkill(
    name="video-processing-skill",
    description="Extract video metadata via Azure Function",
    context="/document",
    uri=azure_function_url,
    # One document per request, one request at a time, with a 230 s timeout.
    batch_size=1,
    degree_of_parallelism=1,
    timeout=timedelta(seconds=230),
    inputs=[
        InputFieldMappingEntry(name="name", source="/document/metadata_storage_name"),
        InputFieldMappingEntry(name="blob_url", source="/document/metadata_storage_path"),
        InputFieldMappingEntry(name="sas_token", source="/document/metadata_storage_sas_token"),
        InputFieldMappingEntry(name="content_type", source="/document/metadata_storage_content_type"),
    ],
    outputs=[
        OutputFieldMappingEntry(name="segments", target_name="segments"),
    ]
)

# Embedding skill: runs once per segment (context /document/segments/*) and
# writes the embedding next to the segment as /document/segments/*/vector.
embeddings_skill = AzureOpenAIEmbeddingSkill(
    name="text-embedding-skill",
    description="Generate embeddings for text via Azure OpenAI",
    context="/document/segments/*",
    inputs=[
        InputFieldMappingEntry(name="text", source="/document/segments/*/text")
    ],
    outputs=[
        OutputFieldMappingEntry(name="embedding", target_name="vector")
    ],
    resource_url=azure_openai_resource_url,
    api_key=azure_openai_api_key,
    deployment_name=azure_openai_embedding_deployment_name,
    model_name=azure_openai_embedding_model_name,
    dimensions=azure_openai_embedding_dimensions,
)

# Index projection: each segment becomes its own search document in the target
# index, linked back to the source blob through `parent_id`.
index_projection = SearchIndexerIndexProjection(
    selectors=[
        SearchIndexerIndexProjectionSelector(
            # Use the shared `index_name` rather than a repeated literal so the
            # projection always targets the index created earlier in the notebook.
            target_index_name=index_name,
            parent_key_field_name="parent_id",
            source_context="/document/segments/*",
            mappings=[
                InputFieldMappingEntry(name="start_time", source="/document/segments/*/start_time"),
                InputFieldMappingEntry(name="id", source="/document/segments/*/id"),
                InputFieldMappingEntry(name="end_time", source="/document/segments/*/end_time"),
                InputFieldMappingEntry(name="text", source="/document/segments/*/text"),
                InputFieldMappingEntry(name="video_name", source="/document/segments/*/video_name"),
                InputFieldMappingEntry(name="vector", source="/document/segments/*/vector"),
            ]
        )
    ],
    # Only the projected segment documents are indexed; the parent blob document is skipped.
    parameters=SearchIndexerIndexProjectionsParameters(
        projection_mode=IndexProjectionMode.SKIP_INDEXING_PARENT_DOCUMENTS
    ),
)

skillset = SearchIndexerSkillset(
    name=skillset_name,
    description="Enrich video blobs and generate text embeddings",
    skills=[web_skill, embeddings_skill],
    index_projection=index_projection,
)

# Create the skillset, or update it in place if it already exists.
indexer_client.create_or_update_skillset(skillset)
print("✅ Skillset created")

✅ Skillset created


### Create indexer

In [17]:
# Restrict the indexer to the listed media file extensions.
# NOTE(review): the REST API spells this setting `indexedFileNameExtensions`;
# confirm the SDK accepts/translates this snake_case dict key, or switch to
# `IndexingParametersConfiguration(indexed_file_name_extensions=...)`.
indexing_params = IndexingParameters(
    configuration={"indexed_file_name_extensions": ".mp4,.mov,.mp3,.wav"},
)

# Re-run the indexer every five minutes.
indexing_schedule = IndexingSchedule(interval=timedelta(minutes=5))

# Tie the data source, skillset and target index together.
indexer = SearchIndexer(
    name=indexer_name,
    data_source_name=data_source_name,
    skillset_name=skillset_name,
    target_index_name=index_name,
    parameters=indexing_params,
    schedule=indexing_schedule,
)

# Create the indexer, or update it in place if it already exists.
indexer_client.create_or_update_indexer(indexer)
print("✅ Indexer created and started")

✅ Indexer created and started
