## Keys for Authentication

In [69]:
import streamlit as st
st.title("My Streamlit App")
st.write("Hello, world!")

2025-12-11 13:20:34.992 
  command:

    streamlit run c:\Users\soarora\AppData\Local\Programs\Python\Python311\Lib\site-packages\ipykernel_launcher.py [ARGUMENTS]


In [53]:
import os
from dotenv import load_dotenv
load_dotenv()

AZURE_SEARCH_SERVICE: str = os.getenv("AZURE_SEARCH_SERVICE")
AZURE_SEARCH_KEY: str = os.getenv("AZURE_SEARCH_KEY")
AZURE_OPENAI_ACCOUNT: str = os.getenv("AZURE_OPENAI_ACCOUNT")
AZURE_OPENAI_KEY: str = os.getenv("AZURE_OPENAI_KEY")
AZURE_AI_MULTISERVICE_ACCOUNT: str = os.getenv("AZURE_AI_MULTISERVICE_ACCOUNT")
AZURE_AI_MULTISERVICE_KEY: str = os.getenv("AZURE_AI_MULTISERVICE_KEY")
AZURE_STORAGE_CONNECTION: str = os.getenv("AZURE_STORAGE_CONNECTION")


In [54]:
print(AZURE_SEARCH_SERVICE)

https://capstoneindex.search.windows.net


## Creating an Index

In [55]:
from azure.identity import DefaultAzureCredential
from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    SearchField,
    SearchFieldDataType,
    VectorSearch,
    HnswAlgorithmConfiguration,
    VectorSearchProfile,
    AzureOpenAIVectorizer,
    AzureOpenAIVectorizerParameters,
    SemanticConfiguration,
    SemanticPrioritizedFields,
    SemanticField,
    SemanticSearch,
    SearchIndex
)

credential = DefaultAzureCredential()

index_name = "sow-index"
index_client = SearchIndexClient(endpoint= AZURE_SEARCH_SERVICE, credential = credential)

field = [
    SearchField(name="id", type=SearchFieldDataType.String),
    SearchField(name="title", type=SearchFieldDataType.String),
    SearchField(name="chunk_id", type=SearchFieldDataType.String, sortable=True, filterable=True, key=True, facetable=True, analyzer_name="keyword"),
    SearchField(name="chunk", type=SearchFieldDataType.String, sortable=False, filterable=False, facetable=False),
    SearchField(
        name="text_vector",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        vector_search_dimensions=3072,
        vector_search_profile_name="myHnswProfile"
    )
]

vector_search = VectorSearch(
    algorithms=[HnswAlgorithmConfiguration(name="myHnsw"),
                ],
                profiles=[
                    VectorSearchProfile(
                        name="myHnswProfile",
                        algorithm_configuration_name="myHnsw",
                        vectorizer_name="myOpenAIVectorizer"
                    )
                ],
                vectorizers=[
                    AzureOpenAIVectorizer(
                        vectorizer_name="myOpenAIVectorizer",
                        kind = "azureOpenAI",
                        parameters=AzureOpenAIVectorizerParameters(
                            resource_url=AZURE_OPENAI_ACCOUNT,
                            deployment_name="text-embedding-3-large",
                            model_name="text-embedding-3-large",
                            api_key=AZURE_OPENAI_KEY
                        )
                    )
                ],
            )

# Semantic understanding rather than just keyword matching or vector similarity alone
semantic_config = SemanticConfiguration(
    name="my-semantic-config",
    prioritized_fields=SemanticPrioritizedFields(
        title_field = SemanticField(field_name="title"),
        content_fields = [SemanticField(field_name="chunk")]
    )
)

semantic_search = SemanticSearch(configurations=[semantic_config])

# Create the index
index = SearchIndex(
    name=index_name,
    fields=field,
    vector_search=vector_search,
    semantic_search=semantic_search
)

result = index_client.create_or_update_index(index)
print("Index created:", result.name)


Index created: sow-index


## Connecting Index to Data Source

In [56]:
from azure.search.documents.indexes import SearchIndexerClient
from azure.search.documents.indexes.models import (
    SearchIndexerDataContainer,
    SearchIndexerDataSourceConnection)

indexer_client = SearchIndexerClient(endpoint= AZURE_SEARCH_SERVICE, credential = credential)

container = SearchIndexerDataContainer(name="sow-container")
data_source_connection = SearchIndexerDataSourceConnection(
    name="sow-datasource",
    type="azureblob",
    connection_string=AZURE_STORAGE_CONNECTION,
    container=container
)

data_source = indexer_client.create_or_update_data_source_connection(data_source_connection)
print("Data source created:", data_source.name)

Data source created: sow-datasource


## Skillset for Chunking and Embeddings Creation

In [61]:
from azure.search.documents.indexes.models import (
    SplitSkill,
    InputFieldMappingEntry,
    OutputFieldMappingEntry,
    AzureOpenAIEmbeddingSkill,
    SearchIndexerIndexProjection,
    SearchIndexerIndexProjectionSelector,
    SearchIndexerIndexProjectionsParameters,
    IndexProjectionMode,
    CognitiveServicesAccountKey,
    SearchIndexerSkillset
)

skillset_name = "statement-of-work-skillset"

split_skill = SplitSkill(
    name="split-skill",
    description="Splits skill to chunk documents",
    text_split_mode="pages",
    context="/document",
    inputs = [InputFieldMappingEntry(name="text", source="/document/content")],
    outputs = [OutputFieldMappingEntry(name="textItems", target_name="pages")], # Produces chunked text items called "textItems" and stores them in a field named "pages"
    maximum_page_length=2000, # maximum characters in each chunk/page
    page_overlap_length=500
)

embedding_skill = AzureOpenAIEmbeddingSkill(
    description="Generates embeddings for text chunks",
    name="azure-openai-embedding-skill",
    context="/document/pages/*", # apply to each chunk in the page
    resource_url=AZURE_OPENAI_ACCOUNT,
    deployment_name="text-embedding-3-large",
    model_name="text-embedding-3-large",
    api_key=AZURE_OPENAI_KEY,
    inputs=[InputFieldMappingEntry(name="text", source="/document/pages/*")],
    outputs=[OutputFieldMappingEntry(name="embedding", target_name="text_vector")], 
)

# map enriched data from skillset pipeline to the index

index_projections = SearchIndexerIndexProjection (
    selectors=[
        SearchIndexerIndexProjectionSelector(
            target_index_name=index_name, # which index to write to
            parent_key_field_name="id", # links each chunk to its parent/source document
            source_context="/document/pages/*",
            mappings=[ # how to map data to index fields
                InputFieldMappingEntry(name="chunk", source="/document/pages/*"),
                InputFieldMappingEntry(name="text_vector", source="/document/pages/*/text_vector"),
                InputFieldMappingEntry(name="title", source="/document/metadata_storage_name"),
            ],
        ),
    ],
    parameters=SearchIndexerIndexProjectionsParameters(
        projection_mode=IndexProjectionMode.SKIP_INDEXING_PARENT_DOCUMENTS #only index the chunk, not the entire parent document
    )
)

cognitive_services_account = CognitiveServicesAccountKey(
    key=AZURE_AI_MULTISERVICE_KEY)
skills = [split_skill, embedding_skill]

skillset = SearchIndexerSkillset(
    name=skillset_name,
    description="Skillset for processing statement of work documents",
    skills=skills,
    index_projection=index_projections,
    cognitive_services_account=cognitive_services_account)

client = SearchIndexerClient(endpoint= AZURE_SEARCH_SERVICE, credential = credential)
created_skillset = client.create_or_update_skillset(skillset)
print("Skillset created:", created_skillset.name)

Skillset created: statement-of-work-skillset


## Create an Indexer

The indexer is the orchestrator that automates the entire data ingestion pipeline. 

Pulls data -> Chunking + Embeddings -> Write processed data to target index

In [62]:
from azure.search.documents.indexes.models import SearchIndexer

indexer_name = "sow-indexer"
indexer_parameters = None

indexer = SearchIndexer(
    name=indexer_name,
    description="Indexer for statement of work documents",
    skillset_name=skillset_name,
    target_index_name=index_name,
    data_source_name=data_source.name,
    parameters=indexer_parameters
)

indexer_client = SearchIndexerClient(endpoint= AZURE_SEARCH_SERVICE, credential = credential)
created_indexer = indexer_client.create_or_update_indexer(indexer)
print("Indexer created:", created_indexer.name)

Indexer created: sow-indexer


## Check indexer status

In [63]:
status = indexer_client.get_indexer_status(indexer_name)
print("Indexer status:", status.status)

Indexer status: running


In [64]:
# Check detailed indexer execution history
status = indexer_client.get_indexer_status(indexer_name)

print(f"Indexer Status: {status.status}")
print(f"Last Result: {status.last_result.status if status.last_result else 'No runs yet'}")

if status.last_result:
    print(f"\nExecution Summary:")
    print(f"  - Items Processed: {status.last_result.item_count}")
    print(f"  - Items Failed: {status.last_result.failed_item_count}")
    print(f"  - Start Time: {status.last_result.start_time}")
    print(f"  - End Time: {status.last_result.end_time}")
    
    if status.last_result.errors:
        print(f"\n❌ ERRORS ({len(status.last_result.errors)}):")
        for error in status.last_result.errors[:5]:  # Show first 5 errors
            print(f"  - Key: {error.key}")
            print(f"    Error: {error.error_message}")
            print(f"    Details: {error.details if hasattr(error, 'details') else 'N/A'}")
            print()
    
    if status.last_result.warnings:
        print(f"\n⚠️  WARNINGS ({len(status.last_result.warnings)}):")
        for warning in status.last_result.warnings[:5]:
            print(f"  - Key: {warning.key}")
            print(f"    Warning: {warning.message}")
            print()

# Check if indexer needs to run
if status.status == "running":
    print("\n✓ Indexer is currently running. Wait for it to complete.")
elif status.last_result is None:
    print("\n⚠️  Indexer has never run. Running it now...")
    indexer_client.run_indexer(indexer_name)
    print("Indexer started. Check status again in a few moments.")
else:
    print(f"\n✓ Indexer last ran: {status.last_result.end_time}")

Indexer Status: running
Last Result: success

Execution Summary:
  - Items Processed: 1
  - Items Failed: 0
  - Start Time: 2025-12-11 07:02:46.312000+00:00
  - End Time: 2025-12-11 07:02:54.144000+00:00

✓ Indexer is currently running. Wait for it to complete.


## Adding the LLM

In [None]:
from openai import AzureOpenAI
from azure.search.documents import SearchClient # client to interact with the Search Index
from azure.search.documents.models import VectorizableTextQuery

deployment_name = "gpt-4o"

open_ai_client = AzureOpenAI(
    api_version="2025-01-01-preview",
    azure_endpoint=AZURE_OPENAI_ACCOUNT,
    api_key=AZURE_OPENAI_KEY
)

search_client = SearchClient(
    endpoint=AZURE_SEARCH_SERVICE,
    index_name=index_name,
    credential=AzureKeyCredential(AZURE_SEARCH_KEY)
)

# Instructions for the model
GROUNDED_PROMPT="""
You are an AI assistant that helps users learn from the information found in the source material.
Answer the query using only the sources provided below.
Use bullets if the answer has multiple points.
If the answer is longer than 3 sentences, provide a summary.
Answer ONLY with the facts listed in the list of sources below. Cite your source when you answer the question
If there isn't enough information below, say you don't know.
Do not generate answers that don't use the sources below.
Query: {query}
Sources:\n{sources}
"""

query = "Tell me something about the Contoso project deliverables"
vectorized_query = VectorizableTextQuery(text=query, k_nearest_neighbors=5, fields=["text_vector"])

results = search_client.search(
    query_type="semantic",
    semantic_configuration_name="my-semantic-config",
    search_text=query, 
    vector_queries=[vectorized_query], 
    select=["title", "chunk"],  #which fields to return
    top=5)

response = open_ai_client.chat.completions.create(
    model=deployment_name,
    messages=[
        {
            "role": "system",
            "content": "You are a helpful assistant that helps people find information."
        },
        {
            "role": "user",
            "content": GROUNDED_PROMPT.format(
                query=query,
                sources="\n".join([f"- {doc['chunk']} (Source: {doc['title']})" for doc in results])
            )
        }
    ]
)
llm_response = response.choices[0].message.content

print("Answer:", response.choices[0].message.content)

Answer: The Contoso project includes the following key deliverables:

- **Disaster Recovery and Business Continuity Planning**: Includes the planning and execution of measures to ensure disaster recovery and business continuity.
- **Tiered Support Plan**: Development of a plan for tiered support of the application.
- **Adoption Change Management**: A plan for training users on the newly deployed TPA application to maintain productivity.
- **User Acceptance Testing (UAT)**: Includes a plan for UAT execution, as well as preparation of UAT test data and test cases.
- **Security Procedures and Policies**: Sharing of security procedures and policies before the project starts.

Additionally, service deliverables are categorized into:
- **Document Deliverables**: Such as Word, Excel, Visio, or Project files.
- **Functioning Components or Solution Deliverables**: Such as epics and features.

For deliverables to be accepted:
- Contoso will review and approve the service deliverables at specifie

## View Documents in Index

In [52]:
# Get total document count

index_stats = index_client.get_index(index_name)
print(f"Index: {index_name}")
print(f"Total fields: {len(index_stats.fields)}")

# Count documents (approximate)
count_results = search_client.search(search_text="*", include_total_count=True)
print(f"Approximate document count: {count_results.get_count()}")

Index: sow-index
Total fields: 5
Approximate document count: 0
