# Azure AI Search with Storage Account Data

This notebook demonstrates how to:
1. Connect to an Azure Storage account
2. Load data from storage instead of GitHub
3. Create and populate an Azure AI Search index
4. Set up a knowledge agent for retrieval

In [1]:
from dotenv import load_dotenv
from azure.identity import DefaultAzureCredential
from azure.storage.blob import BlobServiceClient
from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    SearchIndex, SearchField, VectorSearch, VectorSearchProfile, 
    HnswAlgorithmConfiguration, AzureOpenAIVectorizer, AzureOpenAIVectorizerParameters,
    SemanticSearch, SemanticConfiguration, SemanticPrioritizedFields, SemanticField,
    KnowledgeAgent, KnowledgeAgentAzureOpenAIModel, KnowledgeAgentTargetIndex, KnowledgeAgentRequestLimits
)
from azure.search.documents import SearchIndexingBufferedSender
import json
import os
import requests

In [2]:
# Load environment variables (values from a local .env file take precedence
# because override=True is passed)
load_dotenv(override=True)

# Azure Storage Configuration
storage_account_name = os.getenv("AZURE_STORAGE_ACCOUNT_NAME", "your-storage-account")
storage_account_key = os.getenv("AZURE_STORAGE_ACCOUNT_KEY")  # Optional: can use DefaultAzureCredential instead
storage_container_name = os.getenv("AZURE_STORAGE_CONTAINER_NAME", "documents")
storage_blob_name = os.getenv("AZURE_STORAGE_BLOB_NAME", "nasa-documents.json")  # Your JSON file in storage

# Azure Search Configuration
search_endpoint = os.environ["AZURE_SEARCH_ENDPOINT"]  # required: KeyError when unset
# The Search key has its own variable (the one listed in the environment-variable
# section of this notebook); the Azure OpenAI key is a different secret and must
# not be sent to the Search service.
search_api_key = os.getenv("AZURE_SEARCH_API_KEY")

# Use API key if available, otherwise use managed identity
if search_api_key:
    print("🔑 Using Azure Search API Key authentication")
    search_credential = AzureKeyCredential(search_api_key)
else:
    print("🔐 Using Managed Identity authentication")
    # Optional client id; None is passed through when the variable is unset
    managed_identity_client_id = os.getenv("MANAGED_IDENTITY_CLIENT_ID")
    search_credential = DefaultAzureCredential(managed_identity_client_id=managed_identity_client_id)

# Azure OpenAI Configuration
azure_openai_endpoint = os.environ["AZURE_OPENAI_ENDPOINT"]  # required: KeyError when unset
azure_openai_gpt_deployment = os.getenv("AZURE_OPENAI_GPT_DEPLOYMENT", "gpt-4o")
azure_openai_gpt_model = os.getenv("AZURE_OPENAI_GPT_MODEL", "gpt-4o")
azure_openai_embedding_deployment = os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYMENT", "text-embedding-3-large")
azure_openai_embedding_model = os.getenv("AZURE_OPENAI_EMBEDDING_MODEL", "text-embedding-3-large")

# General Configuration
index_name = os.getenv("AZURE_SEARCH_INDEX", "earth_at_night_storage")
agent_name = os.getenv("AZURE_SEARCH_AGENT_NAME", "earth-search-agent-storage")

# Only non-secret settings are echoed; keys are never printed
print("✅ Configuration loaded successfully")
print(f"Storage Account: {storage_account_name}")
print(f"Container: {storage_container_name}")
print(f"Blob: {storage_blob_name}")
print(f"Search Index: {index_name}")
print(f"Agent Name: {agent_name}")

🔑 Using Azure Search API Key authentication
✅ Configuration loaded successfully
Storage Account: storageagenticaidemo
Container: demo
Blob: documents.json
Search Index: earth_at_night_storage
Agent Name: earth-search-agent-storage


In [3]:
# Method 1: Connect to Azure Storage using Account Key
def connect_with_account_key():
    """Return a BlobServiceClient built from an account-key connection string.

    Reads the module-level ``storage_account_name`` and ``storage_account_key``.

    Raises:
        ValueError: when ``storage_account_key`` is empty or unset.
    """
    # Guard clause: without a key there is nothing to build a connection string from
    if not storage_account_key:
        raise ValueError("AZURE_STORAGE_ACCOUNT_KEY not found in environment variables")

    segments = (
        "DefaultEndpointsProtocol=https",
        f"AccountName={storage_account_name}",
        f"AccountKey={storage_account_key}",
        "EndpointSuffix=core.windows.net",
    )
    return BlobServiceClient.from_connection_string(";".join(segments))

# Method 2: Connect to Azure Storage using DefaultAzureCredential (Recommended)
def connect_with_managed_identity():
    """Return a BlobServiceClient for the account's blob endpoint.

    The endpoint URL is derived from the module-level ``storage_account_name``
    and the client authenticates with a fresh ``DefaultAzureCredential``.
    """
    return BlobServiceClient(
        account_url=f"https://{storage_account_name}.blob.core.windows.net",
        credential=DefaultAzureCredential(),
    )

# Choose the connection method from the configured credentials, then probe the
# client by listing containers.
try:
    if not storage_account_key:
        print("🔐 Connecting with Managed Identity...")
        blob_service_client = connect_with_managed_identity()
    else:
        print("🔑 Connecting with Account Key...")
        blob_service_client = connect_with_account_key()

    print("✅ Connected to Azure Storage successfully")

    # Best-effort probe: a listing failure is reported but the client is kept
    try:
        containers = list(blob_service_client.list_containers())
        print(f"📦 Found {len(containers)} containers:")
        for container in containers[:5]:  # Show first 5
            print(f"   - {container.name}")
    except Exception as list_error:
        print(f"⚠️  Could not list containers: {list_error}")

except Exception as connect_error:
    print(f"❌ Failed to connect to storage: {connect_error}")
    # One print call, one line per argument: same output as separate prints
    print(
        "\n💡 Make sure to set these environment variables:",
        "   - AZURE_STORAGE_ACCOUNT_NAME",
        "   - AZURE_STORAGE_ACCOUNT_KEY (or use managed identity)",
        "   - AZURE_STORAGE_CONTAINER_NAME",
        "   - AZURE_STORAGE_BLOB_NAME",
        sep="\n",
    )

🔑 Connecting with Account Key...
✅ Connected to Azure Storage successfully
📦 Found 1 containers:
   - demo
📦 Found 1 containers:
   - demo


In [4]:
# Load data from Azure Storage
def load_data_from_storage():
    """Download the configured blob, parse it as JSON and return the result.

    Uses the module-level ``blob_service_client``, ``storage_container_name``
    and ``storage_blob_name``. If any step raises, the error is printed and the
    result of ``load_data_from_github_backup()`` is returned instead.
    """
    try:
        print(f"📥 Loading data from storage: {storage_container_name}/{storage_blob_name}")

        # Resolve the blob, download it and read the whole payload in one chain
        raw_payload = (
            blob_service_client
            .get_blob_client(container=storage_container_name, blob=storage_blob_name)
            .download_blob()
            .readall()
        )

        parsed = json.loads(raw_payload)
        print(f"✅ Successfully loaded {len(parsed)} documents from storage")
        return parsed

    except Exception as storage_error:
        print(f"❌ Failed to load data from storage: {storage_error}")
        print("\n🔄 Falling back to GitHub URL as backup...")
        return load_data_from_github_backup()

def load_data_from_github_backup():
    """Backup method: load the sample documents from GitHub if storage fails.

    Returns:
        The parsed JSON body of the sample file, or an empty list when the
        download or parsing fails (the error is printed, not re-raised).
    """
    try:
        url = "https://raw.githubusercontent.com/Azure-Samples/azure-search-sample-data/refs/heads/main/nasa-e-book/earth-at-night-json/documents.json"
        # requests has no default timeout; bound the wait so an unresponsive
        # host cannot hang the notebook cell indefinitely.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        documents = response.json()
        print(f"✅ Backup: Loaded {len(documents)} documents from GitHub")
        return documents
    except Exception as e:
        print(f"❌ Backup also failed: {e}")
        return []

# Load the documents and print a short summary of what came back
documents = load_data_from_storage()

if not documents:
    print("❌ No documents loaded. Please check your storage configuration.")
else:
    # A truthy container is non-empty, so the first element can be previewed directly
    print("\n📊 Data Summary:")
    print(f"   Total documents: {len(documents)}")
    print(f"   Sample document keys: {list(documents[0].keys())}")
    print(f"   First document preview: {str(documents[0])[:200]}...")

📥 Loading data from storage: demo/documents.json
✅ Successfully loaded 194 documents from storage

📊 Data Summary:
   Total documents: 194
   Sample document keys: ['id', 'page_chunk', 'page_embedding_text_3_large', 'page_number']
   First document preview: {'id': 'earth_at_night_508_page_100_verbalized', 'page_chunk': '# Human Light Sources\n\n## Figure: Human Activity at Night as Seen from Space\n\n**[Verbalized Content of the Figure]**\n\nThe figure s...
✅ Successfully loaded 194 documents from storage

📊 Data Summary:
   Total documents: 194
   Sample document keys: ['id', 'page_chunk', 'page_embedding_text_3_large', 'page_number']
   First document preview: {'id': 'earth_at_night_508_page_100_verbalized', 'page_chunk': '# Human Light Sources\n\n## Figure: Human Activity at Night as Seen from Space\n\n**[Verbalized Content of the Figure]**\n\nThe figure s...


In [7]:
# Create Azure AI Search Index
def create_search_index():
    """Define the index schema and create or update it on the search service.

    The schema has four fields (``id``, ``page_chunk``,
    ``page_embedding_text_3_large``, ``page_number``), one HNSW vector profile
    tied to an Azure OpenAI vectorizer, and a default semantic configuration
    whose content field is ``page_chunk``.

    Returns:
        bool: True when the create-or-update call completes; False when any
        exception is raised (the error is printed, not re-raised).
    """
    try:
        print(f"🔍 Creating search index: {index_name}")

        # Names that tie the vector field, profile, algorithm and vectorizer together
        profile_name = "hnsw_text_3_large"
        algorithm_name = "alg"
        vectorizer_name = "azure_openai_text_3_large"
        semantic_config_name = "semantic_config"

        fields = [
            SearchField(name="id", type="Edm.String", key=True, filterable=True, sortable=True, facetable=True),
            SearchField(name="page_chunk", type="Edm.String", filterable=False, sortable=False, facetable=False),
            SearchField(
                name="page_embedding_text_3_large",
                type="Collection(Edm.Single)",
                stored=False,
                # presumably the embedding size of text-embedding-3-large — confirm against the data
                vector_search_dimensions=3072,
                vector_search_profile_name=profile_name,
            ),
            SearchField(name="page_number", type="Edm.Int32", filterable=True, sortable=True, facetable=True),
        ]

        vector_search = VectorSearch(
            profiles=[
                VectorSearchProfile(
                    name=profile_name,
                    algorithm_configuration_name=algorithm_name,
                    vectorizer_name=vectorizer_name,
                )
            ],
            algorithms=[HnswAlgorithmConfiguration(name=algorithm_name)],
            vectorizers=[
                AzureOpenAIVectorizer(
                    vectorizer_name=vectorizer_name,
                    parameters=AzureOpenAIVectorizerParameters(
                        resource_url=azure_openai_endpoint,
                        deployment_name=azure_openai_embedding_deployment,
                        model_name=azure_openai_embedding_model,
                    ),
                )
            ],
        )

        semantic_search = SemanticSearch(
            default_configuration_name=semantic_config_name,
            configurations=[
                SemanticConfiguration(
                    name=semantic_config_name,
                    prioritized_fields=SemanticPrioritizedFields(
                        content_fields=[SemanticField(field_name="page_chunk")]
                    ),
                )
            ],
        )

        index_definition = SearchIndex(
            name=index_name,
            fields=fields,
            vector_search=vector_search,
            semantic_search=semantic_search,
        )

        SearchIndexClient(
            endpoint=search_endpoint, credential=search_credential
        ).create_or_update_index(index_definition)
        print(f"✅ Index '{index_name}' created or updated successfully")
        return True

    except Exception as index_error:
        print(f"❌ Failed to create index: {index_error}")
        return False



In [8]:
# Create the index. The boolean result is kept at module level because
# upload_documents_to_index() checks index_created before uploading.
index_created = create_search_index()

🔍 Creating search index: earth_at_night_storage
✅ Index 'earth_at_night_storage' created or updated successfully
✅ Index 'earth_at_night_storage' created or updated successfully


In [6]:
# Upload documents to the search index
def upload_documents_to_index():
    """Upload the loaded documents to the search index.

    Reads the module-level ``documents``, ``index_created``, ``search_endpoint``,
    ``index_name`` and ``search_credential``.

    Returns:
        bool: True only when every document was accepted by the index. False
        when there is nothing to upload, the index was not created, any
        document was rejected, or an exception is raised (errors are printed,
        not re-raised).
    """
    if not documents:
        print("❌ No documents to upload")
        return False

    if not index_created:
        print("❌ Index not created, cannot upload documents")
        return False

    try:
        print(f"📤 Uploading {len(documents)} documents to index...")

        # The buffered sender reports per-document failures through the on_error
        # callback rather than by raising; collect them so a partial upload is
        # not reported as a success.
        failed_actions = []

        with SearchIndexingBufferedSender(
            endpoint=search_endpoint,
            index_name=index_name,
            credential=search_credential,
            on_error=failed_actions.append,
        ) as client:
            client.upload_documents(documents=documents)
        # Leaving the with-block flushes any queued actions, so failed_actions
        # is complete at this point.

        if failed_actions:
            print(f"❌ Failed to upload documents: {len(failed_actions)} of {len(documents)} documents were rejected by index '{index_name}'")
            return False

        print(f"✅ Documents uploaded to index '{index_name}' successfully")
        return True

    except Exception as e:
        print(f"❌ Failed to upload documents: {e}")
        return False

# Upload the documents. The boolean result is kept at module level because
# create_knowledge_agent() and the search test both check documents_uploaded.
documents_uploaded = upload_documents_to_index()

📤 Uploading 194 documents to index...
✅ Documents uploaded to index 'earth_at_night_storage' successfully
✅ Documents uploaded to index 'earth_at_night_storage' successfully


In [None]:
# Create Knowledge Agent
def create_knowledge_agent():
    """Create or update the knowledge agent that targets the search index.

    Reads the module-level ``documents_uploaded`` flag and the agent, index
    and Azure OpenAI settings from the configuration cell.

    Returns:
        bool: True when the create-or-update call completes; False when the
        documents were not uploaded or any exception is raised (the error is
        printed, not re-raised).
    """
    if not documents_uploaded:
        print("❌ Documents not uploaded, cannot create agent")
        return False

    try:
        print(f"🤖 Creating knowledge agent: {agent_name}")

        # Chat model the agent uses, described by endpoint, deployment and model name
        chat_model = KnowledgeAgentAzureOpenAIModel(
            azure_open_ai_parameters=AzureOpenAIVectorizerParameters(
                resource_url=azure_openai_endpoint,
                deployment_name=azure_openai_gpt_deployment,
                model_name=azure_openai_gpt_model,
            )
        )

        # Index the agent retrieves from, with its default reranker threshold
        target_index = KnowledgeAgentTargetIndex(
            index_name=index_name,
            default_reranker_threshold=2.5,
        )

        agent_definition = KnowledgeAgent(
            name=agent_name,
            models=[chat_model],
            target_indexes=[target_index],
            request_limits=KnowledgeAgentRequestLimits(max_output_size=10000),
        )

        SearchIndexClient(
            endpoint=search_endpoint, credential=search_credential
        ).create_or_update_agent(agent_definition)
        print(f"✅ Knowledge agent '{agent_name}' created or updated successfully")
        return True

    except Exception as agent_error:
        print(f"❌ Failed to create knowledge agent: {agent_error}")
        return False

# Create the agent. agent_created is True when the agent was created or updated
# and False when the step was skipped or failed.
agent_created = create_knowledge_agent()

In [None]:
# Test the setup with a simple search
def test_search_index():
    """Run a simple text query against the index and print the top three hits.

    Returns:
        bool: True when the query and the printing complete; False when any
        exception is raised (the error is printed, not re-raised).
    """
    try:
        from azure.search.documents import SearchClient

        print("🔍 Testing search functionality...")

        search_client = SearchClient(
            endpoint=search_endpoint,
            index_name=index_name,
            credential=search_credential
        )

        # Simple search test
        results = search_client.search("earth night", top=3)

        print(f"\n🎯 Search Results for 'earth night':")
        for i, result in enumerate(results, 1):
            # .get(key, default) only covers a missing key; a field that is
            # present but None would make the slice raise and abort the
            # listing, so fall back to 'N/A' for None as well.
            content_preview = (result.get('page_chunk') or 'N/A')[:100]

            print(f"\nResult {i}:")
            print(f"   ID: {result.get('id', 'N/A')}")
            print(f"   Page: {result.get('page_number', 'N/A')}")
            print(f"   Content: {content_preview}...")
            print(f"   Score: {result.get('@search.score', 'N/A')}")

        return True

    except Exception as e:
        print(f"❌ Search test failed: {e}")
        return False

# Run the search test only when the upload step reported success
if not documents_uploaded:
    print("⏩ Skipping search test - documents not uploaded")
else:
    test_search_index()

## 🎯 Summary

This notebook demonstrates how to:

1. **Connect to Azure Storage** using either account key or managed identity
2. **Load JSON data** from a blob in your storage account
3. **Create Azure AI Search index** with vector and semantic search capabilities
4. **Upload documents** from storage to the search index
5. **Create a knowledge agent** for intelligent retrieval
6. **Test the search functionality**

## 📝 Required Environment Variables

Add these to your `.env` file:

```env
# Azure Storage
AZURE_STORAGE_ACCOUNT_NAME=your-storage-account
AZURE_STORAGE_ACCOUNT_KEY=your-storage-key  # Optional if using managed identity
AZURE_STORAGE_CONTAINER_NAME=documents
AZURE_STORAGE_BLOB_NAME=nasa-documents.json

# Azure Search (existing)
AZURE_SEARCH_ENDPOINT=your-search-endpoint
AZURE_SEARCH_API_KEY=your-search-key  # Optional if using managed identity

# Azure OpenAI (existing)
AZURE_OPENAI_ENDPOINT=your-openai-endpoint
AZURE_OPENAI_GPT_DEPLOYMENT=gpt-4o
AZURE_OPENAI_EMBEDDING_DEPLOYMENT=text-embedding-3-large
```

## 🚀 Next Steps

1. Upload your JSON data file to Azure Storage
2. Update the environment variables
3. Run this notebook to create your search solution
4. Test with your own data!

In [None]:
# Utility: Upload sample data to storage (optional)
def upload_sample_data_to_storage():
    """Utility function to upload the NASA sample data to your storage account.

    Downloads the sample JSON file from GitHub and writes it to the configured
    container/blob, replacing any existing blob of that name.

    Returns:
        bool: True when the upload completes; False when any exception is
        raised (the error is printed, not re-raised).
    """
    try:
        print("📤 Downloading sample data from GitHub...")

        # Download from GitHub
        url = "https://raw.githubusercontent.com/Azure-Samples/azure-search-sample-data/refs/heads/main/nasa-e-book/earth-at-night-json/documents.json"
        # requests has no default timeout; bound the wait so an unresponsive
        # host cannot hang the notebook cell indefinitely.
        response = requests.get(url, timeout=30)
        response.raise_for_status()

        print("📤 Uploading to your storage account...")

        # Upload to storage
        blob_client = blob_service_client.get_blob_client(
            container=storage_container_name,
            blob=storage_blob_name
        )

        # overwrite=True replaces an existing blob with the same name
        blob_client.upload_blob(response.content, overwrite=True)

        print(f"✅ Sample data uploaded to {storage_account_name}/{storage_container_name}/{storage_blob_name}")
        return True

    except Exception as e:
        print(f"❌ Failed to upload sample data: {e}")
        return False

# Uncomment the line below to upload sample data to your storage.
# It is left disabled because the upload uses overwrite=True and would replace
# the configured blob every time the notebook is run.
# upload_sample_data_to_storage()