# Azure AI Search: Vector Search with Embeddings


## Step 1: Install and Import Required Libraries
Install the Azure AI Search SDK, the Azure Identity library, and the `requests` HTTP client used to call the Azure OpenAI embeddings endpoint.

In [None]:
import subprocess, sys
# Dependencies of this notebook: the Azure AI Search SDK, Azure Identity
# (for DefaultAzureCredential), and requests (for the embeddings REST call).
packages = [
    "azure-search-documents",
    "azure-identity",
    "requests",
]

# Invoke pip through the running interpreter so every package is installed
# into the same environment this notebook executes in. check_call raises
# CalledProcessError if any install exits with a non-zero status.
for package in packages:
    subprocess.check_call(
        [sys.executable, "-m", "pip", "install", package, "-q"],
    )

from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents import SearchClient
from azure.search.documents.indexes.models import (
    SearchIndex, SearchField, SearchFieldDataType, SimpleField, SearchableField, ComplexField,
    VectorSearch, HnswAlgorithmConfiguration, VectorSearchProfile
)
from azure.search.documents.models import VectorizedQuery
from azure.identity import DefaultAzureCredential
from pathlib import Path
import requests
import json

# Reached only if every install and import above completed without raising.
# (The check mark was previously stored as mis-decoded UTF-8 bytes.)
print("✓ All packages installed")

## Step 2: Azure Service Configuration
Set up connections to Azure AI Search and Azure OpenAI for embedding generation.

In [None]:
# --- Azure AI Search ---
# Placeholder endpoint: replace with the URL of your own search service.
search_endpoint = "https://xxxxxxxxxxxxxxx.search.windows.net"
index_name = "hotels-vector-index"

# Hotel documents to embed and index (path is relative to the working directory).
json_file_path = Path('HotelsData_toAzureBlobs.json')

# A single credential object is shared by the Search clients and by the
# embedding calls, which request a Cognitive Services token from it.
credential = DefaultAzureCredential()
index_client = SearchIndexClient(endpoint=search_endpoint, credential=credential)

# Foundry configuration (using managed identity)
# Placeholder endpoint: full URL of the embeddings operation on the
# text-embedding-ada-002 deployment. An api-version query parameter is
# appended by get_foundry_embedding when this URL does not carry one.
foundry_endpoint = "https://xxxxxxxxxxxxxxx.cognitiveservices.azure.com/openai/deployments/text-embedding-ada-002/embeddings"

# Length of each embedding vector; also used as the vector field's
# dimension count when the index schema is defined.
embedding_dimensions = 1536

# Status output (check marks were previously stored as mis-decoded UTF-8 bytes).
print(f"✓ Azure Search: {search_endpoint}")
print(f"✓ Embeddings: ada-002, {embedding_dimensions}D (managed identity)")
print(f"✓ Data file: {json_file_path.name}")

## Step 3: Create Vector Index Schema
Define an index with a vector field (`Collection(Edm.Single)`, 1536 dimensions) and configure the HNSW algorithm for fast approximate similarity search.

In [None]:
# Index schema: a string key, searchable text fields, one vector field, and a
# few filterable/facetable attributes used to describe each hotel.
fields = [
    SimpleField(name="HotelId", type=SearchFieldDataType.String, key=True, filterable=True),
    SearchableField(name="HotelName", type=SearchFieldDataType.String, sortable=True),
    SearchableField(name="Description", type=SearchFieldDataType.String),
    # Vector field holding the embedding of Description. Its dimension count
    # must equal the length of the vectors uploaded into it, and its profile
    # name must match a profile declared in `vector_search` below.
    SearchField(
        name="DescriptionVector",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        vector_search_dimensions=embedding_dimensions,
        vector_search_profile_name="myHnswProfile"
    ),
    SearchableField(name="Category", type=SearchFieldDataType.String, facetable=True, filterable=True),
    SimpleField(name="Rating", type=SearchFieldDataType.Double, facetable=True, filterable=True, sortable=True),
    # Only the City sub-field of Address is part of this index.
    ComplexField(name="Address", fields=[
        SearchableField(name="City", type=SearchFieldDataType.String, facetable=True, filterable=True),
    ])
]

# The profile "myHnswProfile" (referenced by the vector field) points at the
# HNSW algorithm configuration "myHnsw", which is left at the SDK defaults.
vector_search = VectorSearch(
    algorithms=[HnswAlgorithmConfiguration(name="myHnsw")],
    profiles=[VectorSearchProfile(name="myHnswProfile", algorithm_configuration_name="myHnsw")]
)

# create_or_update_index makes this cell safe to re-run against an index
# that already exists.
index = SearchIndex(name=index_name, fields=fields, vector_search=vector_search)
result = index_client.create_or_update_index(index)
# Check mark was previously stored as mis-decoded UTF-8 bytes.
print(f"✓ Index '{result.name}' created")

## Step 4: Create Embedding Function
Define function to convert text to vectors using Azure OpenAI text-embedding-ada-002 with managed identity authentication.

In [None]:
def get_foundry_embedding(text, timeout=30):
    """Return the embedding vector for *text*, or ``None`` on failure.

    Posts ``{"input": text}`` to the module-level ``foundry_endpoint`` using a
    bearer token obtained from the module-level ``credential``. When the
    endpoint URL carries no ``api-version`` parameter, several known versions
    are tried in order until one answers with HTTP 200.

    Args:
        text: The text to embed.
        timeout: Seconds to wait for each HTTP request before giving up on
            that URL. Without a timeout ``requests`` can block indefinitely.

    Returns:
        The ``data[0].embedding`` value of the first successful response, or
        ``None`` if every attempt failed (the last error is printed).

    Raises:
        Whatever ``credential.get_token`` raises when a token cannot be
        acquired; that failure is not recoverable by retrying other URLs.
    """
    # Get managed identity token for Azure Cognitive Services
    token = credential.get_token("https://cognitiveservices.azure.com/.default")
    headers = {
        "Authorization": f"Bearer {token.token}",
        "Content-Type": "application/json"
    }
    payload = {"input": text}

    # Handle Azure OpenAI-style embedding endpoint
    if "api-version=" in foundry_endpoint:
        urls_to_try = [foundry_endpoint]
    else:
        # Append with "&" when the endpoint already has a query string so the
        # resulting URL stays well-formed.
        separator = "&" if "?" in foundry_endpoint else "?"
        urls_to_try = [
            f"{foundry_endpoint}{separator}api-version={version}"
            for version in ("2024-02-15-preview", "2024-02-01", "2023-05-15")
        ]

    last_error = None
    for url in urls_to_try:
        try:
            response = requests.post(url, headers=headers, json=payload, timeout=timeout)
        except requests.RequestException as exc:
            # Connection errors and timeouts follow the same "record and try
            # the next URL" path as non-200 responses.
            last_error = f"{type(exc).__name__}: {exc}"
            continue

        if response.status_code == 200:
            try:
                return response.json()["data"][0]["embedding"]
            except (ValueError, KeyError, IndexError, TypeError) as exc:
                # A 200 whose body is not the expected embeddings payload.
                last_error = f"unexpected response body: {exc!r}"
                continue

        last_error = f"{response.status_code}: {response.text[:200]}"

    print(f"Error getting embedding: {last_error}")
    return None

# Smoke test: embed one short phrase to confirm that authentication and the
# endpoint work before embedding the whole data set. get_foundry_embedding
# returns None on failure, which takes the else branch.
# (Status glyphs were previously stored as mis-decoded UTF-8 bytes.)
test_vector = get_foundry_embedding("luxury hotel with spa")
if test_vector:
    print(f"✓ Test embedding: {len(test_vector)} dimensions")
else:
    print("✗ Embedding test failed")

## Step 5: Load Hotels Data
Read the HotelsData_toAzureBlobs.json file containing 50 hotels with descriptions, ratings, categories, and locations.

In [None]:
# Status glyphs in this cell were previously stored as mis-decoded UTF-8 bytes.
print(f"📂 Loading {json_file_path.name}...\n")

try:
    with open(json_file_path, 'r', encoding='utf-8') as f:
        raw_text = f.read().strip()

    # Source file is comma-separated JSON objects (not wrapped in an array),
    # so wrap it in brackets to obtain a valid JSON array. A trailing comma
    # is dropped first, and a file that already is an array is parsed as-is
    # (wrapping it would produce a one-element list containing the array).
    if raw_text.startswith('['):
        documents_data = json.loads(raw_text)
    else:
        documents_data = json.loads(f"[{raw_text.rstrip(',')}]")

    print(f"✓ Loaded {len(documents_data)} hotels\n")

    # Preview the first record; skipped when the file held no records.
    if documents_data:
        sample = documents_data[0]
        print(f"Sample: {sample['HotelName']} ({sample['Category']})")
        # `or {}` also covers an Address that is present but null.
        print(f"Rating: {sample['Rating']}★ | City: {(sample.get('Address') or {}).get('City')}")

except Exception as e:
    # Cell-level boundary: report the problem and leave an empty data set so
    # that later cells can still run (and simply index nothing).
    print(f"✗ Error: {str(e)}")
    documents_data = []

## Step 6: Generate Embeddings and Upload Documents
Convert each hotel description to a 1536-dimensional vector and upload all documents with their embeddings to the search index.

In [None]:
# Status glyphs in this cell were previously stored as mis-decoded UTF-8 bytes.
print("Generating embeddings from Foundry...\n")
documents = []
skipped_ids = []  # HotelIds whose description could not be embedded

for i, doc in enumerate(documents_data, 1):
    vector = get_foundry_embedding(doc["Description"])
    if not vector:
        # get_foundry_embedding has already printed the error; remember the
        # hotel so the omission is reported instead of passing silently.
        skipped_ids.append(doc["HotelId"])
        continue

    # Only include fields defined in index schema
    filtered_doc = {
        "HotelId": doc["HotelId"],
        "HotelName": doc["HotelName"],
        "Description": doc["Description"],
        "DescriptionVector": vector,
        "Category": doc["Category"],
        "Rating": doc["Rating"],
        # `or {}` also covers an Address that is present but null.
        "Address": {"City": (doc.get("Address") or {}).get("City", "")}
    }
    documents.append(filtered_doc)
    if i % 10 == 0:
        print(f"  ✓ Embedded {i}/{len(documents_data)}")

if skipped_ids:
    print(f"✗ No embedding for {len(skipped_ids)} hotel(s), not uploaded: {skipped_ids}")

# Created unconditionally because the query cells below use this client.
search_client = SearchClient(endpoint=search_endpoint, index_name=index_name, credential=credential)

if documents:
    result = search_client.upload_documents(documents=documents)
else:
    # Do not send an empty batch to the service.
    result = []
    print("✗ Nothing to upload: no documents were embedded")

# upload_documents reports success per document, so count what was actually
# indexed rather than what was sent.
failed_keys = [item.key for item in result if not item.succeeded]
print(f"\n✓ {len(result) - len(failed_keys)} documents indexed with vectors")
if failed_keys:
    print(f"✗ {len(failed_keys)} documents failed to index: {failed_keys}")

## Step 7: Vector Search Queries

### Query 1: Luxury & Spa

In [None]:
print("=== QUERY 1: Luxury Spa Relaxation ===")
query_text = "luxury spa relaxation wellness"
query_vector = get_foundry_embedding(query_text)

if query_vector:
    # Pure vector query: the k nearest neighbours of the query embedding in
    # the DescriptionVector field.
    results = list(search_client.search(
        vector_queries=[VectorizedQuery(vector=query_vector, k=5, fields="DescriptionVector")]
    ))
else:
    # get_foundry_embedding returns None on failure (and prints the reason);
    # a None vector must not be sent to the search service.
    print("✗ Embedding failed; skipping vector search")
    results = []

print(f"Found {len(results)} results:\n")
for i, r in enumerate(results[:5], 1):
    # Star glyph was previously stored as mis-decoded UTF-8 bytes.
    print(f"{i}. {r['HotelName']} | {r['Rating']}★ | {r.get('Category', 'N/A')}")
    print(f"   Score: {r['@search.score']:.4f}\n")

### Query 2: Adventure & Outdoor

In [None]:
print("\n=== QUERY 2: Adventure & Outdoor ===")
query_text = "hiking climbing camping mountain nature"
query_vector = get_foundry_embedding(query_text)

if query_vector:
    # Pure vector query: the k nearest neighbours of the query embedding in
    # the DescriptionVector field.
    results = list(search_client.search(
        vector_queries=[VectorizedQuery(vector=query_vector, k=5, fields="DescriptionVector")]
    ))
else:
    # get_foundry_embedding returns None on failure (and prints the reason);
    # a None vector must not be sent to the search service.
    print("✗ Embedding failed; skipping vector search")
    results = []

print(f"Found {len(results)} results:\n")
for i, r in enumerate(results[:5], 1):
    # Star glyph was previously stored as mis-decoded UTF-8 bytes.
    print(f"{i}. {r['HotelName']} | {r['Rating']}★ | {r.get('Category', 'N/A')}")
    print(f"   Score: {r['@search.score']:.4f}\n")

### Query 3: Beach & Water

In [None]:
print("\n=== QUERY 3: Beach & Water Sports ===")
query_text = "beach ocean water swimming tropical"
query_vector = get_foundry_embedding(query_text)

if query_vector:
    # Pure vector query: the k nearest neighbours of the query embedding in
    # the DescriptionVector field.
    results = list(search_client.search(
        vector_queries=[VectorizedQuery(vector=query_vector, k=5, fields="DescriptionVector")]
    ))
else:
    # get_foundry_embedding returns None on failure (and prints the reason);
    # a None vector must not be sent to the search service.
    print("✗ Embedding failed; skipping vector search")
    results = []

print(f"Found {len(results)} results:\n")
for i, r in enumerate(results[:5], 1):
    # Star glyph was previously stored as mis-decoded UTF-8 bytes.
    print(f"{i}. {r['HotelName']} | {r['Rating']}★ | {r.get('Category', 'N/A')}")
    print(f"   Score: {r['@search.score']:.4f}\n")

### Query 4: Business & Professional

In [None]:
print("\n=== QUERY 4: Business & Conference ===")
query_text = "business meeting conference professional network"
query_vector = get_foundry_embedding(query_text)

if query_vector:
    # Pure vector query: the k nearest neighbours of the query embedding in
    # the DescriptionVector field.
    results = list(search_client.search(
        vector_queries=[VectorizedQuery(vector=query_vector, k=5, fields="DescriptionVector")]
    ))
else:
    # get_foundry_embedding returns None on failure (and prints the reason);
    # a None vector must not be sent to the search service.
    print("✗ Embedding failed; skipping vector search")
    results = []

print(f"Found {len(results)} results:\n")
for i, r in enumerate(results[:5], 1):
    # Star glyph was previously stored as mis-decoded UTF-8 bytes.
    print(f"{i}. {r['HotelName']} | {r['Rating']}★ | {r.get('Category', 'N/A')}")
    print(f"   Score: {r['@search.score']:.4f}\n")

In [None]:
print("\n=== QUERY 5: outdoor adventure and nature activities ===\n")

query_text = "outdoor adventure and nature activities"
query_vector = get_foundry_embedding(query_text)

if query_vector:
    # Vector query restricted to the fields that are printed below.
    vector_results = list(search_client.search(
        vector_queries=[VectorizedQuery(vector=query_vector, k=5, fields="DescriptionVector")],
        select=["HotelName", "Category", "Rating", "Description"],
        top=5
    ))
else:
    # get_foundry_embedding returns None on failure (and prints the reason);
    # a None vector must not be sent to the search service.
    print("✗ Embedding failed; skipping vector search")
    vector_results = []

print(f"Vector results: {len(vector_results)}\n")
for i, r in enumerate(vector_results, 1):
    # Star glyph was previously stored as mis-decoded UTF-8 bytes.
    print(f"{i}. {r['HotelName']} | {r.get('Category', 'N/A')} | {r.get('Rating', 'N/A')}★")
    print(f"   Score: {r['@search.score']:.4f}\n")

### Query 5, Compared: Semantic Understanding (Works in Vector Search, Fails in Keyword Search)

In [None]:
print("\n=== Keyword vs Vector (Same Query, Same Data) ===\n")

query_text = "outdoor adventure and nature activities"

# Keyword-style search on same index/data
keyword_results = list(search_client.search(
    search_text=query_text,
    select=["HotelName", "Category", "Rating"],
    top=5
))

# Vector search on same index/data
query_vector = get_foundry_embedding(query_text)
if query_vector:
    vector_results = list(search_client.search(
        vector_queries=[VectorizedQuery(vector=query_vector, k=5, fields="DescriptionVector")],
        select=["HotelName", "Category", "Rating"],
        top=5
    ))
else:
    # get_foundry_embedding returns None on failure (and prints the reason);
    # a None vector must not be sent to the search service. The keyword
    # results above are still shown.
    print("✗ Embedding failed; skipping vector search")
    vector_results = []

print(f"Query: {query_text}\n")
print(f"Keyword returned: {len(keyword_results)}")
print(f"Vector returned:  {len(vector_results)}\n")

# Scores from the two result sets are printed side by side for illustration.
print("Top 3 Keyword Results:")
for i, r in enumerate(keyword_results[:3], 1):
    print(f"{i}. {r['HotelName']} | Score: {r['@search.score']:.4f}")

print("\nTop 3 Vector Results:")
for i, r in enumerate(vector_results[:3], 1):
    print(f"{i}. {r['HotelName']} | Score: {r['@search.score']:.4f}")

print("\nWhy vector is better for this query:")
print("- Keyword needs exact term overlap")
print("- Vector matches semantic intent (adventure/nature/outdoor concepts)")