# Agentic retrieval using OneLake (Microsoft Fabric) Knowledge Source

Use this notebook to create an agentic retrieval pipeline from documents stored in Microsoft Fabric OneLake.

In this walkthrough, you will:

+ Upload documents to OneLake (do this in your Fabric Lakehouse before running the notebook; no cell here performs the upload)
+ Create a knowledge source from indexed OneLake with automatic ingestion
+ Monitor ingestion progress
+ Create a knowledge base and agent
+ Query OneLake documents

**Note**: OneLake sources support automatic re-indexing and image processing.

**Important**: OneLake knowledge sources currently have limited Python SDK support. This notebook uses a hybrid approach: the SDK where it is available and the REST API where it is not.

## Prerequisites

+ Azure AI Search, basic tier or higher.
+ Microsoft Fabric capacity or trial.
+ Fabric workspace and Lakehouse with documents.
+ Azure OpenAI and Azure AI Foundry project.
+ Deployments of gpt-4o-mini and text-embedding-3-large in your Foundry project.

We recommend creating a virtual environment to run this sample code.

## Install required packages

In [None]:
# Install the packages this notebook imports. azure-search-documents is pinned
# to an exact version; every other package is installed unpinned.
%pip install azure-search-documents==11.7.0b2
%pip install azure-identity
# NOTE(review): later cells use PromptAgentDefinition, MCPTool and
# agents.create_version from azure-ai-projects - confirm that the unpinned
# install resolves to a release that provides them.
%pip install azure-ai-projects
%pip install azure-mgmt-cognitiveservices
%pip install python-dotenv
%pip install requests

## Load connections

In [None]:
from dotenv import load_dotenv
from azure.identity import DefaultAzureCredential
from azure.mgmt.core.tools import parse_resource_id
import os
import requests
import json

# Read settings from a local .env file. override=True lets values from .env
# replace variables that are already present in the process environment.
load_dotenv(override=True)

# Settings read with os.environ[...] are required (a missing one raises
# KeyError); settings read with .get(...) fall back to the default shown.

# Foundry project and agent
project_endpoint = os.environ["PROJECT_ENDPOINT"]
project_resource_id = os.environ["PROJECT_RESOURCE_ID"]
project_connection_name = os.environ.get("PROJECT_CONNECTION_NAME", "onelakeconnection")
agent_model = os.environ.get("AGENT_MODEL", "gpt-4o-mini")
agent_name = os.environ.get("AGENT_NAME", "onelake-docs-agent")

# Azure AI Search service, knowledge source and knowledge base
endpoint = os.environ["AZURE_SEARCH_ENDPOINT"]
search_api_key = os.environ["AZURE_SEARCH_API_KEY"]
credential = DefaultAzureCredential()
knowledge_source_name = os.environ.get("AZURE_SEARCH_KNOWLEDGE_SOURCE_NAME", "onelake-docs-source")
base_name = os.environ.get("AZURE_SEARCH_AGENT_NAME", "onelake-docs-base")

# Azure OpenAI deployments used during ingestion
azure_openai_endpoint = os.environ["AZURE_OPENAI_ENDPOINT"]
azure_openai_gpt_deployment = os.environ.get("AZURE_OPENAI_GPT_DEPLOYMENT", "gpt-4o-mini")
azure_openai_gpt_model = os.environ.get("AZURE_OPENAI_GPT_MODEL", "gpt-4o-mini")
azure_openai_embedding_deployment = os.environ.get("AZURE_OPENAI_EMBEDDING_DEPLOYMENT", "text-embedding-3-large")
azure_openai_embedding_model = os.environ.get("AZURE_OPENAI_EMBEDDING_MODEL", "text-embedding-3-large")

# Microsoft Fabric location of the documents
fabric_workspace_id = os.environ["FABRIC_WORKSPACE_ID"]
lakehouse_item_id = os.environ["LAKEHOUSE_ITEM_ID"]
target_path = os.environ.get("TARGET_PATH", "/Files/documents")

# REST API version appended to every Azure AI Search URL built in this notebook
api_version = "2025-11-01-preview"

# Break the project resource ID into the pieces the management client needs
# when the project connection is created later.
parsed_resource_id = parse_resource_id(project_resource_id)
subscription_id = parsed_resource_id["subscription"]
resource_group = parsed_resource_id["resource_group"]
account_name = parsed_resource_id["name"]
project_name = parsed_resource_id["child_name_1"]

## Create indexed OneLake knowledge source

Create a knowledge source that ingests OneLake documents.

**Note**: OneLake paths always start with `/Files/`. This notebook reads the path from `TARGET_PATH`, which defaults to `/Files/documents`.

In [None]:
# Create (or update) the indexed OneLake knowledge source with a REST PUT,
# authenticating with the search service API key.
url = f"{endpoint}/knowledgeSources/{knowledge_source_name}?api-version={api_version}"

# These headers are reused by the status-polling cell that follows.
headers = {
    "api-key": search_api_key,
    "Content-Type": "application/json"
}

# Request payload: where the documents live in Fabric, plus the embedding and
# chat-completion deployments to use during ingestion.
body = {
    "name": knowledge_source_name,
    "kind": "indexedOneLake",
    "description": "Knowledge source from Microsoft Fabric OneLake",
    "indexedOneLakeParameters": {
        "fabricWorkspaceId": fabric_workspace_id,
        "lakehouseId": lakehouse_item_id,
        "targetPath": target_path,
        "ingestionParameters": {
            # Sent as JSON null.
            # NOTE(review): presumably null selects the service's default
            # identity - confirm against the REST reference.
            "identity": None,
            "embeddingModel": {
                "kind": "azureOpenAI",
                "azureOpenAIParameters": {
                    "resourceUri": azure_openai_endpoint,
                    "deploymentId": azure_openai_embedding_deployment,
                    "modelName": azure_openai_embedding_model
                }
            },
            "chatCompletionModel": {
                "kind": "azureOpenAI",
                "azureOpenAIParameters": {
                    "resourceUri": azure_openai_endpoint,
                    "deploymentId": azure_openai_gpt_deployment,
                    "modelName": azure_openai_gpt_model
                }
            },
            "disableImageVerbalization": False,
            "contentExtractionMode": "minimal"
        }
    }
}

# requests never times out by default; without an explicit timeout an
# unresponsive endpoint would leave this cell hanging indefinitely.
response = requests.put(url, headers=headers, json=body, timeout=60)
print(f"Status: {response.status_code}")
# 204 No Content is also accepted so that a successful update which returns
# no body is not reported as an error.
if response.status_code in (200, 201, 204):
    print(f"Knowledge source '{knowledge_source_name}' created or updated successfully.")
else:
    print(f"Error: {response.text}")

## Monitor ingestion progress

In [None]:
import time

# Poll the knowledge source status endpoint until ingestion finishes, fails,
# the request itself fails, or the overall deadline passes.
status_url = f"{endpoint}/knowledgeSources/{knowledge_source_name}/status?api-version={api_version}"

# Bound the total wait so the cell cannot spin forever when the service never
# reports a terminal state.
max_wait_seconds = 30 * 60
poll_interval_seconds = 15
deadline = time.monotonic() + max_wait_seconds

print("Waiting for ingestion to complete...")
while True:
    # Per-request timeout: requests would otherwise block indefinitely.
    response = requests.get(status_url, headers=headers, timeout=60)

    # An HTTP error body has no ingestion status to read; report it and stop
    # instead of polling it forever as "unknown".
    if not response.ok:
        print(f"\nStatus request failed with HTTP {response.status_code}")
        print(f"Error: {response.text}")
        break

    status = response.json()

    # NOTE(review): the loop ends early only when a top-level "status" field is
    # "succeeded" or "failed" - confirm this matches the field name and values
    # the status endpoint returns for this api-version.
    current_status = status.get("status", "unknown")
    print(f"Status: {current_status}")

    if current_status == "succeeded":
        print("\nIngestion completed successfully!")
        break
    elif current_status == "failed":
        print("\nIngestion failed!")
        print(f"Error: {status}")
        break

    if time.monotonic() >= deadline:
        # Show the last payload so the caller can see what the service reported.
        print(f"\nTimed out after {max_wait_seconds} seconds waiting for ingestion.")
        print(f"Last status payload: {status}")
        break

    time.sleep(poll_interval_seconds)

## Create a knowledge base

In [None]:
from azure.search.documents.indexes.models import (
    KnowledgeBase,
    KnowledgeSourceReference,
    KnowledgeRetrievalOutputMode,
    KnowledgeRetrievalMinimalReasoningEffort
)
from azure.search.documents.indexes import SearchIndexClient

# Client for the search service; it authenticates with the Azure credential
# created in the connections cell rather than the API key.
index_client = SearchIndexClient(credential=credential, endpoint=endpoint)

# The knowledge base points at the OneLake knowledge source by name, returns
# extractive data and uses the minimal retrieval reasoning effort.
source_references = [KnowledgeSourceReference(name=knowledge_source_name)]
knowledge_base = KnowledgeBase(
    name=base_name,
    knowledge_sources=source_references,
    output_mode=KnowledgeRetrievalOutputMode.EXTRACTIVE_DATA,
    retrieval_reasoning_effort=KnowledgeRetrievalMinimalReasoningEffort(),
)

index_client.create_or_update_knowledge_base(knowledge_base=knowledge_base)
print(f"Knowledge base '{base_name}' created or updated successfully")

# MCP endpoint of the knowledge base; the connection and the agent tool
# created in later cells both target this URL.
mcp_endpoint = f"{endpoint}/knowledgebases/{base_name}/mcp?api-version={api_version}"

## Create an MCP Tool Connection

In [None]:
from azure.mgmt.cognitiveservices import CognitiveServicesManagementClient
from azure.mgmt.cognitiveservices.models import (
    ConnectionPropertiesV2BasicResource,
    CustomKeysConnectionProperties,
    CustomKeys
)

# Management-plane client scoped to the subscription that owns the project.
mgmt_client = CognitiveServicesManagementClient(credential, subscription_id)

# The connection stores the search API key and targets the knowledge base
# MCP endpoint.
connection_keys = CustomKeys(keys={"api-key": search_api_key})
connection_properties = CustomKeysConnectionProperties(
    category="RemoteTool",
    target=mcp_endpoint,
    is_shared_to_all=True,
    metadata={"ApiType": "Azure"},
    credentials=connection_keys,
)
connection_resource = ConnectionPropertiesV2BasicResource(properties=connection_properties)

# Register the connection on the project under project_connection_name.
resource = mgmt_client.project_connections.create(
    resource_group_name=resource_group,
    account_name=account_name,
    project_name=project_name,
    connection_name=project_connection_name,
    connection=connection_resource,
)

print(f"Connection '{resource.name}' created or updated successfully.")

## Create an Azure AI Agent

In [None]:
from azure.ai.projects import AIProjectClient
from azure.ai.projects.models import PromptAgentDefinition, MCPTool

# Client for the Foundry project that will host the agent.
project_client = AIProjectClient(credential=credential, endpoint=project_endpoint)

# System instructions given to the agent.
instructions = """
A Q&A agent that can answer questions from OneLake business documents.
Always provide references to the documents used to answer questions.
If you do not have the answer, respond with "I don't know".
"""

# MCP tool that exposes only the knowledge base retrieval operation, reached
# through the project connection created in the previous cell and invoked
# without asking for approval.
mcp_kb_tool = MCPTool(
    server_label="knowledge-base",
    server_url=mcp_endpoint,
    require_approval="never",
    allowed_tools=["knowledge_base_retrieve"],
    project_connection_id=project_connection_name,
)

# Prompt-agent definition: the model, the instructions and the single MCP tool.
agent_definition = PromptAgentDefinition(
    model=agent_model,
    instructions=instructions,
    tools=[mcp_kb_tool],
)

# Publish the definition as a version of the named agent.
agent = project_client.agents.create_version(
    agent_name=agent_name,
    definition=agent_definition,
)

print(f"AI agent '{agent_name}' created or updated successfully")

## Query OneLake documents

In [None]:
# OpenAI-compatible client obtained from the project client.
openai_client = project_client.get_openai_client()

# Start a conversation and ask a question inside it.
conversation = openai_client.conversations.create()

question = "What was the total revenue in Q1 2024 and how did different regions perform?"
# Route the request to the agent created above by referencing it by name.
agent_reference = {"agent": {"name": agent.name, "type": "agent_reference"}}

response = openai_client.responses.create(
    conversation=conversation.id,
    input=question,
    extra_body=agent_reference,
)

print(f"Response: {response.output_text}")

## Clean up resources

### Delete the agent

In [None]:
# Delete the specific agent version created earlier in this notebook,
# identified by the name and version of the `agent` object.
project_client.agents.delete_version(agent.name, agent.version)
print(f"AI agent '{agent.name}' version '{agent.version}' deleted successfully")

### Delete the knowledge base

In [None]:
# Delete the knowledge base by name using the search index client.
index_client.delete_knowledge_base(base_name)
print(f"Knowledge base '{base_name}' deleted successfully")

### Delete the knowledge source

In [None]:
# Delete the knowledge source by name. It was created through the REST API
# earlier in this notebook but is removed here through the SDK client.
index_client.delete_knowledge_source(knowledge_source=knowledge_source_name)
print(f"Knowledge source '{knowledge_source_name}' deleted successfully.")