# 09 - Lakehouse Data Binding

**Epic:** F5 - Fabric Ontology Integration  
**Feature:** F5.3 - Lakehouse Data Binding  
**Priority:** P1

## Purpose

Generate and upload data bindings that connect Ontology entity types and relationship types to Lakehouse Delta tables, so that the Ontology can be materialized as a queryable graph.

## Input

- Ontology configuration from `Files/config/ontology_config.json`
- Entity/relationship type definitions from `Files/ontology_definitions/`
- Gold tables: `gold_nodes`, `gold_edges`

## Output

- Data bindings uploaded to Ontology via REST API
- Ontology connected to Lakehouse data

## Binding Types

| Type | Source | Purpose |
|------|--------|----------|
| Static (NonTimeSeries) | gold_nodes | Entity instance data |
| Relationship | gold_edges | Relationships between entities |

## Setup

In [None]:
import json
import os
import base64
import time
import requests
from datetime import datetime
from typing import Optional, List, Dict
from pyspark.sql import functions as F

In [None]:
# Fabric notebookutils for authentication
from notebookutils import mssparkutils

## Configuration

In [None]:
# Fabric API configuration
# api_request() joins FABRIC_API_BASE with an endpoint path; endpoint paths
# built in this notebook start with the FABRIC_API_VERSION segment.
FABRIC_API_BASE = "https://api.fabric.microsoft.com"
FABRIC_API_VERSION = "v1"

# Paths
# Local mount paths of the default lakehouse's Files area (read with plain
# os/open calls below).
DEFINITIONS_DIR = "/lakehouse/default/Files/ontology_definitions"
CONFIG_DIR = "/lakehouse/default/Files/config"

# Gold tables to bind
# NODES_TABLE backs the entity bindings, EDGES_TABLE the relationship bindings.
NODES_TABLE = "gold_nodes"
EDGES_TABLE = "gold_edges"

# Retry configuration
# MAX_RETRIES: number of attempts api_request() makes per call.
# RETRY_DELAY_SECONDS: pause between attempts (and the default wait on HTTP 429).
# LRO_POLL_INTERVAL_SECONDS / LRO_MAX_WAIT_SECONDS: polling cadence and overall
# deadline used by wait_for_lro().
MAX_RETRIES = 3
RETRY_DELAY_SECONDS = 5
LRO_POLL_INTERVAL_SECONDS = 2
LRO_MAX_WAIT_SECONDS = 300

## Load Configuration

In [None]:
# Load ontology configuration from previous notebook
config_path = os.path.join(CONFIG_DIR, "ontology_config.json")

if not os.path.exists(config_path):
    raise FileNotFoundError(
        f"Ontology config not found at {config_path}. "
        "Please run notebook 08 first."
    )

# JSON is UTF-8 by specification; do not depend on the platform default encoding.
with open(config_path, 'r', encoding="utf-8") as f:
    config = json.load(f)

# Report every missing key at once instead of failing with a bare KeyError on
# whichever key happens to be read first.
missing_keys = [
    key
    for key in ("ontology_id", "workspace_id", "lakehouse_id")
    if key not in config
]
if missing_keys:
    raise KeyError(
        f"Ontology config at {config_path} is missing required keys: "
        f"{', '.join(missing_keys)}"
    )

ontology_id = config["ontology_id"]
workspace_id = config["workspace_id"]
lakehouse_id = config["lakehouse_id"]

print(f"Ontology ID: {ontology_id}")
print(f"Workspace ID: {workspace_id}")
print(f"Lakehouse ID: {lakehouse_id}")

In [None]:
# Load entity type definitions (human-readable format)
def get_latest_file(prefix: str) -> str:
    """Return the path of the last-sorting ``<prefix>*.json`` file in DEFINITIONS_DIR.

    Candidates are compared by file name only, and the lexicographically
    greatest name wins (presumably the names embed a sortable timestamp, so
    this is the most recent file — verify against the notebook that writes
    them).

    Raises:
        FileNotFoundError: If no file in DEFINITIONS_DIR matches the prefix.
    """
    candidates = [
        name
        for name in os.listdir(DEFINITIONS_DIR)
        if name.startswith(prefix) and name.endswith(".json")
    ]
    if not candidates:
        raise FileNotFoundError(f"No files matching {prefix}*.json")
    # max() picks the same name a descending sort would put first.
    return os.path.join(DEFINITIONS_DIR, max(candidates))

# Load the most recent entity type definitions.
# JSON is UTF-8 by specification; do not depend on the platform default encoding.
entity_file = get_latest_file("entity_types_")
with open(entity_file, 'r', encoding="utf-8") as f:
    entity_types = json.load(f)

print(f"Loaded {len(entity_types)} entity types from {entity_file}")

# Load relationship type definitions
rel_file = get_latest_file("relationship_types_")
with open(rel_file, 'r', encoding="utf-8") as f:
    relationship_types = json.load(f)

print(f"Loaded {len(relationship_types)} relationship types from {rel_file}")

## Authentication

In [None]:
def get_fabric_token() -> str:
    """Return an access token for the Fabric API from the notebook credential helper."""
    fabric_audience = "https://api.fabric.microsoft.com"
    access_token = mssparkutils.credentials.getToken(fabric_audience)
    return access_token

def get_headers() -> dict:
    """Build request headers: a freshly fetched bearer token plus a JSON content type."""
    bearer_token = get_fabric_token()
    request_headers = {}
    request_headers["Authorization"] = f"Bearer {bearer_token}"
    request_headers["Content-Type"] = "application/json"
    return request_headers

# Test authentication
# Acquire one token up front so credential problems surface in this cell;
# only the token's length is printed, not the token itself.
token = get_fabric_token()
print(f"Authentication OK (token length: {len(token)})")

## API Helper Functions

In [None]:
def api_request(
    method: str,
    endpoint: str,
    data: Optional[dict] = None,
    params: Optional[dict] = None
) -> requests.Response:
    """
    Make an API request with retry logic.

    Retries on HTTP 429, on 5xx responses and on transport-level failures,
    up to MAX_RETRIES attempts in total. Headers (and therefore the token)
    are rebuilt for every attempt.

    Args:
        method: HTTP method, e.g. "GET" or "POST".
        endpoint: Path appended to FABRIC_API_BASE (without a leading slash).
        data: Optional JSON-serializable request body.
        params: Optional query-string parameters.

    Returns:
        The first response that is neither 429 nor 5xx. If every attempt was
        throttled or hit a server error, the last such response is returned
        so the caller can inspect its status code.

    Raises:
        requests.exceptions.RequestException: If the final attempt fails at
            the transport level.
    """
    url = f"{FABRIC_API_BASE}/{endpoint}"
    # Always make at least one attempt, even if MAX_RETRIES is configured as 0.
    attempts = max(1, MAX_RETRIES)
    response = None

    for attempt in range(attempts):
        is_last_attempt = attempt == attempts - 1

        try:
            headers = get_headers()
            response = requests.request(
                method=method,
                url=url,
                headers=headers,
                json=data,
                params=params,
                timeout=60
            )
        except requests.exceptions.RequestException as e:
            if is_last_attempt:
                raise
            print(f"Request failed ({e}). Retrying...")
            time.sleep(RETRY_DELAY_SECONDS)
            continue

        if response.status_code == 429:
            if is_last_attempt:
                # No attempts left: waiting would only delay the return.
                break
            try:
                retry_after = int(response.headers.get("Retry-After"))
            except (TypeError, ValueError):
                # Header absent, or given in a non-integer form (HTTP-date).
                retry_after = RETRY_DELAY_SECONDS
            # time.sleep() rejects negative values.
            retry_after = max(0, retry_after)
            print(f"Rate limited. Waiting {retry_after}s...")
            time.sleep(retry_after)
            continue

        if response.status_code >= 500:
            if is_last_attempt:
                break
            print(f"Server error {response.status_code}. Retrying...")
            time.sleep(RETRY_DELAY_SECONDS)
            continue

        return response

    # Retries exhausted on 429/5xx: hand the last response back to the caller.
    return response


def wait_for_lro(operation_url: str) -> dict:
    """Wait for a long-running operation to complete.

    Polls ``operation_url`` every LRO_POLL_INTERVAL_SECONDS until the reported
    status is terminal or LRO_MAX_WAIT_SECONDS have elapsed.

    Args:
        operation_url: URL to poll for the operation state.

    Returns:
        The parsed JSON body of the poll response whose status is
        "Succeeded" or "Completed".

    Raises:
        Exception: If the operation reports "Failed" or "Cancelled".
        RuntimeError: If polling gets a non-retryable 4xx response.
        TimeoutError: If no terminal status is seen before the deadline.
    """
    start_time = time.time()
    last_observed = "no response"

    while time.time() - start_time < LRO_MAX_WAIT_SECONDS:
        headers = get_headers()
        response = requests.get(operation_url, headers=headers, timeout=60)
        status_code = response.status_code

        if status_code == 200:
            result = response.json()
            status = result.get("status", "Unknown")
            last_observed = f"status {status}"

            if status in ["Succeeded", "Completed"]:
                return result
            elif status in ["Failed", "Cancelled"]:
                raise Exception(f"Operation {status}: {result.get('error', {})}")
        elif 400 <= status_code < 500 and status_code not in (408, 429):
            # A client error such as 401/403/404 will not fix itself; polling
            # on would only end in a misleading timeout.
            raise RuntimeError(
                f"Polling operation failed: {status_code} - {response.text}"
            )
        else:
            # Throttling, request timeout or server error: keep polling.
            last_observed = f"HTTP {status_code}"

        time.sleep(LRO_POLL_INTERVAL_SECONDS)

    raise TimeoutError(
        f"Operation timed out after {LRO_MAX_WAIT_SECONDS}s "
        f"(last observed: {last_observed})"
    )

## Analyze Gold Tables

In [None]:
# Inspect the nodes table: schema, row count and the distinct labels present
df_nodes = spark.table(NODES_TABLE)
print("gold_nodes schema:")
df_nodes.printSchema()

node_count = df_nodes.count()
print(f"\nTotal nodes: {node_count}")

# Explode the labels array so each label becomes its own row, then de-duplicate
labels = (
    df_nodes
    .select(F.explode("labels").alias("label"))
    .distinct()
    .collect()
)
print(f"Node labels: {[row['label'] for row in labels]}")

In [None]:
# Inspect the edges table: schema, row count and the distinct edge types present
df_edges = spark.table(EDGES_TABLE)
print("gold_edges schema:")
df_edges.printSchema()

edge_count = df_edges.count()
print(f"\nTotal edges: {edge_count}")

# Collect the distinct values of the "type" column
edge_types = (
    df_edges
    .select("type")
    .distinct()
    .collect()
)
print(f"Edge types: {[row['type'] for row in edge_types]}")

## Generate Data Binding Definitions

In [None]:
def generate_entity_binding(
    entity_type_id: str,
    entity_type_name: str,
    properties: List[Dict],
    workspace_id: str,
    lakehouse_id: str,
    table_name: str
) -> dict:
    """
    Build the NonTimeSeries data binding for a single entity type.

    Only the property named 'uri' is bound (to the source column 'id').
    Every other property is skipped for now, because those values live in
    the 'properties' map column rather than in top-level columns.

    NOTE(review): the emitted field names should be confirmed against the
    Fabric ontology definition specification.

    Args:
        entity_type_id: ID of the entity type (not used in the binding body)
        entity_type_name: Name of the entity type, used as the label filter
        properties: Property definitions, each with 'id' and 'name'
        workspace_id: Workspace ID of the data source
        lakehouse_id: Lakehouse ID of the data source
        table_name: Source table name

    Returns:
        Dict with a random 13-character hex 'id' and a
        'dataBindingConfiguration' section
    """
    import uuid

    # First 13 hex characters of a random UUID4
    binding_id = uuid.uuid4().hex[:13]

    uri_bindings = []
    for prop in properties:
        name, prop_id = prop["name"], prop["id"]
        if name != "uri":
            # TODO: Add support for flattened property columns
            continue
        # The 'uri' property is backed by the 'id' column
        uri_bindings.append({
            "sourceColumnName": "id",
            "propertyId": prop_id
        })

    # Key and display name both use the 'uri' property when present,
    # otherwise the first property, otherwise None.
    fallback_id = properties[0]["id"] if properties else None
    key_property_id = next(
        (p["id"] for p in properties if p["name"] == "uri"),
        fallback_id
    )

    data_source = {
        "workspaceId": workspace_id,
        "lakehouseId": lakehouse_id,
        "tableName": table_name
    }

    configuration = {
        "dataBindingType": "NonTimeSeries",
        "propertyBindings": uri_bindings,
        "entityKeyPropertyId": key_property_id,
        "displayNamePropertyId": key_property_id,
        "dataSource": data_source,
        # Restrict rows to those carrying this entity type's label
        "entityFilter": f"array_contains(labels, '{entity_type_name}')"
    }

    return {
        "id": binding_id,
        "dataBindingConfiguration": configuration
    }


# Build one binding record per entity type, all sourced from the nodes table
entity_bindings = [
    {
        "entity_type_id": et["id"],
        "entity_type_name": et["name"],
        "binding": generate_entity_binding(
            entity_type_id=et["id"],
            entity_type_name=et["name"],
            properties=et.get("properties", []),
            workspace_id=workspace_id,
            lakehouse_id=lakehouse_id,
            table_name=NODES_TABLE
        )
    }
    for et in entity_types
]

print(f"Generated {len(entity_bindings)} entity type bindings")

In [None]:
# Show sample binding
# Pretty-print the first generated entity binding record as a visual sanity
# check; nothing is printed when no entity bindings were generated.
if entity_bindings:
    print("Sample entity binding:")
    print(json.dumps(entity_bindings[0], indent=2))

In [None]:
def generate_relationship_binding(
    relationship_type_id: str,
    relationship_type_name: str,
    workspace_id: str,
    lakehouse_id: str,
    table_name: str
) -> dict:
    """
    Build the binding for a single relationship type.

    Source and target entity keys are read from the 'source_id' and
    'target_id' columns, and rows are restricted to those whose 'type'
    equals the relationship type name.

    NOTE(review): the emitted field names should be confirmed against the
    Fabric ontology definition specification.

    Args:
        relationship_type_id: ID of the relationship type (not used in the
            binding body)
        relationship_type_name: Name of the relationship type, used in the
            row filter
        workspace_id: Workspace ID of the data source
        lakehouse_id: Lakehouse ID of the data source
        table_name: Source table name (gold_edges)

    Returns:
        Dict with a random 13-character hex 'id' and a
        'relationshipBindingConfiguration' section
    """
    import uuid

    data_source = {
        "workspaceId": workspace_id,
        "lakehouseId": lakehouse_id,
        "tableName": table_name
    }

    configuration = {
        "sourceEntityKeyColumn": "source_id",
        "targetEntityKeyColumn": "target_id",
        "dataSource": data_source,
        # Restrict rows to this relationship type
        "relationshipFilter": f"type = '{relationship_type_name}'"
    }

    return {
        # First 13 hex characters of a random UUID4
        "id": uuid.uuid4().hex[:13],
        "relationshipBindingConfiguration": configuration
    }


# Build one binding record per relationship type, all sourced from the edges table
relationship_bindings = [
    {
        "relationship_type_id": rt["id"],
        "relationship_type_name": rt["name"],
        "binding": generate_relationship_binding(
            relationship_type_id=rt["id"],
            relationship_type_name=rt["name"],
            workspace_id=workspace_id,
            lakehouse_id=lakehouse_id,
            table_name=EDGES_TABLE
        )
    }
    for rt in relationship_types
]

print(f"Generated {len(relationship_bindings)} relationship type bindings")

## Encode Bindings as Definition Parts

In [None]:
def encode_payload(data: dict) -> str:
    """Serialize *data* as 2-space-indented JSON and return it as base64 text."""
    raw_bytes = json.dumps(data, indent=2).encode('utf-8')
    encoded_bytes = base64.b64encode(raw_bytes)
    return str(encoded_bytes, 'utf-8')


# Create definition parts for data bindings: entity parts first, then
# relationship parts. Each part carries its path and the base64-encoded binding.
# NOTE(review): confirm the part path layout against the Fabric ontology
# definition documentation.
binding_parts = [
    {
        "path": f"EntityTypes/{eb['entity_type_id']}/DataBindings/{eb['binding']['id']}.json",
        "payload": encode_payload(eb['binding']),
        "payloadType": "InlineBase64"
    }
    for eb in entity_bindings
]

binding_parts.extend(
    {
        "path": f"RelationshipTypes/{rb['relationship_type_id']}/DataBindings/{rb['binding']['id']}.json",
        "payload": encode_payload(rb['binding']),
        "payloadType": "InlineBase64"
    }
    for rb in relationship_bindings
)

print(f"Total binding parts to upload: {len(binding_parts)}")

## Upload Data Bindings

In [None]:
def update_ontology_definition(workspace_id: str, ontology_id: str, definition_parts: list) -> dict:
    """
    Update the ontology definition with data bindings.

    Posts ``definition_parts`` to the ontology's updateDefinition endpoint
    and, when the service answers asynchronously, polls the operation until
    it finishes.

    NOTE(review): only the parts passed in are sent; confirm whether the
    service treats the payload as a full replacement of the definition.

    Args:
        workspace_id: Workspace containing the ontology.
        ontology_id: Ontology item to update.
        definition_parts: Parts, each with 'path', 'payload' and 'payloadType'.

    Returns:
        The parsed response body for a synchronous 200 (an empty dict when
        the body is empty), or the final operation state for a 202.

    Raises:
        RuntimeError: If a 202 response carries no header to poll.
        Exception: If the request fails with any other status code.
    """
    endpoint = f"{FABRIC_API_VERSION}/workspaces/{workspace_id}/ontologies/{ontology_id}/updateDefinition"

    data = {
        "definition": {
            "parts": definition_parts
        }
    }

    response = api_request("POST", endpoint, data=data)

    if response.status_code == 200:
        print("Definition updated successfully")
        # A synchronous success may carry no body; .json() on an empty body raises.
        return response.json() if response.content else {}

    if response.status_code == 202:
        operation_url = response.headers.get("Location")
        if not operation_url:
            # Fall back to the operation ID header and build the state URL from it.
            operation_id = response.headers.get("x-ms-operation-id")
            if operation_id:
                operation_url = f"{FABRIC_API_BASE}/{FABRIC_API_VERSION}/operations/{operation_id}"
        if not operation_url:
            raise RuntimeError(
                "Update was accepted (202) but the response has no Location "
                "or x-ms-operation-id header to poll"
            )
        print("Update is async. Polling...")
        return wait_for_lro(operation_url)

    print(f"Failed to update: {response.status_code}")
    print(response.text)
    raise Exception(f"Update failed: {response.status_code} - {response.text}")

In [None]:
# Upload the data binding parts
print(f"Uploading {len(binding_parts)} data binding definitions...")

try:
    # result keeps whatever update_ontology_definition returned
    result = update_ontology_definition(workspace_id, ontology_id, binding_parts)
    print("\nData bindings uploaded successfully!")
except Exception as e:
    # Print troubleshooting hints, then re-raise so the notebook run still fails
    print(f"\nFailed to upload data bindings: {e}")
    print("\nNote: Data binding may require:")
    print("  - OneLake security disabled on lakehouse")
    print("  - Managed Delta tables (not external)")
    print("  - Column mapping disabled on tables")
    raise

## Save Binding Configuration

In [None]:
# Save binding configuration for reference
# Take the clock reading once so the file name and the created_at field
# always describe the same instant.
created_at = datetime.now()
timestamp = created_at.strftime("%Y%m%d_%H%M%S")

binding_config = {
    "created_at": created_at.isoformat(),
    "ontology_id": ontology_id,
    "workspace_id": workspace_id,
    "lakehouse_id": lakehouse_id,
    "nodes_table": NODES_TABLE,
    "edges_table": EDGES_TABLE,
    "entity_bindings_count": len(entity_bindings),
    "relationship_bindings_count": len(relationship_bindings),
    "entity_bindings": entity_bindings,
    "relationship_bindings": relationship_bindings
}

binding_config_path = os.path.join(DEFINITIONS_DIR, f"data_bindings_{timestamp}.json")
# Write UTF-8 explicitly rather than relying on the platform default encoding.
with open(binding_config_path, 'w', encoding="utf-8") as f:
    json.dump(binding_config, f, indent=2)

print(f"Saved binding configuration to: {binding_config_path}")

## Summary

In [None]:
# Final report: what was bound, from which tables, and what to do next
banner = "=" * 60

print(banner)
print("Lakehouse Data Binding Complete")
print(banner)
print(f"\nOntology ID: {ontology_id}")
print(f"Lakehouse ID: {lakehouse_id}")
print(f"\nEntity Type Bindings: {len(entity_bindings)}")
print(f"Relationship Bindings: {len(relationship_bindings)}")
print("\nSource Tables:")
print(f"  Nodes: {NODES_TABLE} ({node_count} rows)")
print(f"  Edges: {EDGES_TABLE} ({edge_count} rows)")

print("\n" + banner)
print("Next Steps:")
print(banner)
print("1. Go to Fabric portal → Ontology → View your ontology")
print("2. Verify entity types show data counts")
print("3. Query the materialized Graph!")
print("4. (Optional) Connect Data Agent for NL2Ontology queries")