# 09 - Lakehouse Data Binding

**Epic:** F5 - Fabric Ontology Integration  
**Feature:** F5.3 - Lakehouse Data Binding  
**Priority:** P1

## Purpose

Generate and upload data bindings that connect Ontology entity types and relationship types to Lakehouse Delta tables. Once bound, the Ontology can be materialized as a queryable graph.

## Input

- Ontology configuration from `Files/config/ontology_config.json`
- Entity/relationship type definitions from `Files/ontology_definitions/`
- Gold tables: `gold_nodes`, `gold_edges`

## Output

- Data bindings uploaded to Ontology via REST API
- Ontology connected to Lakehouse data

## Binding Types

| Type | Source | Purpose |
|------|--------|----------|
| Static (NonTimeSeries) | gold_nodes | Entity instance data |
| Relationship | gold_edges | Relationships between entities |

## Setup

In [None]:
import json
import os
import base64
import time
import requests
from datetime import datetime
from typing import Optional, List, Dict
from pyspark.sql import functions as F

In [None]:
# Fabric notebookutils for authentication
from notebookutils import mssparkutils

## Configuration

In [None]:
# Fabric API configuration
# The base URL and version segment are joined to build the REST endpoint URLs
# used by the request helpers in this notebook.
FABRIC_API_BASE = "https://api.fabric.microsoft.com"
FABRIC_API_VERSION = "v1"

# Paths
# DEFINITIONS_DIR: directory the binding configuration snapshot is written to
# at the end of this notebook.
# CONFIG_DIR: directory that ontology_config.json is read from.
DEFINITIONS_DIR = "/lakehouse/default/Files/ontology_definitions"
CONFIG_DIR = "/lakehouse/default/Files/config"

# Gold tables to bind
# NODES_TABLE is the source table for entity bindings; EDGES_TABLE is the
# source table for relationship contextualizations.
NODES_TABLE = "gold_nodes"
EDGES_TABLE = "gold_edges"

# Retry configuration
# MAX_RETRIES: number of attempts api_request makes per call.
# RETRY_DELAY_SECONDS: pause between attempts (also the fallback when a 429
# response carries no Retry-After header).
# LRO_POLL_INTERVAL_SECONDS: pause between long-running-operation polls.
# LRO_MAX_WAIT_SECONDS: default polling budget for wait_for_lro.
MAX_RETRIES = 3
RETRY_DELAY_SECONDS = 5
LRO_POLL_INTERVAL_SECONDS = 2
LRO_MAX_WAIT_SECONDS = 300

## Load Configuration

In [None]:
# Read the ontology configuration (the error message below points at
# notebook 08 as its producer).
config_path = os.path.join(CONFIG_DIR, "ontology_config.json")

config_file_present = os.path.exists(config_path)
if not config_file_present:
    # Stop early with an actionable message instead of failing on open().
    raise FileNotFoundError(
        f"Ontology config not found at {config_path}. "
        "Please run notebook 08 first."
    )

with open(config_path, 'r') as config_file:
    config = json.load(config_file)

# The three identifiers every later cell depends on.
ontology_id = config["ontology_id"]
workspace_id = config["workspace_id"]
lakehouse_id = config["lakehouse_id"]

print(f"Ontology ID: {ontology_id}")
print(f"Workspace ID: {workspace_id}")
print(f"Lakehouse ID: {lakehouse_id}")

In [None]:
# Load entity and relationship types from the UPLOADED ontology via API
# This ensures IDs match exactly what Fabric has registered

def get_ontology_definition_from_api(workspace_id: str, ontology_id: str) -> dict:
    """Fetch the current ontology definition from the Fabric API.

    POSTs to the ontology's ``getDefinition`` endpoint. A 200 response is
    returned as parsed JSON. A 202 response is handled as a long-running
    operation (LRO): the URL from the ``Location`` header is polled every
    2 seconds for up to 120 seconds. Once the operation reports
    ``Succeeded``/``Completed`` the definition is read from the operation's
    ``/result`` endpoint; if that does not yield a definition, the function
    falls back to re-calling ``getDefinition`` and finally to the polled
    status body itself.

    Args:
        workspace_id: ID of the workspace that contains the ontology.
        ontology_id: ID of the ontology item.

    Returns:
        The parsed JSON response containing the definition.

    Raises:
        TimeoutError: If the LRO does not finish within 120 seconds.
        Exception: If the initial call fails, the 202 response has no
            ``Location`` header, the LRO reports ``Failed``/``Cancelled``,
            or no definition can be obtained after the LRO completes.
    """
    token = mssparkutils.credentials.getToken("https://api.fabric.microsoft.com")
    headers = {
        "Authorization": f"Bearer {token}",
        "Content-Type": "application/json"
    }

    # Get definition (may be async)
    url = f"{FABRIC_API_BASE}/{FABRIC_API_VERSION}/workspaces/{workspace_id}/ontologies/{ontology_id}/getDefinition"
    response = requests.post(url, headers=headers, timeout=120)

    if response.status_code == 200:
        return response.json()
    if response.status_code != 202:
        raise Exception(f"Get definition failed: {response.status_code} - {response.text}")

    # Long-running operation - poll for completion
    operation_url = response.headers.get("Location")
    if not operation_url:
        # Without a Location header there is nothing to poll; fail clearly
        # instead of passing None to requests.get().
        raise Exception("Get definition failed: 202 response without a Location header")

    print(f"  Async operation, polling...")
    max_wait = 120
    start = time.time()
    while time.time() - start < max_wait:
        r = requests.get(operation_url, headers=headers, timeout=60)
        if r.status_code == 200:
            result = r.json()
            status = result.get("status")
            if status in ["Succeeded", "Completed"]:
                print(f"  LRO completed, fetching definition...")

                # Preferred path: the operation result endpoint.
                # NOTE(review): assumes the Location URL has no query string,
                # so "/result" can simply be appended - confirm against the API.
                result_url = f"{operation_url.rstrip('/')}/result"
                result_response = requests.get(result_url, headers=headers, timeout=120)
                if result_response.status_code == 200:
                    try:
                        result_body = result_response.json()
                    except ValueError:
                        result_body = None
                    if isinstance(result_body, dict) and "definition" in result_body:
                        return result_body

                # Fallback: re-call getDefinition - it may return 200 now
                final_response = requests.post(url, headers=headers, timeout=120)
                if final_response.status_code == 200:
                    return final_response.json()

                print(f"  Re-fetch returned {final_response.status_code}")
                # Last resort: the polled status body may embed the definition
                if "definition" in result:
                    return result
                raise Exception(f"Could not get definition after LRO: {final_response.status_code}")
            elif status in ["Failed", "Cancelled"]:
                raise Exception(f"Get definition failed: {result}")
        time.sleep(2)
    raise TimeoutError("Get definition timed out")

def parse_ontology_definition(api_response: dict) -> tuple:
    """Parse entity types and relationship types from a getDefinition response.

    Every entry of ``api_response["definition"]["parts"]`` is expected to
    carry a ``path`` and a base64-encoded JSON ``payload``. Parts whose
    payload cannot be decoded, or whose decoded JSON is not an object, are
    skipped. Parts whose path contains ``EntityTypes/`` or
    ``RelationshipTypes/`` and ends with ``/definition.json`` are collected;
    all other parts are ignored.

    Args:
        api_response: Parsed JSON body of the getDefinition call.

    Returns:
        A ``(entity_types, relationship_types)`` tuple of lists of dicts.
        IDs are converted to strings. Each entity type has ``id``, ``name``,
        ``namespace`` and ``properties``; each relationship type has ``id``,
        ``name``, ``namespace``, ``source`` and ``target`` (the last two are
        ``{"entityTypeId": ...}`` or ``{}`` when no entity type ID is given).
    """
    entity_types = []
    relationship_types = []

    parts = api_response.get("definition", {}).get("parts", [])

    print(f"  Found {len(parts)} parts in definition")

    # Debug: show first few paths to understand format
    sample_paths = [p.get("path", "") for p in parts[:5]]
    print(f"  Sample paths: {sample_paths}")

    for part in parts:
        path = part.get("path", "")
        payload = part.get("payload", "")

        try:
            content = json.loads(base64.b64decode(payload).decode("utf-8"))
        except (ValueError, TypeError):
            # ValueError covers bad base64, bad UTF-8 and bad JSON;
            # TypeError covers a payload that is not a string/bytes (e.g. None).
            continue

        # Only JSON objects can describe a type; anything else (list, string,
        # number, null) would break the .get() calls below.
        if not isinstance(content, dict):
            continue

        # Entity type definition files: EntityTypes/{id}/definition.json
        # Handle both with and without leading slash
        if "EntityTypes/" in path and path.endswith("/definition.json"):
            # Convert IDs to strings for consistency; tolerate a null
            # "properties" value and non-object property entries.
            raw_properties = content.get("properties") or []
            entity_types.append({
                "id": str(content.get("id")),
                "name": content.get("name"),
                "namespace": content.get("namespace"),
                "properties": [
                    {**p, "id": str(p.get("id"))}
                    for p in raw_properties
                    if isinstance(p, dict)
                ]
            })

        # Relationship type definition files: RelationshipTypes/{id}/definition.json
        elif "RelationshipTypes/" in path and path.endswith("/definition.json"):
            # Tolerate a null or non-object "source"/"target" value.
            source = content.get("source")
            target = content.get("target")
            if not isinstance(source, dict):
                source = {}
            if not isinstance(target, dict):
                target = {}
            relationship_types.append({
                "id": str(content.get("id")),
                "name": content.get("name"),
                "namespace": content.get("namespace"),
                "source": {"entityTypeId": str(source.get("entityTypeId"))} if source.get("entityTypeId") else {},
                "target": {"entityTypeId": str(target.get("entityTypeId"))} if target.get("entityTypeId") else {}
            })

    return entity_types, relationship_types

# Retrieve the definition Fabric currently holds and split it into types
print("Fetching ontology definition from Fabric API...")
api_response = get_ontology_definition_from_api(workspace_id, ontology_id)

# Debug aid: top-level keys of the response and of its "definition" object
print(f"  Response keys: {list(api_response.keys())}")
if "definition" in api_response:
    print(f"  Definition keys: {list(api_response['definition'].keys())}")

parsed_types = parse_ontology_definition(api_response)
entity_types, relationship_types = parsed_types

print(f"Loaded {len(entity_types)} entity types from ontology")
print(f"Loaded {len(relationship_types)} relationship types from ontology")

# Show the first entity type, if any, as a sanity check
for sample in entity_types[:1]:
    print(f"\nSample entity type:")
    print(f"  ID: {sample['id']}")
    print(f"  Name: {sample['name']}")
    print(f"  Properties: {len(sample.get('properties', []))} defined")

## Authentication

In [None]:
def get_fabric_token() -> str:
    """Return an access token for the Fabric API from notebook credentials."""
    fabric_audience = "https://api.fabric.microsoft.com"
    return mssparkutils.credentials.getToken(fabric_audience)

def get_headers() -> dict:
    """Build JSON request headers carrying a freshly fetched bearer token."""
    bearer_value = f"Bearer {get_fabric_token()}"
    request_headers = {
        "Authorization": bearer_value,
        "Content-Type": "application/json"
    }
    return request_headers

# Smoke test: acquiring a token proves the notebook identity can authenticate
token = get_fabric_token()
_token_length = len(token)
print(f"Authentication OK (token length: {_token_length})")

## API Helper Functions

In [None]:
def api_request(
    method: str,
    endpoint: str,
    data: Optional[dict] = None,
    params: Optional[dict] = None,
    timeout: int = 60
) -> requests.Response:
    """
    Make an API request with retry logic and configurable timeout.

    The request is attempted up to MAX_RETRIES times. HTTP 429 and 5xx
    responses, read timeouts and other ``requests`` errors trigger another
    attempt; any other response is returned immediately. Fresh headers
    (and therefore a fresh token) are obtained for every attempt.

    Args:
        method: HTTP method name passed to ``requests.request``.
        endpoint: Path appended to FABRIC_API_BASE.
        data: Optional request body, sent as JSON.
        params: Optional query-string parameters.
        timeout: Request timeout in seconds (default 60, use longer for large
            payloads). Doubled, up to 600 seconds, after each read timeout.

    Returns:
        The first response that is neither 429 nor 5xx, or the last 429/5xx
        response once all attempts are used up.

    Raises:
        requests.exceptions.RequestException: If the final attempt fails with
            a read timeout or another ``requests`` error.
        RuntimeError: If MAX_RETRIES is less than 1, so no attempt was made.
    """
    url = f"{FABRIC_API_BASE}/{endpoint}"
    current_timeout = timeout
    response = None

    for attempt in range(MAX_RETRIES):
        is_last_attempt = attempt >= MAX_RETRIES - 1

        try:
            headers = get_headers()
            response = requests.request(
                method=method,
                url=url,
                headers=headers,
                json=data,
                params=params,
                timeout=current_timeout
            )
        except requests.exceptions.ReadTimeout:
            if is_last_attempt:
                raise
            # Double timeout on each retry for large payloads
            current_timeout = min(current_timeout * 2, 600)  # Max 10 minutes
            print(f"Read timeout. Retrying with {current_timeout}s timeout...")
            continue
        except requests.exceptions.RequestException:
            if is_last_attempt:
                raise
            time.sleep(RETRY_DELAY_SECONDS)
            continue

        if response.status_code == 429:
            if is_last_attempt:
                # No retry follows, so waiting would only delay the caller.
                break
            # Retry-After may be missing, an HTTP-date, or malformed; fall
            # back to the fixed delay whenever it is not a plain integer.
            try:
                retry_after = max(0, int(response.headers.get("Retry-After")))
            except (TypeError, ValueError):
                retry_after = RETRY_DELAY_SECONDS
            print(f"Rate limited. Waiting {retry_after}s...")
            time.sleep(retry_after)
            continue

        if response.status_code >= 500:
            if is_last_attempt:
                break
            print(f"Server error {response.status_code}. Retrying...")
            time.sleep(RETRY_DELAY_SECONDS)
            continue

        return response

    if response is None:
        raise RuntimeError("api_request made no attempt: MAX_RETRIES must be at least 1")

    # Attempts exhausted on 429/5xx: hand the last response to the caller,
    # which decides how to report it.
    return response


def wait_for_lro(operation_url: str, max_wait: int = None) -> dict:
    """Poll a long-running operation until it finishes or the budget runs out.

    Args:
        operation_url: URL of the operation status resource to poll.
        max_wait: Polling budget in seconds; a falsy value selects
            LRO_MAX_WAIT_SECONDS.

    Returns:
        The status body of the operation once it reports ``Succeeded`` or
        ``Completed``.

    Raises:
        Exception: If the operation reports ``Failed`` or ``Cancelled``.
        TimeoutError: If the budget elapses first.
    """
    if not max_wait:
        max_wait = LRO_MAX_WAIT_SECONDS

    finished_ok = ("Succeeded", "Completed")
    finished_bad = ("Failed", "Cancelled")
    began = time.time()

    while time.time() - began < max_wait:
        # Headers are rebuilt on every poll, so each poll fetches a token.
        poll = requests.get(operation_url, headers=get_headers(), timeout=120)

        # Non-200 polls are ignored and simply retried after the interval.
        if poll.status_code == 200:
            body = poll.json()
            state = body.get("status", "Unknown")

            if state in finished_ok:
                return body
            if state in finished_bad:
                raise Exception(f"Operation {state}: {body.get('error', {})}")
            print(f"Operation status: {state}")

        time.sleep(LRO_POLL_INTERVAL_SECONDS)

    raise TimeoutError(f"Operation timed out after {max_wait}s")

## Analyze Gold Tables

In [None]:
# Inspect the nodes table: schema, row count and the distinct labels in use
df_nodes = spark.table(NODES_TABLE)
print(f"gold_nodes schema:")
df_nodes.printSchema()

node_count = df_nodes.count()
print(f"\nTotal nodes: {node_count}")

# "labels" is exploded, so each array element becomes its own row
exploded_labels = df_nodes.select(F.explode("labels").alias("label"))
labels = exploded_labels.distinct().collect()
label_names = [row['label'] for row in labels]
print(f"Node labels: {label_names}")

In [None]:
# Inspect the edges table: schema, row count and the distinct edge types
df_edges = spark.table(EDGES_TABLE)
print(f"gold_edges schema:")
df_edges.printSchema()

edge_count = df_edges.count()
print(f"\nTotal edges: {edge_count}")

distinct_types = df_edges.select("type").distinct()
edge_types = distinct_types.collect()
edge_type_names = [row['type'] for row in edge_types]
print(f"Edge types: {edge_type_names}")

## Generate Data Binding Definitions

In [None]:
def generate_entity_binding(
    entity_type_id: str,
    entity_type_name: str,
    properties: List[Dict],
    workspace_id: str,
    lakehouse_id: str,
    table_name: str
) -> dict:
    """
    Build the data binding document that ties one entity type to a table.

    The result has a freshly generated UUID as ``id`` and a
    ``dataBindingConfiguration`` with:
    - dataBindingType: always "NonTimeSeries"
    - propertyBindings[]: sourceColumnName + targetPropertyId pairs
    - sourceTableProperties: sourceType, workspaceId, itemId,
      sourceTableName, sourceSchema

    Only properties named "uri" are bound (to the "id" column); every other
    property is left unbound. ``entity_type_id`` and ``entity_type_name`` are
    accepted but not used when building the document.
    """
    from uuid import uuid4

    binding_id = str(uuid4())

    # Read name and id of every property up front, then keep the 'uri' ones.
    # Other properties would need flattened columns in the source table.
    # TODO: Create entity-specific views with flattened properties
    name_id_pairs = ((prop["name"], prop["id"]) for prop in properties)
    property_bindings = [
        # The key must be targetPropertyId, not propertyId
        {"sourceColumnName": "id", "targetPropertyId": prop_id}
        for prop_name, prop_id in name_id_pairs
        if prop_name == "uri"
    ]

    source_table = {
        "sourceType": "LakehouseTable",
        "workspaceId": workspace_id,
        "itemId": lakehouse_id,  # itemId = Lakehouse artifact ID
        "sourceTableName": table_name,
        "sourceSchema": "dbo"
    }
    configuration = {
        "dataBindingType": "NonTimeSeries",
        "propertyBindings": property_bindings,
        "sourceTableProperties": source_table
    }

    return {"id": binding_id, "dataBindingConfiguration": configuration}


# One binding per entity type, all sourced from the nodes table
entity_bindings = [
    {
        "entity_type_id": et["id"],
        "entity_type_name": et["name"],
        "binding": generate_entity_binding(
            entity_type_id=et["id"],
            entity_type_name=et["name"],
            properties=et.get("properties", []),
            workspace_id=workspace_id,
            lakehouse_id=lakehouse_id,
            table_name=NODES_TABLE
        )
    }
    for et in entity_types
]

print(f"Generated {len(entity_bindings)} entity type bindings")

In [None]:
# Print the first generated binding (if any) for a visual check
for first_binding in entity_bindings[:1]:
    print("Sample entity binding:")
    print(json.dumps(first_binding, indent=2))

In [None]:
def generate_relationship_contextualization(
    relationship_type_id: str,
    relationship_type_name: str,
    source_entity_id: str,
    target_entity_id: str,
    source_uri_prop_id: str,
    target_uri_prop_id: str,
    workspace_id: str,
    lakehouse_id: str,
    table_name: str
) -> dict:
    """
    Build the contextualization document for one relationship type.

    The result has a freshly generated UUID as ``id`` plus:
    - dataBindingTable: source table info
    - sourceKeyRefBindings[]: maps the "source_id" column to the source
      entity's key property
    - targetKeyRefBindings[]: maps the "target_id" column to the target
      entity's key property

    The relationship and entity identifiers/names are accepted but not used
    when building the document; only the two property IDs and the table
    location end up in the result.
    """
    from uuid import uuid4

    def key_ref(column_name: str, property_id: str) -> list:
        # A single-entry key reference list: column -> target property
        return [{"sourceColumnName": column_name, "targetPropertyId": property_id}]

    binding_id = str(uuid4())

    table_info = {
        "sourceType": "LakehouseTable",
        "workspaceId": workspace_id,
        "itemId": lakehouse_id,
        "sourceTableName": table_name,
        "sourceSchema": "dbo"
    }

    return {
        "id": binding_id,
        "dataBindingTable": table_info,
        # source_id column -> source entity's uri property
        "sourceKeyRefBindings": key_ref("source_id", source_uri_prop_id),
        # target_id column -> target entity's uri property
        "targetKeyRefBindings": key_ref("target_id", target_uri_prop_id)
    }


# Map each entity type ID to the ID of its first 'uri' property.
# Relationship key bindings need this to reference the entity keys.
entity_uri_props = {}
for et in entity_types:
    uri_prop = next(
        (prop for prop in et.get("properties", []) if prop["name"] == "uri"),
        None
    )
    if uri_prop is not None:
        entity_uri_props[et["id"]] = uri_prop["id"]

print(f"Entity URI property lookup: {len(entity_uri_props)} entries")

# One contextualization per relationship type whose two endpoints both have
# a known URI property; the rest are reported and skipped.
relationship_bindings = []

for rt in relationship_types:
    # Source and target are objects with entityTypeId from API
    source_id = rt.get("source", {}).get("entityTypeId")
    target_id = rt.get("target", {}).get("entityTypeId")

    # Look up the key (URI) property of each endpoint
    source_uri_prop = entity_uri_props.get(source_id)
    target_uri_prop = entity_uri_props.get(target_id)

    if source_uri_prop and target_uri_prop:
        relationship_bindings.append({
            "relationship_type_id": rt["id"],
            "relationship_type_name": rt["name"],
            "binding": generate_relationship_contextualization(
                relationship_type_id=rt["id"],
                relationship_type_name=rt["name"],
                source_entity_id=source_id,
                target_entity_id=target_id,
                source_uri_prop_id=source_uri_prop,
                target_uri_prop_id=target_uri_prop,
                workspace_id=workspace_id,
                lakehouse_id=lakehouse_id,
                table_name=EDGES_TABLE
            )
        })
    else:
        print(f"  Warning: Skipping '{rt['name']}' - missing URI property for source or target")

print(f"Generated {len(relationship_bindings)} relationship type contextualizations")

## Encode Bindings as Definition Parts

In [None]:
def encode_payload(data: dict) -> str:
    """Serialize *data* as 2-space-indented JSON and return it base64-encoded."""
    raw_bytes = json.dumps(data, indent=2).encode('utf-8')
    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode('utf-8')


def _as_inline_part(path: str, data: dict) -> dict:
    """Wrap *data* as an inline base64 definition part stored at *path*."""
    return {
        "path": path,
        "payload": encode_payload(data),
        "payloadType": "InlineBase64"
    }


# Required .platform metadata
platform_metadata = {
    "metadata": {
        "type": "Ontology",
        "displayName": "RDF Translated Ontology"
    }
}

# Definition parts for the data bindings.
# IMPORTANT: .platform and definition.json must be part of every
# updateDefinition call; definition.json is an empty object per the
# Fabric Ontology spec.
binding_parts = [
    _as_inline_part(".platform", platform_metadata),
    _as_inline_part("definition.json", {}),
]

# Entity type data bindings
binding_parts.extend(
    _as_inline_part(
        f"EntityTypes/{eb['entity_type_id']}/DataBindings/{eb['binding']['id']}.json",
        eb['binding']
    )
    for eb in entity_bindings
)

# Relationship type bindings (per Fabric spec, relationships use
# Contextualizations, not DataBindings)
binding_parts.extend(
    _as_inline_part(
        f"RelationshipTypes/{rb['relationship_type_id']}/Contextualizations/{rb['binding']['id']}.json",
        rb['binding']
    )
    for rb in relationship_bindings
)

print(f"Total binding parts to upload: {len(binding_parts)}")
print(f"  - Base files: 2 (.platform, definition.json)")
print(f"  - Entity bindings: {len(entity_bindings)}")
print(f"  - Relationship bindings: {len(relationship_bindings)}")

## Upload Data Bindings

In [None]:
def update_ontology_definition(workspace_id: str, ontology_id: str, definition_parts: list) -> dict:
    """
    Update the ontology definition with data bindings.

    POSTs the parts to the ontology's ``updateDefinition`` endpoint. The
    request timeout and the long-running-operation budget both scale with
    the number of parts.

    Args:
        workspace_id: ID of the workspace that contains the ontology.
        ontology_id: ID of the ontology item to update.
        definition_parts: Definition parts (path, payload, payloadType) to send.

    Returns:
        The parsed response body for a synchronous 200 (an empty dict when
        the body is empty or not JSON), or the final operation status when
        the update ran as a long-running operation.

    Raises:
        Exception: If the response is neither 200 nor a 202 with a
            ``Location`` header; errors from ``wait_for_lro`` propagate.
    """
    endpoint = f"{FABRIC_API_VERSION}/workspaces/{workspace_id}/ontologies/{ontology_id}/updateDefinition"

    data = {
        "definition": {
            "parts": definition_parts
        }
    }

    # Calculate timeout based on number of parts (at least 5 minutes, 2s per part)
    upload_timeout = max(300, len(definition_parts) * 2)
    print(f"Using {upload_timeout}s timeout for {len(definition_parts)} parts...")

    response = api_request("POST", endpoint, data=data, timeout=upload_timeout)

    if response.status_code == 200:
        print("Definition updated successfully")
        # A successful update may come back without a body; parsing an empty
        # body would raise and make the caller report a failure.
        if not response.content:
            return {}
        try:
            return response.json()
        except ValueError:
            return {}
    elif response.status_code == 202:
        operation_url = response.headers.get("Location")
        print(f"Update is async. Polling for completion...")
        if operation_url:
            # Wait longer for LRO based on parts count
            lro_timeout = max(600, len(definition_parts) * 3)
            return wait_for_lro(operation_url, max_wait=lro_timeout)
        # A 202 without a Location header cannot be tracked, so it falls
        # through to the failure path below.

    print(f"Failed to update: {response.status_code}")
    print(response.text)
    raise Exception(f"Update failed: {response.status_code} - {response.text}")

In [None]:
# Send all binding parts to Fabric in a single definition update
print(f"Uploading {len(binding_parts)} data binding definitions...")

try:
    result = update_ontology_definition(workspace_id, ontology_id, binding_parts)
except Exception as e:
    print(f"\nFailed to upload data bindings: {e}")
    # List possible prerequisites before re-raising the original error
    print("\nNote: Data binding may require:")
    for requirement in (
        "  - OneLake security disabled on lakehouse",
        "  - Managed Delta tables (not external)",
        "  - Column mapping disabled on tables",
    ):
        print(requirement)
    raise
else:
    print("\nData bindings uploaded successfully!")

## Save Binding Configuration

In [None]:
# Write a timestamped snapshot of everything that was generated, for reference
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Identification of what was bound, and where the data comes from
binding_config = {
    "created_at": datetime.now().isoformat(),
    "ontology_id": ontology_id,
    "workspace_id": workspace_id,
    "lakehouse_id": lakehouse_id,
    "nodes_table": NODES_TABLE,
    "edges_table": EDGES_TABLE
}
# Counts first, then the full binding documents
binding_config.update({
    "entity_bindings_count": len(entity_bindings),
    "relationship_bindings_count": len(relationship_bindings),
    "entity_bindings": entity_bindings,
    "relationship_bindings": relationship_bindings
})

snapshot_name = f"data_bindings_{timestamp}.json"
binding_config_path = os.path.join(DEFINITIONS_DIR, snapshot_name)
with open(binding_config_path, 'w') as snapshot_file:
    json.dump(binding_config, snapshot_file, indent=2)

print(f"Saved binding configuration to: {binding_config_path}")

## Summary

In [None]:
# Final report: what was bound, from which tables, and what to do next
_rule = "="*60

print(_rule)
print("Lakehouse Data Binding Complete")
print(_rule)
print(f"\nOntology ID: {ontology_id}")
print(f"Lakehouse ID: {lakehouse_id}")
print(f"\nEntity Type Bindings: {len(entity_bindings)}")
print(f"Relationship Bindings: {len(relationship_bindings)}")
print(f"\nSource Tables:")
print(f"  Nodes: {NODES_TABLE} ({node_count} rows)")
print(f"  Edges: {EDGES_TABLE} ({edge_count} rows)")

print(f"\n" + _rule)
print("Next Steps:")
print(_rule)
for next_step in (
    "1. Go to Fabric portal → Ontology → View your ontology",
    "2. Verify entity types show data counts",
    "3. Query the materialized Graph!",
    "4. (Optional) Connect Data Agent for NL2Ontology queries",
):
    print(next_step)