# 04 - Property to Node Property / Edge Mapping

Translates OWL properties to node properties and edge types.

**Input**: `bronze_triples` Delta table
**Output**: `silver_properties` Delta table

## Translation Rules

| RDF Construct | LPG Equivalent |
|---------------|----------------|
| owl:DatatypeProperty | Node property |
| owl:ObjectProperty | Edge type |
| rdfs:domain | Source node type(s) |
| rdfs:range (datatype) | Property data type |
| rdfs:range (class) | Target node type for edge |

In [None]:
# Configuration
# Name of the table this notebook reads its triples from (via spark.table).
INPUT_TABLE = "bronze_triples"
# Name of the Delta table this notebook overwrites with the property mapping.
OUTPUT_TABLE = "silver_properties"

# Language preference for labels and descriptions (must match notebook 03)
# Fabric Ontology only supports one value per entity - this setting determines which language to use
# The value is compared case-insensitively against the `lang` column of the triples.
PREFERRED_LANGUAGE = "en"  # Change to "nl" for Dutch

In [None]:
from pyspark.sql import functions as F
from pyspark.sql.types import StringType, ArrayType, BooleanType
from pyspark.sql.window import Window
from typing import List, Dict, Optional
import re
import json

# Namespace roots; every vocabulary term below is a root plus a local name.
_RDF_NS = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
_RDFS_NS = "http://www.w3.org/2000/01/rdf-schema#"
_OWL_NS = "http://www.w3.org/2002/07/owl#"
_SKOS_NS = "http://www.w3.org/2004/02/skos/core#"
_SHACL_NS = "http://www.w3.org/ns/shacl#"

# RDF / RDFS vocabulary terms
RDF_TYPE = _RDF_NS + "type"
RDF_PROPERTY = _RDF_NS + "Property"
RDFS_DOMAIN = _RDFS_NS + "domain"
RDFS_RANGE = _RDFS_NS + "range"
RDFS_LABEL = _RDFS_NS + "label"
RDFS_COMMENT = _RDFS_NS + "comment"
RDFS_SUBPROPERTY_OF = _RDFS_NS + "subPropertyOf"

# OWL vocabulary terms
OWL_OBJECT_PROPERTY = _OWL_NS + "ObjectProperty"
OWL_DATATYPE_PROPERTY = _OWL_NS + "DatatypeProperty"
OWL_ANNOTATION_PROPERTY = _OWL_NS + "AnnotationProperty"
OWL_FUNCTIONAL_PROPERTY = _OWL_NS + "FunctionalProperty"
OWL_INVERSE_OF = _OWL_NS + "inverseOf"

# SKOS vocabulary terms
SKOS_PREFLABEL = _SKOS_NS + "prefLabel"
SKOS_DEFINITION = _SKOS_NS + "definition"

# XSD datatypes prefix
XSD_PREFIX = "http://www.w3.org/2001/XMLSchema#"

# Namespace -> short prefix, used only to render URIs compactly for display
PREFIXES = {
    _RDF_NS: "rdf:",
    _RDFS_NS: "rdfs:",
    _OWL_NS: "owl:",
    _SKOS_NS: "skos:",
    _SHACL_NS: "sh:",
    XSD_PREFIX: "xsd:",
}

def shorten_uri(uri: str) -> str:
    """Render a URI in prefixed form (e.g. ``rdfs:label``) for display.

    Returns None for a None input. The first namespace in PREFIXES that the
    URI starts with is replaced by its short prefix; a URI matching no known
    namespace is returned unchanged.
    """
    if uri is None:
        return None
    namespace = next((ns for ns in PREFIXES if uri.startswith(ns)), None)
    if namespace is None:
        return uri
    return PREFIXES[namespace] + uri[len(namespace):]

def extract_local_name(uri: str) -> str:
    """Return the local name of a URI.

    Blank-node identifiers (``_:x``) lose their ``_:`` marker. Otherwise the
    text after the last ``#`` is returned, or - when there is no ``#`` - the
    text after the last ``/``. A value containing neither separator is
    returned as is, and None is passed through.
    """
    if uri is None:
        return None
    if uri.startswith("_:"):
        return uri[2:]
    # '#' takes precedence over '/', so hash namespaces are split at the fragment.
    for separator in ("#", "/"):
        if separator in uri:
            return uri.rsplit(separator, 1)[-1]
    return uri

def sanitize_name(name: str) -> str:
    """Turn an arbitrary name into an identifier-safe property name.

    Every non-alphanumeric (ASCII) character acts as a word separator. The
    first word is lower-cased and each following word is capitalized (first
    letter upper, remainder lower) before the words are concatenated. A
    result that does not start with a letter is prefixed with ``prop``.
    Returns None for None and ``"unknownProperty"`` when no word is left.
    """
    if name is None:
        return None
    words = re.sub(r'[^a-zA-Z0-9]', ' ', name).split()
    if not words:
        return "unknownProperty"
    head, *tail = words
    camel = head.lower() + ''.join(map(str.capitalize, tail))
    # Only ASCII letters and digits survive the regex, so a non-letter start is a digit.
    return camel if camel[0].isalpha() else "prop" + camel

# Register UDFs
# Each helper above is wrapped as a Spark UDF with a string return type so it
# can be applied to DataFrame columns.
extract_local_name_udf = F.udf(extract_local_name, StringType())
sanitize_name_udf = F.udf(sanitize_name, StringType())
shorten_uri_udf = F.udf(shorten_uri, StringType())

In [None]:
# XSD to Spark/LPG type mapping
_XSD_NS = "http://www.w3.org/2001/XMLSchema#"

# (XSD local name, LPG/Spark type name) pairs, in lookup-table order.
_XSD_LOCAL_TYPES = (
    ("string", "string"),
    ("boolean", "boolean"),
    ("integer", "integer"),
    ("int", "integer"),
    ("long", "long"),
    ("short", "integer"),
    ("byte", "integer"),
    ("decimal", "double"),
    ("float", "float"),
    ("double", "double"),
    ("date", "date"),
    ("dateTime", "timestamp"),
    ("time", "string"),  # Time as string
    ("duration", "string"),
    ("anyURI", "string"),
    ("positiveInteger", "integer"),
    ("nonNegativeInteger", "integer"),
    ("negativeInteger", "integer"),
    ("nonPositiveInteger", "integer"),
)

XSD_TYPE_MAP = {_XSD_NS + local: lpg_type for local, lpg_type in _XSD_LOCAL_TYPES}

# Literal types that live outside the XSD namespace but are still datatypes.
XSD_TYPE_MAP.update({
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#langString": "string",
    "http://www.w3.org/2000/01/rdf-schema#Literal": "string",
})

def map_xsd_type(xsd_uri: str) -> str:
    """Translate a datatype URI into its LPG/Spark type name.

    Falls back to ``"string"`` when the URI is None or is not listed in
    XSD_TYPE_MAP.
    """
    return "string" if xsd_uri is None else XSD_TYPE_MAP.get(xsd_uri, "string")

def is_datatype(range_uri: str) -> bool:
    """Tell whether a range URI denotes a literal datatype rather than a class.

    True for any URI in the XSD namespace and for the extra literal types
    listed in XSD_TYPE_MAP; False for None and everything else.
    """
    if range_uri is None:
        return False
    if range_uri.startswith(XSD_PREFIX):
        return True
    return range_uri in XSD_TYPE_MAP

# Spark UDF wrappers for the datatype helpers: map_xsd_type yields a string
# column, is_datatype a boolean column.
map_xsd_type_udf = F.udf(map_xsd_type, StringType())
is_datatype_udf = F.udf(is_datatype, BooleanType())

In [None]:
# Read the input triples table and report how many rows it holds
df_triples = spark.table(INPUT_TABLE)
triple_count = df_triples.count()
print(f"Loaded {triple_count} triples from '{INPUT_TABLE}'")

In [None]:
# Find all properties (subjects typed as owl:ObjectProperty, owl:DatatypeProperty, etc.)
property_types = [OWL_OBJECT_PROPERTY, OWL_DATATYPE_PROPERTY, OWL_ANNOTATION_PROPERTY, RDF_PROPERTY]

# One row per (property, declared type, graph) typing statement.
df_properties_raw = df_triples.filter(
    (F.col("predicate") == RDF_TYPE) &
    (F.col("object").isin(property_types))
).select(
    F.col("subject").alias("property_uri"),
    F.col("object").alias("property_type"),
    F.col("graph")
)

# Deduplicate: same property may be defined in multiple source files
# Priority: OWL types > RDFS types (more specific wins)
# Aggregate graphs for provenance
df_properties_ranked = df_properties_raw.withColumn(
    "type_priority",
    F.when(F.col("property_type") == OWL_OBJECT_PROPERTY, 1)
     .when(F.col("property_type") == OWL_DATATYPE_PROPERTY, 1)
     .when(F.col("property_type") == OWL_ANNOTATION_PROPERTY, 2)
     .when(F.col("property_type") == RDF_PROPERTY, 3)
     .otherwise(4)
)

# Group by property_uri, keep best type, aggregate graphs.
# ObjectProperty and DatatypeProperty share priority 1, and the same typing can
# appear in several graphs. Ordering by priority alone leaves row_number() free
# to pick any of the tied rows, so the chosen type could differ between runs.
# The extra sort keys make the choice stable (arbitrary, but reproducible).
window = Window.partitionBy("property_uri").orderBy("type_priority", "property_type", "graph")
df_properties = df_properties_ranked \
    .withColumn("rank", F.row_number().over(window)) \
    .filter(F.col("rank") == 1) \
    .drop("rank", "type_priority")

# Get all source graphs for each property (for provenance)
df_property_graphs = df_properties_raw.groupBy("property_uri").agg(
    F.collect_set("graph").alias("source_graphs")
)
df_properties = df_properties.join(df_property_graphs, "property_uri", "left")

print(f"Found {df_properties.count()} unique properties")
print(f"(from {df_properties_raw.count()} raw type declarations across files)")
df_properties.groupBy("property_type").count().show()

In [None]:
# Collect every rdfs:domain statement as (property, domain) pairs; a property
# may declare several domains.
domain_triples = df_triples.where(F.col("predicate") == RDFS_DOMAIN)
df_domains = domain_triples.select(
    F.col("subject").alias("property_uri"),
    F.col("object").alias("domain_uri"),
)

# Blank-node domains are OWL restrictions rather than named classes: count
# them for the log, then drop them.
is_blank_domain = F.col("domain_uri").startswith("_:")
blank_domain_count = df_domains.where(is_blank_domain).count()
df_domains = df_domains.where(~is_blank_domain)
print(f"Filtered out {blank_domain_count} blank node domain references")

# One row per property holding the set of its remaining domain URIs
df_domains_agg = df_domains.groupBy("property_uri").agg(
    F.collect_set("domain_uri").alias("domain_uris"),
)

domain_declaration_count = df_domains.count()
print(f"Found {domain_declaration_count} domain declarations (after filtering)")

In [None]:
# Collect every rdfs:range statement as (property, range) pairs; a property
# may declare several ranges.
range_triples = df_triples.where(F.col("predicate") == RDFS_RANGE)
df_ranges = range_triples.select(
    F.col("subject").alias("property_uri"),
    F.col("object").alias("range_uri"),
)

# Blank-node ranges are OWL restrictions rather than named classes: count
# them for the log, then drop them.
is_blank_range = F.col("range_uri").startswith("_:")
blank_range_count = df_ranges.where(is_blank_range).count()
df_ranges = df_ranges.where(~is_blank_range)
print(f"Filtered out {blank_range_count} blank node range references")

# One row per property holding the set of its remaining range URIs
df_ranges_agg = df_ranges.groupBy("property_uri").agg(
    F.collect_set("range_uri").alias("range_uris"),
)

range_declaration_count = df_ranges.count()
print(f"Found {range_declaration_count} range declarations (after filtering)")

In [None]:
# Get rdfs:label for each property (prefer configured language)
# rdfs:label and skos:prefLabel are both treated as label sources.
df_labels = df_triples.filter(
    (F.col("predicate") == RDFS_LABEL) | (F.col("predicate") == SKOS_PREFLABEL)
).select(
    F.col("subject").alias("property_uri"),
    F.col("object").alias("label"),
    F.col("lang")
)

# Rank labels by language preference (configured via PREFERRED_LANGUAGE)
df_labels = df_labels.withColumn(
    "lang_priority",
    F.when(F.lower(F.col("lang")) == PREFERRED_LANGUAGE.lower(), 1)  # Preferred language
     .when(F.col("lang").isNull(), 2)   # No language tag
     .when(F.col("lang") == "", 2)      # Empty string treated as no tag
     .otherwise(3)                       # Any other language
)

# Several labels can share a priority (e.g. rdfs:label and skos:prefLabel in
# the preferred language, or labels in two fallback languages). Ordering by
# priority alone lets row_number() pick any of them, so the chosen label could
# change between runs. The extra sort keys make the pick reproducible.
window = Window.partitionBy("property_uri").orderBy("lang_priority", "lang", "label")
df_best_labels = df_labels.withColumn("rank", F.row_number().over(window)) \
    .filter(F.col("rank") == 1) \
    .select("property_uri", "label", "lang", "lang_priority")

print(f"Language preference: '{PREFERRED_LANGUAGE}'")
print(f"Found {df_best_labels.count()} properties with labels")

# Diagnostic: Show which properties fall back to non-preferred language
fallback_labels = df_best_labels.filter(F.col("lang_priority") == 3)
fallback_count = fallback_labels.count()
if fallback_count > 0:
    print(f"\n⚠ {fallback_count} properties have labels in non-preferred language (fallback):")
    fallback_labels.select("property_uri", "label", "lang").show(20, truncate=60)

# Drop extra columns for join
df_best_labels = df_best_labels.select("property_uri", "label")

In [None]:
# Get rdfs:comment / skos:definition for descriptions
df_comments = df_triples.filter(
    (F.col("predicate") == RDFS_COMMENT) | (F.col("predicate") == SKOS_DEFINITION)
).select(
    F.col("subject").alias("property_uri"),
    F.col("object").alias("description"),
    F.col("lang").alias("desc_lang")
)

# Rank descriptions by language preference, using the same tiers as the label
# ranking: preferred language, then untagged, then any other language. An
# empty-string tag counts as "no tag" here as well; without that case an
# untagged description stored with lang == "" would be ranked as a foreign
# language.
df_comments = df_comments.withColumn(
    "lang_priority",
    F.when(F.lower(F.col("desc_lang")) == PREFERRED_LANGUAGE.lower(), 1)  # Preferred language
     .when(F.col("desc_lang").isNull(), 2)  # No language tag
     .when(F.col("desc_lang") == "", 2)     # Empty string treated as no tag
     .otherwise(3)                           # Any other language
)

# Extra sort keys break ties inside a priority tier so that row_number()
# selects the same description on every run.
window = Window.partitionBy("property_uri").orderBy("lang_priority", "desc_lang", "description")
df_best_descriptions = df_comments.withColumn("rank", F.row_number().over(window)) \
    .filter(F.col("rank") == 1) \
    .select("property_uri", "description")

print(f"Found {df_best_descriptions.count()} properties with descriptions")

In [None]:
# Get owl:inverseOf relationships
df_inverse_raw = df_triples.filter(
    F.col("predicate") == OWL_INVERSE_OF
).select(
    F.col("subject").alias("property_uri"),
    F.col("object").alias("inverse_of_uri")
)

print(f"Found {df_inverse_raw.count()} inverse property declarations")

# Reduce to exactly one row per property. The same owl:inverseOf statement can
# occur in several graphs, and a property may declare more than one inverse.
# This frame is left-joined onto the property list, so any repeated
# property_uri would multiply that property's row. min() keeps a single,
# reproducible inverse URI.
df_inverse = df_inverse_raw.groupBy("property_uri").agg(
    F.min("inverse_of_uri").alias("inverse_of_uri")
)

In [None]:
# Check for functional properties
# distinct() collapses repeated typing statements (the same triple present in
# several graphs). This frame is left-joined onto the property list, so a
# repeated property_uri would multiply that property's row.
df_functional = df_triples.filter(
    (F.col("predicate") == RDF_TYPE) &
    (F.col("object") == OWL_FUNCTIONAL_PROPERTY)
).select(
    F.col("subject").alias("property_uri"),
    F.lit(True).alias("is_functional")
).distinct()

print(f"Found {df_functional.count()} functional properties")

In [None]:
# Attach every per-property lookup frame to the property list. All joins are
# left joins on property_uri, applied in this order.
df_props = df_properties
for lookup_frame in (
    df_domains_agg,
    df_ranges_agg,
    df_best_labels,
    df_best_descriptions,
    df_inverse,
    df_functional,
):
    df_props = df_props.join(lookup_frame, "property_uri", "left")

# Derive names: local part of the URI, its sanitized identifier form, and a
# display name that prefers the label over the local name.
df_props = df_props.withColumn("local_name", extract_local_name_udf(F.col("property_uri")))
df_props = df_props.withColumn("property_name", sanitize_name_udf(F.col("local_name")))
df_props = df_props.withColumn("display_name", F.coalesce(F.col("label"), F.col("local_name")))

combined_count = df_props.count()
print(f"Combined {combined_count} properties")

In [None]:
# Determine mapping type: node_property vs edge
# - ObjectProperty → edge
# - DatatypeProperty → node_property
# - AnnotationProperty → node_property (usually labels/comments)
# - rdf:Property with datatype range → node_property
# - rdf:Property with class range → edge

def determine_mapping_type(property_type: str, range_uris: list) -> str:
    """Classify a property as ``"edge"``, ``"node_property"`` or ``"unknown"``.

    owl:ObjectProperty maps to an edge; owl:DatatypeProperty and
    owl:AnnotationProperty map to a node property. A plain rdf:Property is
    decided by its ranges: any datatype range makes it a node property, ranges
    without a datatype make it an edge, and no ranges leave it unknown. Every
    other property type is unknown.
    """
    if property_type == OWL_OBJECT_PROPERTY:
        return "edge"
    if property_type in (OWL_DATATYPE_PROPERTY, OWL_ANNOTATION_PROPERTY):
        return "node_property"
    # Only rdf:Property with at least one range can still be classified.
    if property_type != RDF_PROPERTY or not range_uris:
        return "unknown"
    has_datatype_range = any(is_datatype(uri) for uri in range_uris)
    return "node_property" if has_datatype_range else "edge"

# Spark UDF wrapper around determine_mapping_type; produces a string column.
determine_mapping_type_udf = F.udf(determine_mapping_type, StringType())

# Add the mapping_type column, computed per row from the declared property
# type and the aggregated range URIs.
df_props = df_props.withColumn(
    "mapping_type",
    determine_mapping_type_udf(F.col("property_type"), F.col("range_uris"))
)

In [None]:
# For node properties: extract data type from range
def extract_data_type(range_uris: list) -> str:
    """Pick the LPG/Spark data type implied by a property's range URIs.

    The ranges are scanned in order. The first URI found in XSD_TYPE_MAP
    decides the type; an XSD-namespace URI missing from the map ends the scan
    with ``"string"``. ``"string"`` is also the result when there are no
    ranges or none of them is a datatype.
    """
    for uri in range_uris or ():
        mapped = XSD_TYPE_MAP.get(uri)
        if mapped is not None:
            return mapped
        if uri.startswith(XSD_PREFIX):
            return "string"  # Unknown XSD type, use string
    return "string"

# Spark UDF wrapper around extract_data_type; produces a string column.
extract_data_type_udf = F.udf(extract_data_type, StringType())

# For edges: extract target node types from range
def extract_target_types(range_uris: list) -> list:
    """Extract target node type names from range URIs (non-datatype ranges).

    Datatype ranges are skipped. For every other range the local name of the
    URI is sanitized and its first character upper-cased, which is the same
    PascalCase rule extract_source_types applies to domains, so one class gets
    one name whether it appears as a source or as a target.

    Returns None when there are no ranges or no range yields a name.
    """
    if not range_uris:
        return None
    targets = []
    for r in range_uris:
        if is_datatype(r):
            continue
        local = extract_local_name(r)
        if not local:
            continue
        name = sanitize_name(local)
        if name:
            # Upper-case only the first character. str.title() would also
            # lower-case interior capitals ("foodProduct" -> "Foodproduct") and
            # capitalize after digits, diverging from the source type names.
            targets.append(name[0].upper() + name[1:])
    return targets or None

# Spark UDF wrapper around extract_target_types; produces an array-of-strings column.
extract_target_types_udf = F.udf(extract_target_types, ArrayType(StringType()))

# Extract source node types from domain
def extract_source_types(domain_uris: list) -> list:
    """Derive PascalCase source node type names from a property's domain URIs.

    Each domain's local name is sanitized and its first character upper-cased.
    Domains with an empty local name are skipped. Returns None when there are
    no domains or none produced a name.
    """
    if not domain_uris:
        return None
    pascal_names = []
    for uri in domain_uris:
        local = extract_local_name(uri)
        if not local:
            continue
        camel = sanitize_name(local)
        # PascalCase for node types: only the leading character changes case.
        pascal_names.append(camel[:1].upper() + camel[1:] if camel else None)
    return pascal_names or None

# Spark UDF wrapper around extract_source_types; produces an array-of-strings column.
extract_source_types_udf = F.udf(extract_source_types, ArrayType(StringType()))

# Add the derived type columns. data_type and target_types are both computed
# from range_uris for every row regardless of mapping_type; source_types is
# computed from domain_uris.
df_props = df_props \
    .withColumn("data_type", extract_data_type_udf(F.col("range_uris"))) \
    .withColumn("target_types", extract_target_types_udf(F.col("range_uris"))) \
    .withColumn("source_types", extract_source_types_udf(F.col("domain_uris")))

In [None]:
# Final projection for the output table, in column order
output_columns = [
    "property_uri",
    "property_name",
    "display_name",
    "description",
    "mapping_type",
    "property_type",
    "source_types",      # Domain node types
    "data_type",         # For node_property
    "target_types",      # For edge
    "domain_uris",
    "range_uris",
    "inverse_of_uri",
    # Properties never typed as functional have a null flag after the left join
    F.coalesce(F.col("is_functional"), F.lit(False)).alias("is_functional"),
    "source_graphs",     # All source files where this property was defined (provenance)
]
df_output = df_props.select(*output_columns)

unique_property_count = df_output.count()
print(f"\nTotal unique properties: {unique_property_count}")

In [None]:
# Count properties per mapping type, most frequent first
print("\nProperty Mapping Distribution:")
mapping_counts = df_output.groupBy("mapping_type").count()
mapping_counts.orderBy(F.desc("count")).show()

In [None]:
# List the properties mapped to node properties, sorted by name
banner = "=" * 60
print("\n" + banner)
print("NODE PROPERTIES (DatatypeProperties)")
print(banner)

node_property_rows = df_output.where(F.col("mapping_type") == "node_property")
node_property_view = node_property_rows.select(
    "property_name", "display_name", "data_type", "source_types"
)
node_property_view.orderBy("property_name").show(30, truncate=60)

In [None]:
# List the properties mapped to edges, sorted by name
banner = "=" * 60
print("\n" + banner)
print("EDGE TYPES (ObjectProperties)")
print(banner)

edge_rows = df_output.where(F.col("mapping_type") == "edge")
edge_view = edge_rows.select(
    "property_name", "display_name", "source_types", "target_types"
)
edge_view.orderBy("property_name").show(30, truncate=60)

In [None]:
# List properties for which no source node type could be derived
print("\nProperties without domain (generic):")
domainless_rows = df_output.where(F.col("source_types").isNull())
domainless_rows.select(
    "property_name", "mapping_type", "data_type", "target_types"
).show(20, truncate=60)

In [None]:
# Validation report: name collisions and unclassified properties
banner = "=" * 60
print("\n" + banner)
print("VALIDATION")
print(banner)

# Distinct URIs can sanitize to the same property_name; list such collisions.
name_usage = df_output.groupBy("property_name").agg(
    F.count("*").alias("count"),
    F.collect_list("property_uri").alias("uris"),
)
df_duplicates = name_usage.where(F.col("count") > 1)

dup_count = df_duplicates.count()
if dup_count == 0:
    print("[OK] No duplicate property names")
else:
    print(f"WARNING: {dup_count} duplicate property names:")
    df_duplicates.show(10, truncate=80)

# Properties whose mapping type could not be determined
unknown_rows = df_output.where(F.col("mapping_type") == "unknown")
unknown_count = unknown_rows.count()
if unknown_count == 0:
    print("[OK] All properties have determined mapping type")
else:
    print(f"INFO: {unknown_count} properties with unknown mapping type (need manual review)")

In [None]:
# Summary statistics
print("\n" + "=" * 60)
print("SUMMARY")
print("=" * 60)

total_props = df_output.count()
node_props = df_output.filter(F.col("mapping_type") == "node_property").count()
edge_types = df_output.filter(F.col("mapping_type") == "edge").count()
with_domain = df_output.filter(F.col("source_types").isNotNull()).count()
# A property has a range when at least one non-blank rdfs:range was collected
# for it. Testing data_type != "string" instead would miss every property
# whose declared range maps to "string" (e.g. xsd:string), because "string" is
# also the default for properties with no range at all.
with_range = df_output.filter(F.col("range_uris").isNotNull()).count()
# is_functional is coalesced to False in df_output, so it is never null here.
functional = df_output.filter(F.col("is_functional")).count()

# Percentages use integer division and are guarded against an empty table.
print(f"\nTotal properties: {total_props}")
print(f"  - Node properties: {node_props}")
print(f"  - Edge types: {edge_types}")
print(f"\nWith domain (source types): {with_domain} ({100*with_domain//total_props if total_props else 0}%)")
print(f"With range (data/target type): {with_range} ({100*with_range//total_props if total_props else 0}%)")
print(f"Functional properties: {functional}")

# Diagnostic: Properties without domain (R15 - will be skipped in ontology creation)
df_no_domain = df_output.filter(F.col("source_types").isNull())
no_domain_count = df_no_domain.count()
if no_domain_count > 0:
    print("\n" + "=" * 60)
    print("SKIPPED PROPERTIES REPORT (R15)")
    print("=" * 60)
    print(f"\n⚠️ {no_domain_count} properties have no rdfs:domain and will be SKIPPED:")
    print("   These properties exist in the ontology but cannot be assigned to node types.")
    print("   Future: Use F7.10 UI to manually assign domains, or enable Phase 2 instance inference.\n")
    df_no_domain.select(
        "property_name",
        "display_name",
        "mapping_type",
        "target_types",
        "property_uri"
    ).orderBy("property_name").show(50, truncate=60)

In [None]:
# Persist the result as a Delta table, replacing any existing data and schema
delta_writer = df_output.write.format("delta").mode("overwrite")
delta_writer = delta_writer.option("overwriteSchema", "true")
delta_writer.saveAsTable(OUTPUT_TABLE)

# total_props comes from the summary cell, which must have run before this one.
print(f"\nSaved {total_props} properties to '{OUTPUT_TABLE}'")

In [None]:
# Build a JSON-friendly record for every property (collected to the driver)
# and print the first few as a sample.
properties_json = []
for row in df_output.collect():
    is_node_property = row.mapping_type == "node_property"
    properties_json.append({
        "property_name": row.property_name,
        "display_name": row.display_name,
        "description": row.description,
        "mapping_type": row.mapping_type,
        "source_types": row.source_types or [],
        # data_type is only reported for node properties
        "data_type": row.data_type if is_node_property else None,
        "target_types": row.target_types or [],
        "is_functional": row.is_functional,
        "property_uri": row.property_uri,
    })

print("\nJSON output sample (first 3):")
sample_records = properties_json[:3]
print(json.dumps(sample_records, indent=2))