# 03 - Class to Node Type Mapping

Translates OWL/RDFS classes to Fabric Graph node types.

**Input**: `bronze_triples` Delta table
**Output**: `silver_node_types` Delta table

## Translation Rules

| RDF Construct | LPG Equivalent |
|---------------|----------------|
| owl:Class / rdfs:Class | Node Type (label) |
| rdfs:subClassOf | Node Type hierarchy |
| rdfs:label | Display name |

## Node Type Name Generation

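- Extract the local name from the IRI (the part after the last `#`, or after the last `/` when there is no `#`)
- Treat non-alphanumeric characters as word separators and drop them
- Convert to PascalCase for consistency
- For blank nodes (`_:label`), use the blank-node label as the local name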

In [None]:
# Configuration: names of the Delta tables this notebook reads and writes.
INPUT_TABLE = "bronze_triples"  # source triples, loaded via spark.table()
OUTPUT_TABLE = "silver_node_types"  # destination of the node type mapping

In [None]:
from pyspark.sql import functions as F
from pyspark.sql.types import StringType, ArrayType, StructType, StructField
from typing import List, Dict, Optional, Set
import re
import json

# RDF vocabulary IRIs.
# Each IRI is assembled from its namespace base so that every namespace
# string is spelled out exactly once.
_RDF_NS = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
_RDFS_NS = "http://www.w3.org/2000/01/rdf-schema#"
_OWL_NS = "http://www.w3.org/2002/07/owl#"
_SKOS_NS = "http://www.w3.org/2004/02/skos/core#"

# rdf: terms
RDF_TYPE = _RDF_NS + "type"

# rdfs: terms
RDFS_CLASS = _RDFS_NS + "Class"
RDFS_SUBCLASS_OF = _RDFS_NS + "subClassOf"
RDFS_LABEL = _RDFS_NS + "label"
RDFS_COMMENT = _RDFS_NS + "comment"

# owl: terms
OWL_CLASS = _OWL_NS + "Class"
OWL_THING = _OWL_NS + "Thing"

# skos: terms
SKOS_PREFLABEL = _SKOS_NS + "prefLabel"
SKOS_DEFINITION = _SKOS_NS + "definition"

# Namespace IRI -> display prefix, consulted by shorten_uri() in insertion order
PREFIXES = {
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf:",
    "http://www.w3.org/2000/01/rdf-schema#": "rdfs:",
    "http://www.w3.org/2002/07/owl#": "owl:",
    "http://www.w3.org/2004/02/skos/core#": "skos:",
    "http://www.w3.org/ns/shacl#": "sh:",
    "http://www.w3.org/2001/XMLSchema#": "xsd:",
}

def shorten_uri(uri: str) -> str:
    """
    Return the prefixed (CURIE-like) form of a URI for display.

    The first namespace in PREFIXES that the URI starts with is replaced by
    its prefix. URIs outside every known namespace are returned unchanged,
    and None is passed through as None.
    """
    if uri is None:
        return None

    # First matching (namespace, prefix) pair, or None when nothing matches
    hit = next(
        ((namespace, short) for namespace, short in PREFIXES.items()
         if uri.startswith(namespace)),
        None,
    )
    if hit is None:
        return uri

    namespace, short = hit
    return short + uri[len(namespace):]

In [None]:
def extract_local_name(uri: Optional[str]) -> Optional[str]:
    """
    Extract the local name from a URI.

    Args:
        uri: A full IRI, a blank node identifier (``_:label``), or None.

    Returns:
        The text after the last ``#`` if the URI has one, otherwise the text
        after the last ``/``; the URI itself if it has neither. For blank
        nodes the label after ``_:`` is returned. None is passed through.

    Trailing ``#`` / ``/`` separators are ignored, so a namespace-style IRI
    yields its last real segment instead of an empty string.

    Examples:
        http://example.org/ontology#PhysicalObject -> PhysicalObject
        http://example.org/ontology/Building -> Building
        http://example.org/ontology/Building/ -> Building
        _:blank123 -> blank123
    """
    if uri is None:
        return None

    # Blank nodes: the label after "_:" is the local name
    if uri.startswith("_:"):
        return uri[2:]

    # Drop trailing separators first; otherwise "…/Building/" and "…/ontology#"
    # would produce an empty local name.
    trimmed = uri.rstrip("#/")

    # A hash fragment takes precedence over the last path segment
    for separator in ("#", "/"):
        if separator in trimmed:
            return trimmed.rsplit(separator, 1)[1]

    return trimmed


def sanitize_node_type_name(name: Optional[str]) -> Optional[str]:
    """
    Convert a name to a valid node type identifier.

    - Non-alphanumeric (ASCII) characters are treated as word separators
      and dropped
    - The first character of every word is upper-cased; the rest of the word
      is left untouched, so existing camel/Pascal casing is preserved
      ("PhysicalObject" stays "PhysicalObject")
    - A result starting with a digit is prefixed with "Node"

    Args:
        name: Raw name (typically a URI local name), or None.

    Returns:
        The PascalCase identifier, "UnknownType" when no alphanumeric
        characters remain, or None when ``name`` is None.
    """
    if name is None:
        return None

    # Replace non-alphanumeric with space, then split into words
    words = re.sub(r'[^a-zA-Z0-9]', ' ', name).split()

    # Upper-case only the leading character. str.capitalize() would also
    # lower-case the remainder and flatten "PhysicalObject" to "Physicalobject".
    pascal_case = ''.join(word[0].upper() + word[1:] for word in words)

    # Ensure starts with letter
    if pascal_case and not pascal_case[0].isalpha():
        pascal_case = "Node" + pascal_case

    return pascal_case or "UnknownType"


# Wrap the two name helpers as Spark UDFs; both return a string column
extract_local_name_udf = F.udf(f=extract_local_name, returnType=StringType())
sanitize_name_udf = F.udf(f=sanitize_node_type_name, returnType=StringType())

In [None]:
# Read the input triples table and report how many rows it contains
df_triples = spark.table(INPUT_TABLE)
triple_count = df_triples.count()
print(f"Loaded {triple_count} triples from '{INPUT_TABLE}'")

In [None]:
# IRIs used to discover classes that are only implied by property axioms
RDFS_DOMAIN = "http://www.w3.org/2000/01/rdf-schema#domain"
RDFS_RANGE = "http://www.w3.org/2000/01/rdf-schema#range"
OWL_OBJECT_PROPERTY = "http://www.w3.org/2002/07/owl#ObjectProperty"

# Reusable column expressions
is_type_triple = F.col("predicate") == RDF_TYPE
class_uri_col = F.col("class_uri")

# ---- Step 1: EXPLICIT classes -------------------------------------------
# Subjects declared with rdf:type owl:Class or rdf:type rdfs:Class.
df_explicit_classes = (
    df_triples
    .filter(is_type_triple & F.col("object").isin([OWL_CLASS, RDFS_CLASS]))
    .select(F.col("subject").alias("class_uri"), F.col("graph"))
    .distinct()
)

print(f"Found {df_explicit_classes.count()} explicit classes (owl:Class or rdfs:Class)")

# ---- Step 2: IMPLIED classes from object property ranges -----------------
# Every subject typed owl:ObjectProperty; used to restrict the range and
# domain triples below to object properties only.
df_object_properties = (
    df_triples
    .filter(is_type_triple & (F.col("object") == OWL_OBJECT_PROPERTY))
    .select(F.col("subject").alias("property_uri"))
    .distinct()
)

# Join condition shared by the range and domain lookups
subject_is_object_property = (
    df_triples["subject"] == df_object_properties["property_uri"]
)

df_range_candidates = (
    df_triples
    .filter(F.col("predicate") == RDFS_RANGE)
    .join(df_object_properties, subject_is_object_property, "inner")
    .select(F.col("object").alias("class_uri"), F.col("graph"))
    .distinct()
)

# Discard ranges that point into the XSD or RDF namespaces (datatypes etc.)
# as well as null objects.
outside_xsd = ~class_uri_col.startswith("http://www.w3.org/2001/XMLSchema#")
outside_rdf = ~class_uri_col.startswith("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
df_range_classes = df_range_candidates.filter(
    outside_xsd & outside_rdf & class_uri_col.isNotNull()
)

print(f"Found {df_range_classes.count()} classes from object property ranges")

# ---- Step 3: IMPLIED classes from object property domains ----------------
df_domain_classes = (
    df_triples
    .filter(F.col("predicate") == RDFS_DOMAIN)
    .join(df_object_properties, subject_is_object_property, "inner")
    .select(F.col("object").alias("class_uri"), F.col("graph"))
    .distinct()
)

print(f"Found {df_domain_classes.count()} classes from object property domains")

# ---- Step 4: union of all three sources ----------------------------------
# class_type is kept for compatibility; it is the constant owl:Class IRI on
# every row, regardless of how the class was discovered.
df_classes = (
    df_explicit_classes
    .union(df_range_classes)
    .union(df_domain_classes)
    .distinct()
    .withColumn("class_type", F.lit(OWL_CLASS))
)

print(f"\n=== Total unique classes: {df_classes.count()} ===")
df_classes.show(10, truncate=80)

In [None]:
# Collect (class, parent) pairs from rdfs:subClassOf triples
is_subclass_triple = F.col("predicate") == RDFS_SUBCLASS_OF
df_subclass = df_triples.where(is_subclass_triple).select(
    F.col("subject").alias("class_uri"),
    F.col("object").alias("parent_class_uri"),
)

# One row per class, carrying the de-duplicated set of its direct parents
parent_set = F.collect_set("parent_class_uri").alias("parent_class_uris")
df_parents = df_subclass.groupBy("class_uri").agg(parent_set)

print(f"Found {df_subclass.count()} subClassOf relationships")

In [None]:
# Pick one label per subject from rdfs:label / skos:prefLabel triples,
# preferring English, then untagged, then any other language.
from pyspark.sql.window import Window

df_labels = df_triples.filter(
    F.col("predicate").isin([RDFS_LABEL, SKOS_PREFLABEL])
).select(
    F.col("subject").alias("class_uri"),
    F.col("object").alias("label"),
    F.col("lang"),
    F.col("predicate").alias("label_predicate")
)

# Rank labels: 1 = English, 2 = no language tag (null or empty), 3 = anything else
df_labels = df_labels.withColumn(
    "lang_priority",
    F.when(F.col("lang") == "en", 1)
     .when(F.col("lang").isNull() | (F.col("lang") == ""), 2)
     .otherwise(3)
)

# row_number() over a window ordered by lang_priority alone picks an arbitrary
# row among ties (e.g. an English rdfs:label and an English skos:prefLabel),
# so the result could differ between runs. The extra sort keys are only
# tie-breakers that make the choice reproducible; with the IRIs used here
# rdfs:label sorts before skos:prefLabel.
window = Window.partitionBy("class_uri").orderBy(
    "lang_priority", "lang", "label_predicate", "label"
)
df_best_labels = df_labels.withColumn("rank", F.row_number().over(window)) \
    .filter(F.col("rank") == 1) \
    .select("class_uri", "label", "lang")

print(f"Found {df_best_labels.count()} classes with labels")

In [None]:
# Pick one description per subject from rdfs:comment / skos:definition triples
df_comments = df_triples.filter(
    F.col("predicate").isin([RDFS_COMMENT, SKOS_DEFINITION])
).select(
    F.col("subject").alias("class_uri"),
    F.col("object").alias("description"),
    F.col("lang").alias("desc_lang")
)

# Rank descriptions: 1 = English, 2 = no language tag, 3 = anything else.
# An empty-string tag counts as "no language tag", matching how labels are
# ranked earlier in this notebook.
df_comments = df_comments.withColumn(
    "lang_priority",
    F.when(F.col("desc_lang") == "en", 1)
     .when(F.col("desc_lang").isNull() | (F.col("desc_lang") == ""), 2)
     .otherwise(3)
)

# desc_lang and description are tie-breakers only: without them row_number()
# picks an arbitrary row among equal priorities and the chosen description
# could change from run to run.
window = Window.partitionBy("class_uri").orderBy(
    "lang_priority", "desc_lang", "description"
)
df_best_descriptions = df_comments.withColumn("rank", F.row_number().over(window)) \
    .filter(F.col("rank") == 1) \
    .select("class_uri", "description")

print(f"Found {df_best_descriptions.count()} classes with descriptions")

In [None]:
# Map a list of parent class URIs to their node type names
def extract_parent_types(parent_uris):
    """Return the sanitized node type name of every non-empty parent URI.

    None (class without parents) is passed through as None.
    """
    if parent_uris is None:
        return None
    return [
        sanitize_node_type_name(extract_local_name(parent_uri))
        for parent_uri in filter(None, parent_uris)
    ]

extract_parent_types_udf = F.udf(extract_parent_types, ArrayType(StringType()))

# Left-join parents, best label and best description onto the class list so
# that classes lacking any of them are still kept
df_node_types = df_classes
for df_extra in (df_parents, df_best_labels, df_best_descriptions):
    df_node_types = df_node_types.join(df_extra, "class_uri", "left")

# Derive the naming columns:
#   local_name    - last segment of the class URI
#   node_type     - sanitized identifier built from local_name
#   display_name  - label when present, otherwise local_name
#   is_blank_node - True when the class URI starts with "_:"
df_node_types = (
    df_node_types
    .withColumn("local_name", extract_local_name_udf(F.col("class_uri")))
    .withColumn("node_type", sanitize_name_udf(F.col("local_name")))
    .withColumn("display_name", F.coalesce(F.col("label"), F.col("local_name")))
    .withColumn("is_blank_node", F.col("class_uri").startswith("_:"))
    .withColumn("parent_types", extract_parent_types_udf(F.col("parent_class_uris")))
)

print(f"Generated {df_node_types.count()} node type mappings")

In [None]:
# Columns written to the output table, in order
output_columns = [
    "class_uri",
    "node_type",
    "display_name",
    "description",
    "parent_class_uris",
    "parent_types",
    "class_type",
    "is_blank_node",
    "graph",
]
df_output = df_node_types.select(*output_columns)

# Preview the mapping, sorted by node type name
print("\nNode Type Mapping Preview:")
df_preview = df_output.select("node_type", "display_name", "parent_types", "class_uri")
df_preview.orderBy("node_type").show(20, truncate=60)

In [None]:
# Preview only the classes that have at least one parent type
print("\nClass Hierarchy (classes with parents):")
has_parents = F.size(F.col("parent_types")) > 0
df_hierarchy = df_output.where(has_parents).select("node_type", "parent_types")
df_hierarchy.orderBy("node_type").show(30, truncate=80)

In [None]:
# Validate node type names (check for name collisions and blank node classes)
print("\nValidation:")

# A collision is the same node type name produced by DIFFERENT class URIs.
# df_output can hold several rows for one class_uri (one per graph), so the
# check counts distinct URIs; counting rows would report a class that merely
# appears in two graphs as a duplicate name.
df_duplicates = df_output.groupBy("node_type").agg(
    F.countDistinct("class_uri").alias("count"),
    F.collect_set("class_uri").alias("uris")
).filter(F.col("count") > 1)

dup_count = df_duplicates.count()
if dup_count > 0:
    print(f"WARNING: {dup_count} duplicate node type names found:")
    df_duplicates.show(10, truncate=80)
else:
    print("[OK] No duplicate node type names")

# Check for blank nodes (is_blank_node is already a boolean column)
blank_count = df_output.filter(F.col("is_blank_node")).count()
if blank_count > 0:
    print(f"INFO: {blank_count} blank node classes found (stable IDs generated)")
else:
    print("[OK] No blank node classes")

In [None]:
# Summary statistics: class sources and metadata coverage of the output
def _coverage_pct(part, total):
    """Whole-number percentage of `part` in `total`; 0 when `total` is 0."""
    return 100 * part // total if total else 0

print("\n" + "=" * 60)
print("CLASS DISCOVERY SUMMARY")
print("=" * 60)

total_types = df_output.count()
with_parents = df_output.filter(F.size(F.col("parent_types")) > 0).count()
# display_name falls back to the URI local name, so it is non-null for
# practically every row and says nothing about label coverage. Count the
# joined `label` column on df_node_types instead (same rows as df_output).
with_labels = df_node_types.filter(F.col("label").isNotNull()).count()
with_descriptions = df_output.filter(F.col("description").isNotNull()).count()

print(f"\nTotal node types discovered: {total_types}")
print("\nClass Sources:")
print(f"  - Explicit declarations (owl:Class/rdfs:Class): {df_explicit_classes.count()}")
print(f"  - From object property ranges: {df_range_classes.count()}")
print(f"  - From object property domains: {df_domain_classes.count()}")

print("\nMetadata Coverage:")
print(f"  - With class hierarchy: {with_parents} ({_coverage_pct(with_parents, total_types)}%)")
print(f"  - With explicit labels: {with_labels} ({_coverage_pct(with_labels, total_types)}%)")
print(f"  - With descriptions: {with_descriptions} ({_coverage_pct(with_descriptions, total_types)}%)")

print("\n" + "=" * 60)
print("DATA QUALITY NOTE")
print("=" * 60)
print("""
Classes are discovered from multiple sources to ensure completeness:
1. Explicit declarations - classes typed as owl:Class or rdfs:Class
2. Property ranges - classes referenced as targets of object properties
3. Property domains - classes referenced as sources of object properties

This ensures that when relationships are created in the Ontology, 
both source and target entity types exist.
""")

In [None]:
# Persist the mapping as a Delta table, replacing any previous contents
# and schema
delta_writer = (
    df_output.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
)
delta_writer.saveAsTable(OUTPUT_TABLE)

print(f"\nSaved {total_types} node types to '{OUTPUT_TABLE}'")

In [None]:
# Build a JSON-serializable list of node types for frontend/API consumption
def _node_type_record(row):
    """Turn one output row into a plain dict; null parent_types becomes []."""
    return {
        "node_type": row.node_type,
        "display_name": row.display_name,
        "description": row.description,
        "parent_types": row.parent_types or [],
        "class_uri": row.class_uri,
        "is_blank_node": row.is_blank_node,
    }

# collect() brings every output row to the driver
node_types_json = [_node_type_record(row) for row in df_output.collect()]

print("\nJSON output sample (first 3):")
sample_records = node_types_json[:3]
print(json.dumps(sample_records, indent=2))