# 02 - Schema Richness Detector

Analyzes RDF data to detect schema richness level (0-4) for adaptive guidance.

**Input**: `bronze_triples` Delta table (from 01_rdf_parser_jena)
**Output**: Schema level, confidence, detected constructs, and recommendations

## Schema Levels

| Level | Name | Key Indicators |
|-------|------|----------------|
| 0 | Instance Data Only | Only rdf:type, no class/property definitions |
| 1 | SKOS Vocabulary | skos:Concept, skos:prefLabel, skos:broader |
| 2 | RDFS Schema | rdfs:Class, rdfs:subClassOf, rdfs:domain, rdfs:range |
| 3 | OWL Ontology | owl:Class, owl:ObjectProperty, owl:DatatypeProperty |
| 4 | SHACL Shapes | sh:NodeShape, sh:PropertyShape, sh:path |

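Detection is progressive: the reported level is the highest one for which at least one indicator is present, and constructs found at lower levels are still listed in the results.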

## Additional Analysis

- **owl:imports detection** - Identifies referenced ontologies that could be loaded
- **Graph classification** - Categorizes each graph as `schema`, `instance`, or `mixed`
- **Recommendations** - Suggests loading informative/normative schemas when beneficial

In [None]:
# Configuration
# Table that the loading cell reads the triples from via spark.table().
INPUT_TABLE = "bronze_triples"
# Table that the final cell overwrites with the one-row analysis summary.
OUTPUT_TABLE = "bronze_schema_analysis"  # Optional: persist results

In [None]:
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType
from dataclasses import dataclass
from typing import List, Dict, Optional
import json

# Schema construct indicators by level.
#
# Each entry maps a richness level to:
#   "name"       - label reported as the level name
#   "predicates" - IRIs matched against the distinct predicates of the data
#   "types"      - IRIs matched against the distinct objects of rdf:type
#
# Class IRIs belong in "types" only: they never occur in predicate position,
# so listing them under "predicates" would just inflate the number of expected
# indicators and deflate the confidence computed for that level.
SCHEMA_INDICATORS = {
    # Level 4: SHACL (highest priority - check first)
    4: {
        "name": "SHACL Shapes",
        "predicates": [
            "http://www.w3.org/ns/shacl#path",
            "http://www.w3.org/ns/shacl#targetClass",
            "http://www.w3.org/ns/shacl#property",
            "http://www.w3.org/ns/shacl#datatype",
            "http://www.w3.org/ns/shacl#minCount",
            "http://www.w3.org/ns/shacl#maxCount",
        ],
        "types": [
            "http://www.w3.org/ns/shacl#NodeShape",
            "http://www.w3.org/ns/shacl#PropertyShape",
        ]
    },
    # Level 3: OWL
    3: {
        "name": "OWL Ontology",
        "predicates": [
            "http://www.w3.org/2002/07/owl#equivalentClass",
            "http://www.w3.org/2002/07/owl#disjointWith",
            "http://www.w3.org/2002/07/owl#inverseOf",
            "http://www.w3.org/2002/07/owl#unionOf",
            "http://www.w3.org/2002/07/owl#intersectionOf",
            "http://www.w3.org/2002/07/owl#onProperty",
            "http://www.w3.org/2002/07/owl#someValuesFrom",
            "http://www.w3.org/2002/07/owl#allValuesFrom",
            "http://www.w3.org/2002/07/owl#cardinality",
        ],
        "types": [
            "http://www.w3.org/2002/07/owl#Class",
            "http://www.w3.org/2002/07/owl#ObjectProperty",
            "http://www.w3.org/2002/07/owl#DatatypeProperty",
            "http://www.w3.org/2002/07/owl#AnnotationProperty",
            "http://www.w3.org/2002/07/owl#Ontology",
            "http://www.w3.org/2002/07/owl#Restriction",
        ]
    },
    # Level 2: RDFS
    2: {
        "name": "RDFS Schema",
        "predicates": [
            "http://www.w3.org/2000/01/rdf-schema#subClassOf",
            "http://www.w3.org/2000/01/rdf-schema#subPropertyOf",
            "http://www.w3.org/2000/01/rdf-schema#domain",
            "http://www.w3.org/2000/01/rdf-schema#range",
            "http://www.w3.org/2000/01/rdf-schema#label",
            "http://www.w3.org/2000/01/rdf-schema#comment",
        ],
        "types": [
            "http://www.w3.org/2000/01/rdf-schema#Class",
            "http://www.w3.org/1999/02/22-rdf-syntax-ns#Property",
        ]
    },
    # Level 1: SKOS
    1: {
        "name": "SKOS Vocabulary",
        "predicates": [
            "http://www.w3.org/2004/02/skos/core#prefLabel",
            "http://www.w3.org/2004/02/skos/core#altLabel",
            "http://www.w3.org/2004/02/skos/core#definition",
            "http://www.w3.org/2004/02/skos/core#broader",
            "http://www.w3.org/2004/02/skos/core#narrower",
            "http://www.w3.org/2004/02/skos/core#related",
            "http://www.w3.org/2004/02/skos/core#inScheme",
        ],
        "types": [
            "http://www.w3.org/2004/02/skos/core#Concept",
            "http://www.w3.org/2004/02/skos/core#ConceptScheme",
            "http://www.w3.org/2004/02/skos/core#Collection",
        ]
    },
    # Level 0: Instance data only
    0: {
        "name": "Instance Data Only",
        "predicates": [
            "http://www.w3.org/1999/02/22-rdf-syntax-ns#type",
        ],
        "types": []  # Any types that aren't schema types
    }
}

# Namespace IRI -> display prefix, used to abbreviate IRIs in printed output
PREFIXES = {
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf:",
    "http://www.w3.org/2000/01/rdf-schema#": "rdfs:",
    "http://www.w3.org/2002/07/owl#": "owl:",
    "http://www.w3.org/2004/02/skos/core#": "skos:",
    "http://www.w3.org/ns/shacl#": "sh:",
    "http://www.w3.org/2001/XMLSchema#": "xsd:",
}

def shorten_uri(uri: str) -> str:
    """Return *uri* in prefixed form when it starts with a known namespace.

    The first namespace in ``PREFIXES`` that *uri* starts with is replaced by
    its prefix; a URI that matches no namespace is returned unchanged.
    """
    abbreviated = (
        prefix + uri[len(namespace):]
        for namespace, prefix in PREFIXES.items()
        if uri.startswith(namespace)
    )
    return next(abbreviated, uri)

In [None]:
@dataclass
class SchemaAnalysisResult:
    """Outcome of one schema richness analysis.

    The last three fields default to ``None`` and are replaced by a fresh
    empty container right after construction, so instances never share a
    mutable default.
    """
    level: int
    level_name: str
    confidence: str  # one of "low", "medium", "high"
    constructs_found: Dict[str, List[str]]  # level name -> constructs seen at that level
    triple_count: int
    graphs_analyzed: List[str]
    has_schema: bool
    has_instance_data: bool
    owl_imports: List[str] = None  # objects of owl:imports statements
    graph_classifications: Dict[str, str] = None  # graph -> "schema" | "instance" | "mixed"
    recommendations: List[str] = None  # human-readable suggestions

    def __post_init__(self):
        # Swap every None placeholder for its own empty container.
        placeholders = (
            ("owl_imports", list),
            ("graph_classifications", dict),
            ("recommendations", list),
        )
        for attr, make_empty in placeholders:
            if getattr(self, attr) is None:
                setattr(self, attr, make_empty())

    def to_dict(self) -> dict:
        """Return the fields as a plain dict, in declaration order."""
        keys = (
            "level",
            "level_name",
            "confidence",
            "constructs_found",
            "triple_count",
            "graphs_analyzed",
            "has_schema",
            "has_instance_data",
            "owl_imports",
            "graph_classifications",
            "recommendations",
        )
        return {key: getattr(self, key) for key in keys}

    def to_json(self) -> str:
        """Return ``to_dict()`` serialized as JSON indented by two spaces."""
        payload = self.to_dict()
        return json.dumps(payload, indent=2)


def classify_graph(df_graph) -> str:
    """Label one graph's triples as "schema", "instance" or "mixed".

    A graph that uses no schema type or schema predicate from levels 1-4 of
    ``SCHEMA_INDICATORS`` is "instance". Otherwise it is "mixed" when it also
    has rdf:type objects outside the schema types, and "schema" when it does
    not.

    Args:
        df_graph: DataFrame with at least ``predicate`` and ``object`` columns.
    """
    rdf_type = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"

    # Distinct rdf:type objects present in the graph
    type_rows = (
        df_graph.filter(F.col("predicate") == rdf_type)
        .select("object")
        .distinct()
        .collect()
    )
    graph_types = {row.object for row in type_rows}

    # Schema vocabulary pooled across levels 1-4
    schema_levels = [SCHEMA_INDICATORS[level] for level in range(1, 5)]
    known_schema_types = set().union(*(spec["types"] for spec in schema_levels))
    known_schema_predicates = set().union(*(spec["predicates"] for spec in schema_levels))

    # Distinct predicates present in the graph
    predicate_rows = df_graph.select("predicate").distinct().collect()
    graph_predicates = {row.predicate for row in predicate_rows}

    no_schema_vocabulary = (
        graph_types.isdisjoint(known_schema_types)
        and graph_predicates.isdisjoint(known_schema_predicates)
    )
    if no_schema_vocabulary:
        return "instance"

    # Any type outside the schema vocabulary marks instance data alongside schema.
    if graph_types - known_schema_types:
        return "mixed"
    return "schema"


def detect_owl_imports(df_triples) -> List[str]:
    """Return the distinct objects of all owl:imports statements.

    Args:
        df_triples: DataFrame with at least ``predicate`` and ``object`` columns.
    """
    owl_imports_uri = "http://www.w3.org/2002/07/owl#imports"

    import_statements = df_triples.filter(F.col("predicate") == owl_imports_uri)
    import_rows = import_statements.select("object").distinct().collect()
    return [row.object for row in import_rows]


def generate_recommendations(
    level: int, 
    has_schema: bool, 
    has_instance_data: bool,
    owl_imports: List[str],
    graph_classifications: Dict[str, str]
) -> List[str]:
    """Generate recommendations based on schema analysis.

    Args:
        level: Detected schema richness level (0-4).
        has_schema: Whether any schema construct was detected.
        has_instance_data: Whether non-schema rdf:type objects were found.
        owl_imports: Objects of owl:imports statements found in the data.
        graph_classifications: Mapping of graph name to "schema",
            "instance" or "mixed".

    Returns:
        Human-readable suggestions; empty when nothing applies.
    """
    recommendations = []

    # Count graph types
    labels = list(graph_classifications.values())
    schema_graphs = labels.count("schema")
    instance_graphs = labels.count("instance")

    # Recommendation: Load referenced ontologies
    if owl_imports:
        recommendations.append(
            f"Found {len(owl_imports)} owl:imports reference(s). Consider loading these "
            f"ontologies for complete schema information: {', '.join(shorten_uri(u) for u in owl_imports[:3])}"
            + ("..." if len(owl_imports) > 3 else "")
        )

    # Recommendation: Load informative schemas if only instance data
    if has_instance_data and not has_schema:
        recommendations.append(
            "Data contains only instance triples (level 0). Consider loading the associated "
            "ontology/schema files (e.g., from 'informative' or 'normative' folders) for "
            "richer type and property definitions."
        )

    # Recommendation: Low schema level with instance data
    if level == 1 and has_instance_data:
        recommendations.append(
            "SKOS vocabulary detected. If your data uses domain classes, consider loading "
            "the RDFS/OWL ontology for class hierarchy information."
        )

    # Recommendation: Instance-only graphs detected
    if instance_graphs > 0 and schema_graphs > 0:
        recommendations.append(
            f"Detected {instance_graphs} instance-only graph(s) and {schema_graphs} schema graph(s). "
            "Your data is well-structured with separate schema and instance data."
        )
    elif instance_graphs > 0 and instance_graphs == len(labels):
        # Only claim "All" when every graph really is instance-only; with
        # "mixed" graphs present, schema constructs are loaded and the
        # message would be false.
        recommendations.append(
            f"All {instance_graphs} graph(s) contain only instance data. The associated schema/ontology "
            "may not be loaded. Check for 'informative' or 'normative' TTL files."
        )

    # Recommendation: Consider SHACL for validation
    if 2 <= level < 4:
        recommendations.append(
            f"Schema level {level} ({SCHEMA_INDICATORS[level]['name']}) detected. "
            "If SHACL shapes are available, loading them enables data validation."
        )

    return recommendations


def detect_schema_level(df_triples, graphs: Optional[List[str]] = None) -> SchemaAnalysisResult:
    """
    Detect schema richness level from triples DataFrame.

    The reported level is the highest level in SCHEMA_INDICATORS for which at
    least one predicate or rdf:type object is present. Confidence reflects the
    share of that level's indicators that were found; level 0 is always
    reported with high confidence.

    Args:
        df_triples: DataFrame with columns (subject, predicate, object, object_type, graph).
            Only ``predicate``, ``object`` and ``graph`` are read here.
        graphs: Optional list of graph names to analyze (None or empty = all)

    Returns:
        SchemaAnalysisResult with level, confidence, found constructs,
        owl:imports, per-graph classifications and recommendations
    """
    rdf_type = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"

    # Filter by graphs if specified
    df = df_triples.filter(F.col("graph").isin(graphs)) if graphs else df_triples

    # Cache because several Spark actions below read the same rows.
    # NOTE(review): when `graphs` is falsy, `df` is the caller's DataFrame, so
    # the unpersist() below also drops a cache the caller may have set up
    # themselves - confirm no caller relies on that.
    df.cache()
    try:
        triple_count = df.count()
        graphs_analyzed = [row.graph for row in df.select("graph").distinct().collect()]

        # Get all unique predicates
        predicates = {row.predicate for row in df.select("predicate").distinct().collect()}

        # Get types (object values where predicate is rdf:type)
        types = {
            row.object
            for row in df.filter(F.col("predicate") == rdf_type)
                         .select("object")
                         .distinct()
                         .collect()
        }

        # Detect owl:imports
        owl_imports = detect_owl_imports(df)

        # Classify each graph
        graph_classifications = {
            graph: classify_graph(df.filter(F.col("graph") == graph))
            for graph in graphs_analyzed
        }
    finally:
        # Release the cache even when one of the Spark actions above fails.
        df.unpersist()

    # Detect constructs at each level (highest to lowest, which also fixes the
    # key order of constructs_found)
    constructs_found = {}
    detected_level = 0

    for level in (4, 3, 2, 1, 0):
        indicators = SCHEMA_INDICATORS[level]

        found_all = [shorten_uri(p) for p in indicators["predicates"] if p in predicates]
        found_all += [shorten_uri(t) for t in indicators["types"] if t in types]

        if found_all:
            constructs_found[indicators["name"]] = found_all
            detected_level = max(detected_level, level)

    # Determine confidence based on how many indicators found
    level_indicators = SCHEMA_INDICATORS[detected_level]
    expected_count = len(level_indicators["predicates"]) + len(level_indicators["types"])
    found_count = len(constructs_found.get(level_indicators["name"], []))

    if detected_level == 0 or expected_count == 0:
        # Level 0 is always high confidence. Testing the level itself is
        # required: level 0 lists rdf:type as an indicator, so its expected
        # count is never zero and data without rdf:type would otherwise be
        # rated "low".
        confidence = "high"
    elif found_count >= expected_count * 0.5:
        confidence = "high"
    elif found_count >= expected_count * 0.25:
        confidence = "medium"
    else:
        confidence = "low"

    # Determine if we have schema vs instance data
    has_schema = detected_level > 0

    # Check for instance data (rdf:type objects that are not schema types)
    schema_types = set()
    for level in range(1, 5):
        schema_types.update(SCHEMA_INDICATORS[level]["types"])

    has_instance_data = bool(types - schema_types)

    # Generate recommendations
    recommendations = generate_recommendations(
        detected_level, has_schema, has_instance_data, owl_imports, graph_classifications
    )

    return SchemaAnalysisResult(
        level=detected_level,
        level_name=level_indicators["name"],
        confidence=confidence,
        constructs_found=constructs_found,
        triple_count=triple_count,
        graphs_analyzed=graphs_analyzed,
        has_schema=has_schema,
        has_instance_data=has_instance_data,
        owl_imports=owl_imports,
        graph_classifications=graph_classifications,
        recommendations=recommendations,
    )

In [None]:
# Read the input table, then report its size and the graphs it contains
df_triples = spark.table(INPUT_TABLE)
loaded_triple_count = df_triples.count()
print(f"Loaded {loaded_triple_count} triples from '{INPUT_TABLE}'")
available_graph_rows = df_triples.select('graph').distinct().collect()
print(f"Graphs available: {[r.graph for r in available_graph_rows]}")

In [None]:
# Analyze all graphs combined and print a summary of the result
result = detect_schema_level(df_triples)

print("=" * 60)
print("SCHEMA RICHNESS ANALYSIS")
print("=" * 60)
print(f"\nDetected Level: {result.level} - {result.level_name}")
print(f"Confidence: {result.confidence}")
print(f"Total Triples: {result.triple_count}")
print(f"Graphs Analyzed: {result.graphs_analyzed}")
print(f"Has Schema: {result.has_schema}")
print(f"Has Instance Data: {result.has_instance_data}")
print("\nConstructs Found:")
for level_name, constructs in result.constructs_found.items():
    print(f"  {level_name}: {', '.join(constructs)}")

# Show owl:imports if any
if result.owl_imports:
    print(f"\n📦 owl:imports References ({len(result.owl_imports)}):")
    for imp in result.owl_imports:
        print(f"  - {shorten_uri(imp)}")

# Show graph classifications, one icon per classification label
# (the icon literals were previously mis-encoded and printed as garbage)
classification_icons = {"schema": "📐", "instance": "📝", "mixed": "🔀"}
print("\n📊 Graph Classifications:")
for graph, classification in result.graph_classifications.items():
    icon = classification_icons.get(classification, "❓")
    print(f"  {icon} {graph}: {classification}")

In [None]:
# Analyze each graph individually
print("\n" + "=" * 60)
print("PER-GRAPH ANALYSIS")
print("=" * 60)

graphs = [r.graph for r in df_triples.select("graph").distinct().collect()]
graph_results = []

for graph in graphs:
    # Use a dedicated name here: assigning to `result` would overwrite the
    # combined analysis that the previous cell stored under that name.
    graph_result = detect_schema_level(df_triples, graphs=[graph])
    graph_results.append(graph_result)
    print(f"\n{graph}:")
    print(f"  Level: {graph_result.level} ({graph_result.level_name})")
    print(f"  Confidence: {graph_result.confidence}")
    print(f"  Triples: {graph_result.triple_count}")

In [None]:
# Run the combined analysis and print its recommendations as a numbered list
result_all = detect_schema_level(df_triples)

print("=" * 60)
print("RECOMMENDATIONS")
print("=" * 60)

if not result_all.recommendations:
    print("\n[OK] No recommendations - schema analysis looks complete!")
else:
    for i, rec in enumerate(result_all.recommendations, 1):
        print(f"\n{i}. {rec}")

# Extra hint when there are instance-only graphs and no schema was detected
instance_only_graphs = [
    g for g, c in result_all.graph_classifications.items() if c == "instance"
]
if instance_only_graphs and not result_all.has_schema:
    print("\n" + "-" * 60)
    print("\n".join([
        "TIP: Your data appears to be instance-only. If you have access to",
        "informative or normative ontology files (often in separate folders),",
        "loading them will provide:",
        "  - Class hierarchies (rdfs:subClassOf)",
        "  - Property domains and ranges",
        "  - Labels and descriptions for types",
        "  - SHACL shapes for validation (if available)",
    ]))

In [None]:
# Print the 30 most frequent predicates with their triple counts
print("\n" + "=" * 60)
print("PREDICATE DISTRIBUTION")
print("=" * 60)

(
    df_triples.groupBy("predicate")
    .count()
    .orderBy(F.desc("count"))
    .show(30, truncate=80)
)

In [None]:
# Print the 30 most frequent rdf:type objects with their triple counts
print("\n" + "=" * 60)
print("TYPE DISTRIBUTION")
print("=" * 60)

rdf_type = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"
(
    df_triples.filter(F.col("predicate") == rdf_type)
    .groupBy("object")
    .count()
    .orderBy(F.desc("count"))
    .show(30, truncate=80)
)

In [None]:
# Serialize the combined analysis to JSON (for frontend consumption) and print it
result_combined = detect_schema_level(df_triples)
result_json = result_combined.to_json()
print("\nJSON Output (for frontend):", result_json, sep="\n")

In [None]:
# Optional: Save results to Delta table for pipeline use
from datetime import datetime

from pyspark.sql.types import (
    ArrayType,
    BooleanType,
    LongType,
    StringType,
    StructField,
    StructType,
)

# Explicit schema instead of inference from the data: inference cannot
# determine the element type of an empty list, so an analysis with no graphs
# (empty input table) would make createDataFrame fail. The column types match
# what inference produces for non-empty data.
result_schema = StructType([
    StructField("analysis_timestamp", StringType()),
    StructField("level", LongType()),
    StructField("level_name", StringType()),
    StructField("confidence", StringType()),
    StructField("triple_count", LongType()),
    StructField("graphs_analyzed", ArrayType(StringType())),
    StructField("has_schema", BooleanType()),
    StructField("has_instance_data", BooleanType()),
    StructField("constructs_json", StringType()),
])

# Plain tuple in the same order as result_schema, so values are matched to
# columns by position.
result_row = (
    datetime.now().isoformat(),
    result_combined.level,
    result_combined.level_name,
    result_combined.confidence,
    result_combined.triple_count,
    result_combined.graphs_analyzed,
    result_combined.has_schema,
    result_combined.has_instance_data,
    json.dumps(result_combined.constructs_found),
)

df_result = spark.createDataFrame([result_row], schema=result_schema)

# Replace the table contents (and schema) with the single summary row
(
    df_result.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable(OUTPUT_TABLE)
)

print(f"\nResults saved to '{OUTPUT_TABLE}'")