# 01 - RDF Parser (Apache Jena)

Parses Turtle (.ttl) RDF files into a bronze-layer Delta table using Apache Jena.

**Prerequisites**: Attach Fabric Environment `env_rdf_jena` with `jena-shaded-4.10.0.jar`  
(Built from `tools/jena-shaded/` - single uber JAR with relocated dependencies)

**Input**: TTL files from lakehouse Files (shortcuts or uploaded)  
**Output**: Delta table `bronze_triples`

| Column | Type | Description |
|--------|------|-------------|
| subject | String | Subject URI or blank node |
| predicate | String | Predicate URI |
| object | String | Object value (URI, blank node, or literal) |
| object_type | String | 'uri', 'bnode', or 'literal' ('unknown' for any other node kind) |
| datatype | String | XSD datatype for literals (nullable) |
| lang | String | Language tag for literals (nullable) |
| graph | String | Source graph name (TTL file name without the `.ttl` extension) |

In [None]:
// Configuration

// Base path under which every input folder is resolved
val lakehousePath: String = "/lakehouse/default/Files"

// Folders (relative to lakehousePath) that are scanned for TTL files.
// BOTH are loaded: the normative schema AND the example instance data.
val inputFolders: Seq[String] = Seq(
  // Schema with class definitions, descriptions, labels
  // (nen2660-term.ttl, nen2660-rdfs.ttl, etc.)
  "normative_nen2660",
  // Instance data (IJsselbrug.ttl, etc.)
  "examples_nen2660"
)

// Name of the Delta table that receives the parsed triples
val outputTable: String = "bronze_triples"

println("Will parse TTL files from: " + inputFolders.mkString(", "))

In [None]:
import org.apache.jena.riot.RDFDataMgr
import org.apache.jena.riot.Lang
import org.apache.jena.graph.{Node, Triple}
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types._
import scala.collection.JavaConverters._
import java.io.File

// Bronze-layer triple schema: every column is a string.
// Each entry is (column name, nullable); only `datatype` and `lang` may be null.
val tripleSchema = StructType(
  Seq(
    "subject"     -> false,
    "predicate"   -> false,
    "object"      -> false,
    "object_type" -> false,
    "datatype"    -> true,
    "lang"        -> true,
    "graph"       -> false
  ).map { case (column, isNullable) =>
    StructField(column, StringType, nullable = isNullable)
  }
)

In [None]:
/**
 * Convert a Jena Node into its string form plus descriptive metadata.
 *
 * @param node the node to describe
 * @return a tuple (value, kind, datatype, lang) where
 *         - value is the URI, the blank node label prefixed with "_:", the
 *           literal's lexical form, or `node.toString` for anything else;
 *         - kind is "uri", "bnode", "literal" or "unknown";
 *         - datatype is the literal's datatype URI when Jena reports one,
 *           otherwise None (always None for non-literals);
 *         - lang is the literal's non-empty language tag, otherwise None.
 */
def extractNode(node: Node): (String, String, Option[String], Option[String]) = node match {
  case uri if uri.isURI =>
    (uri.getURI, "uri", None, None)

  case bnode if bnode.isBlank =>
    (s"_:${bnode.getBlankNodeLabel}", "bnode", None, None)

  case literal if literal.isLiteral =>
    val datatypeUri = Option(literal.getLiteralDatatypeURI)
    // An empty language tag is treated as "no language"
    val languageTag = Option(literal.getLiteralLanguage).filter(_.nonEmpty)
    (literal.getLiteralLexicalForm, "literal", datatypeUri, languageTag)

  case other =>
    (other.toString, "unknown", None, None)
}

/**
 * Parse a TTL file using Apache Jena and return its triples as Rows.
 *
 * Each Row holds, in order: subject, predicate, object, object_type,
 * datatype, lang, graph (the column order of `tripleSchema`).
 *
 * The rows are materialized eagerly into a Vector. On Scala 2.12,
 * `Iterator.toSeq` produces a lazy `Stream`, which would leave the returned
 * sequence tied to the live Jena iterator and defer node-conversion errors
 * until some later traversal.
 *
 * @param filePath  location of the Turtle file, passed to `RDFDataMgr.loadModel`
 * @param graphName value written to the `graph` column of every returned Row
 * @return one Row per triple found in the file
 */
def parseTtlFile(filePath: String, graphName: String): Seq[Row] = {
  val model = RDFDataMgr.loadModel(filePath, Lang.TURTLE)
  val graph = model.getGraph

  graph.find().asScala.map { triple =>
    // Only the subject's string form is stored; its node kind has no column.
    val (subj, _, _, _) = extractNode(triple.getSubject)
    val pred = triple.getPredicate.getURI
    val (obj, objType, datatype, lang) = extractNode(triple.getObject)

    // Absent datatype/lang become SQL NULLs in the nullable columns.
    Row(subj, pred, obj, objType, datatype.orNull, lang.orNull, graphName)
  }.toVector // strict: fully drains the Jena iterator before returning
}

In [None]:
/**
 * Recursively collect every file below `dir` whose name ends with ".ttl".
 *
 * @param dir directory to scan; sub-directories are descended into
 * @return the matching files, or an empty Seq when `dir` is not a directory
 *         or its contents cannot be listed
 */
def findTtlFiles(dir: File): Seq[File] = {
  // File.listFiles returns null both for non-directories and when listing
  // fails (I/O error, missing permission); treat either case as "no entries"
  // instead of hitting a NullPointerException.
  val entries = Option(dir.listFiles).map(_.toSeq).getOrElse(Seq.empty)

  entries.flatMap { entry =>
    if (entry.isDirectory) findTtlFiles(entry)
    else if (entry.getName.endsWith(".ttl")) Seq(entry)
    else Seq.empty
  }
}

// Scan each configured input folder (resolved under lakehousePath) and
// gather the TTL files found in it, reporting what was found per folder.
val ttlFiles = inputFolders.flatMap { folder =>
  val folderDir = new File(s"$lakehousePath/$folder")
  println(s"\nScanning folder: $folder")

  val found = findTtlFiles(folderDir)
  println(s"  Found ${found.length} TTL files")
  for (ttl <- found) println(s"    - ${ttl.getName}")

  found
}

println(s"\n=== Total: ${ttlFiles.length} TTL files from ${inputFolders.length} folders ===")

In [None]:
// Parse every discovered TTL file; the graph name of a file's triples is
// the file name with its ".ttl" suffix removed.
val allTriples = ttlFiles.flatMap { ttl =>
  val fileName = ttl.getName
  println(s"Parsing: $fileName")

  val rows = parseTtlFile(ttl.getAbsolutePath, fileName.stripSuffix(".ttl"))
  println(s"  -> ${rows.length} triples")
  rows
}

println(s"\nTotal triples: ${allTriples.length}")

In [None]:
// Turn the parsed rows into an RDD, then into a DataFrame with the bronze schema
val tripleRows = spark.sparkContext.parallelize(allTriples)
val dfTriples = spark.createDataFrame(tripleRows, tripleSchema)

// Show a sample of 10 rows
dfTriples.show(10, truncate = 50)

In [None]:
// Write to the Delta table, replacing any existing data and schema
val deltaWriter = dfTriples.write
  .format("delta")
  .mode("overwrite")
  .option("overwriteSchema", "true")
deltaWriter.saveAsTable(outputTable)

// Report how many triples the DataFrame holds
val savedCount = dfTriples.count()
println(s"Saved $savedCount triples to table '$outputTable'")

In [None]:
// Verification: per-graph triple, subject and predicate counts from the output table
val graphStatsSql = s"""
  SELECT graph, 
         COUNT(*) as triple_count,
         COUNT(DISTINCT subject) as unique_subjects,
         COUNT(DISTINCT predicate) as unique_predicates
  FROM $outputTable
  GROUP BY graph
  ORDER BY triple_count DESC
"""

spark.sql(graphStatsSql).show()