**SEACO-5450**: Scoping of APTs where both addr:suburb and addr:place exist

In [0]:
%scala
//Load latest snapshot

import com.databricks.dbutils_v1.DBUtilsHolder
import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper}
import com.tomtom.addressing.bulk.commons.model.LayerVersions
import com.tomtom.orbis.addressing.bulk.commons.spark.SparkHelper
import com.tomtom.addressing.bulk.commons.config.ConfigLoader
import org.apache.sedona.spark.SedonaContext
import com.tomtom.addressing.bulk.scala.load.LoadFreshSnapshotData


// Target environment and database handed to the project helpers further down.
val env = "prod"
val database = "delete_retriggers"

// Jackson mapper configured to ignore unknown properties, read empty strings
// as null objects and accept a single value where an array is expected.
val mapper = new ObjectMapper()
  .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false)
  .configure(DeserializationFeature.ACCEPT_EMPTY_STRING_AS_NULL_OBJECT, true)
  .configure(DeserializationFeature.ACCEPT_SINGLE_VALUE_AS_ARRAY, true)

// Register layer 14533 with a null second argument.
// NOTE(review): null presumably means "no pinned version / use latest" — confirm
// against LayerVersions.
val versionsBuilder = LayerVersions.builder()
versionsBuilder.layer(14533, null)

// Serialise the layer selection to JSON and expose it through the
// "layer-versions" notebook widget.
// NOTE(review): presumably LoadFreshSnapshotData reads this widget — confirm.
val versionMetadata: String = mapper.writeValueAsString(versionsBuilder.build())
DBUtilsHolder.dbutils.widgets.text("layer-versions", versionMetadata)

// Session and project initialisation; the call order is kept as-is because
// each call has side effects.
implicit val sparky = spark
SedonaContext.create(spark)
ConfigLoader.forEnvironment(env)
SparkHelper.init(database)

// Run the snapshot load.
new LoadFreshSnapshotData().run()

In [0]:
%scala

// UDF that renders a signed 64-bit decimal string as its unsigned decimal form.
//  - null or empty input      -> "0"
//  - parseable as a Long      -> java.lang.Long.toUnsignedString of that value
//  - anything else            -> the input string, unchanged
val toUnsignedLong = udf { (signedStr: String) =>
  Option(signedStr).filter(_.nonEmpty) match {
    case None => "0"
    case Some(raw) =>
      try java.lang.Long.toUnsignedString(raw.toLong)
      catch {
        // Not a valid number: hand the original text back untouched.
        case _: NumberFormatException => raw
      }
  }
}

In [0]:
%scala

import org.apache.spark.sql.functions._
import com.tomtom.orbis.addressing.bulk.commons.repository.OrbisElementRepository


// Read all elements of layer "14533" and add two derived columns:
//  - Product_orbis_id: "<id.layerId>_<id.high>_<id.low>", with high/low passed
//    through toUnsignedLong so they are rendered as unsigned decimals.
//  - country: value of the first tag whose key equals 'metadata:country'.
//    NOTE(review): when no such tag exists the [0] lookup yields null under
//    non-ANSI settings — confirm the cluster's ANSI mode if that matters.
val aptDS = new OrbisElementRepository("14533").readAll
val finalDf = aptDS
  .withColumn(
    "Product_orbis_id",
    concat(
      col("id.layerId"),
      lit("_"),
      toUnsignedLong(col("id.high")),
      lit("_"),
      toUnsignedLong(col("id.low"))
    )
  )
  // "country" is assigned exactly once; withColumn replaces a same-named
  // column, so any earlier assignment would simply be discarded.
  .withColumn(
    "country",
    expr("filter(tags, x -> x.tagKey.key = 'metadata:country')[0].value")
  )
display(finalDf)

In [0]:
%scala
import org.apache.spark.sql.functions._
import org.apache.spark.sql.{Row, SparkSession}
import com.tomtom.orbis.io.spark.model.Tag
import scala.util.control.Breaks._

// UDF: true when the tag list has at least one key containing "addr:suburb"
// and at least one key containing "addr:place".
// The match is a substring test, so keys that merely contain those fragments
// (for example with a suffix) also qualify.
// NOTE(review): confirm that substring rather than exact matching is intended.
// A null tag list yields false instead of throwing, because Spark passes SQL
// NULL arrays to Scala UDFs as null references.
val hasSuburbAndPlace = udf((tags: Seq[Tag]) => {
  def hasKeyContaining(fragment: String): Boolean =
    tags.exists(tag => tag.tagKey.key.contains(fragment))

  tags != null && hasKeyContaining("addr:suburb") && hasKeyContaining("addr:place")
})

// Keep only German ("DEU") elements that carry both kinds of tag.
val filteredDF = finalDf
  .filter(col("country") === "DEU")
  .filter(hasSuburbAndPlace(col("tags")))

display(filteredDF)

In [0]:
%scala

// count() is a Spark action and triggers a job over filteredDF.
// The original author noted a result of 1507 for their run.
print(s"APT count with suburb and place: ${filteredDF.count()}")

In [0]:
%scala
import org.apache.spark.sql.types._

// Shape the filtered elements into the data-correctness layout: the element id
// plus constant placeholder columns (empty strings, fixed component/language
// labels and zero coordinates).
val dataCorrectnessDataset = {
  val emptyText = lit("").cast(StringType)
  val zero = lit(0)

  val outputColumns = Seq(
    col("Product_orbis_id"),
    emptyText.as("Target_Value"),
    lit("addr:place").as("Product_Address_component"),
    lit("de-Latn").as("Product_Language"),
    emptyText.as("Source_Feature_Id"),
    zero.as("Orbis_X"),
    zero.as("Orbis_Y")
  )

  filteredDF.select(outputColumns: _*)
}

display(dataCorrectnessDataset)

In [0]:
%scala

// Destination folder for the data-correctness CSV export.
val outputPath = "dbfs:/mnt/source-precheck/deu-data-correctness/missing-input-after-si-1/"

// Collapse to a single partition so one CSV part file is produced, then write
// it with a header row, replacing whatever already exists at the destination.
dataCorrectnessDataset
  .coalesce(1)
  .write
  .mode("overwrite")
  .option("header", "true")
  .csv(outputPath)

In [0]:
%scala
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions._
// Backup 

// Destination folder for the backup of the filtered elements.
val outputPath = "dbfs:/mnt/source-precheck/deu-data-correctness/missing-backup-after-si-1/"

// CSV cannot hold nested values, so tags, nodes and members are serialised to
// JSON strings and semanticIds is flattened into a comma-separated string.
filteredDF
  .select(
    col("revisionId"),
    col("elementType"),
    to_json(col("tags")).as("tags"),
    col("lat"),
    col("lng"),
    to_json(col("nodes")).as("nodes"),
    col("wkt"),
    to_json(col("members")).as("members"),
    concat_ws(",", col("semanticIds")).as("semanticIds"),
    col("Product_orbis_id"),
    col("country")
  )
  // Single partition -> single CSV part file.
  .coalesce(1)
  .write
  // Replace any previous backup at the destination and include a header row.
  .mode("overwrite")
  .option("header", "true")
  .csv(outputPath)