In [0]:
%scala

// The three Delta tables of the preprocess_prod database inspected in this notebook.
val inputFeatures = spark.read
  .format("delta")
  .table("preprocess_prod.checks_input_features")
val outputFeatures = spark.read
  .format("delta")
  .table("preprocess_prod.checks_output_features")
val resultFeatures = spark.read
  .format("delta")
  .table("preprocess_prod.checks_results_features")

// Row counts noted down when the cell was run (uncomment a line to re-check it):
// println(inputFeatures.count)   // 1676346
// println(outputFeatures.count)  // 1676346
// println(resultFeatures.count)  // 5029261 | count should be 5029038 | 223 more records

// Result rows that belong to the CLUSTERED_APTS check.
val resultFeaturesForClustered = resultFeatures
  .where(col("checkName") === "CLUSTERED_APTS")
// println(resultFeaturesForClustered.count)  // 1676569 | 223 more records

// featureIds that occur more than once among the CLUSTERED_APTS rows.
val duplicateRecords = resultFeaturesForClustered
  .groupBy("featureId")
  .count()
  .where(col("count") > 1)
// println(duplicateRecords.count)  // 223 duplicate entries

// CLUSTERED_APTS rows of those duplicated featureIds, keeping only the failed ones.
val result = resultFeaturesForClustered.as("cl")
  .join(duplicateRecords.as("du"), Seq("featureId"))
  .select("cl.*")
  .where(col("isFailed") === "true")


// Clustered APT rows of one specific run, limited to origin "source" and status "SUSPICIOUS".
val clusteredApts = spark.read
  .format("delta")
  .table("preprocess_prod.clustered_apt")
  .where(
    col("runId") === "948265873867220" &&
      col("origin") === "source" &&
      col("status") === "SUSPICIOUS"
  )
// println(clusteredApts.count) // 521

// Failed CLUSTERED_APTS rows from the check results table.
val clusteredInResultsFeature = resultFeatures
  .where(col("checkName") === "CLUSTERED_APTS" && col("isFailed") === "true")
// println(clusteredInResultsFeature.count) // 575

// Input features whose featureId has a failed CLUSTERED_APTS result.
val failedInClusteredCheck = inputFeatures
  .join(clusteredInResultsFeature, Seq("featureId"))
// .select(col("featureId"), col("apt.location"))
// .groupBy(col("location")).count.filter(col("count") > 1)

display(failedInClusteredCheck)


// println(outputFeatures.filter(col("sos").contains("CLUSTERED_APTS_DETECTED")).count)
// Unexpected

In [0]:
%scala
// Database the following cells read from; swap the two lines to look at dev instead.
// val dbName = "preprocess_dev"
val dbName = "preprocess_prod"

// Every row of the checks input features table in the selected database.
val checks_input_features_dataset = {
  val query = s"""
    SELECT *
    FROM 
    ${dbName}.checks_input_features
"""
  spark.sql(query)
}

println("checks_input_features_dataset count: " + checks_input_features_dataset.count())

In [0]:
%scala

// Every row of the checks results table in the database selected by `dbName`.
val checks_results_features_dataset = {
  val query = s"""
    SELECT *
    FROM 
    ${dbName}.checks_results_features
"""
  spark.sql(query)
}

display(checks_results_features_dataset)

In [0]:
%scala

// Number of result rows per check name.
// Declared as `var` because the following cells reassign it with other groupings.
var grouped_check_names = checks_results_features_dataset
  .groupBy(col("checkName"))
  .count()

display(grouped_check_names)

In [0]:
%scala
// Row counts per (checkName, isFailed, comment) combination.
grouped_check_names = checks_results_features_dataset
  .groupBy(col("checkName"), col("isFailed"), col("comment"))
  .count()

display(grouped_check_names)

In [0]:
%scala
// Row counts per (checkName, isFailed) combination.
grouped_check_names = checks_results_features_dataset
  .groupBy(col("checkName"), col("isFailed"))
  .count()

display(grouped_check_names)

In [0]:
%scala
// Row counts broken down by check name, isFailed flag and comment text.
val groupingColumns = Seq("checkName", "isFailed", "comment").map(col)
grouped_check_names = checks_results_features_dataset
  .groupBy(groupingColumns: _*)
  .count()

display(grouped_check_names)

In [0]:
%scala

// APT_INSIDE_AERODROME result rows whose comment starts with "APT located".
val aptInsideAerodromeDS = checks_results_features_dataset
  .where("checkName == 'APT_INSIDE_AERODROME' AND comment LIKE 'APT located%'")
  // .filter("checkName == 'APT_ON_LAND_FEATURE' AND comment LIKE 'APT located%'")


println("aptInsideAerodromeDS: " + aptInsideAerodromeDS.count())
display(aptInsideAerodromeDS)

In [0]:
%scala
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.functions._

// Joins the input features ("apt") with the APT_INSIDE_AERODROME result rows
// ("aptOnAerodrome") on featureId and extracts the aerodrome id from the
// result comment.
//
// Every column taken from the result side is referenced through its alias.
// The unqualified `comment` / `isFailed` references only resolve while the
// input table has no column of the same name; qualifying them keeps the query
// valid if that ever changes and matches the other qualified references here.
val aerodromeFailedAptDS = checks_input_features_dataset.as("apt")
  .join(
    aptInsideAerodromeDS.as("aptOnAerodrome"),
    col("apt.featureId") === col("aptOnAerodrome.featureId"),
    "inner"
  )
  // Capture group 2 is the text that follows "APT located inside aerodrome: "
  // or "APT located on runway: ". The column name (spelling included) is kept
  // as is because a later cell joins on it.
  .withColumn(
    "aeroddromid",
    regexp_extract(
      col("aptOnAerodrome.comment"),
      "APT located (inside aerodrome|on runway): (.+)",
      2
    )
  )
  .select(
    col("apt.featureId"),
    col("apt.apt.location"),
    col("aptOnAerodrome.fixme"),
    col("aptOnAerodrome.comment"),
    col("aeroddromid"),
    col("aptOnAerodrome.isFailed")
  )

display(aerodromeFailedAptDS)
                

In [0]:
%scala
import org.apache.spark.sql.functions.col
import com.tomtom.orbis.io.spark.model.Id
import java.lang.Long

/**
 * Renders an Orbis id as the colon-separated string "layerId:high:low".
 *
 * The layer id falls back to 19174 when `orbisId.layerId` is empty; `high`
 * and `low` are written as unsigned decimal numbers.
 *
 * @param orbisId id to render
 * @return the three parts joined with ":"
 */
def convertOrbisIdToString(orbisId: Id): String = {
  val layerPart = orbisId.layerId.getOrElse(19174).toString
  val highPart = java.lang.Long.toUnsignedString(orbisId.high)
  val lowPart = java.lang.Long.toUnsignedString(orbisId.low)
  s"$layerPart:$highPart:$lowPart"
}

// Spark UDF around convertOrbisIdToString so it can be applied to a DataFrame column.
val convertOrbisIdUDF = udf(convertOrbisIdToString _)

// Rows of the layer_19174 table with the id rendered as a string
// ("orbisIdString") next to the original `id` and the `wkt` column.
//
// The table is read from `dbName`, like the check tables in the earlier cells.
// With the database name hard-coded here, switching `dbName` to dev would
// silently join dev check results against prod aerodromes.
val aerodrome_dataset = spark.sql(s"""
    SELECT *
    FROM 
    ${dbName}.layer_19174
""")
  .withColumn("orbisIdString", convertOrbisIdUDF(col("id")))
  // .filter("id.layerId != null")
  .select(col("id"), col("orbisIdString"), col("wkt"))

display(aerodrome_dataset)

In [0]:
%scala
import org.apache.spark.sql.functions.col

// Left join of the APT rows ("add") with the aerodrome rows ("pdd") on the
// extracted aerodrome id, so APT rows without a matching aerodrome are kept
// with a null `wkt`.
val failedAPTWithAerodrome = aerodromeFailedAptDS.as("add")
  .join(
    aerodrome_dataset.as("pdd"),
    col("add.aeroddromid") === col("pdd.orbisIdString"),
    "left"
  )
  .select(
    col("featureId"),
    col("location"),
    col("comment"),
    col("isFailed"),
    col("aeroddromid"),
    col("pdd.wkt")
  )

display(failedAPTWithAerodrome)