#### Data cleaning and validations (silver to gold)

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.window import Window
import dlt

In [0]:
# Duplicate rows are removed inside the function body (dropDuplicates) rather
# than through an expectation.
@dlt.expect("has_well_id", "well_id IS NOT NULL")
@dlt.expect_or_drop("valid_depth", "DEPTH >= 0")
@dlt.table(
    name="all_wells_gold",
    comment="Table that contains all wells in gold layer",
    partition_cols=["well_id"],
    table_properties={
        "layer": "gold",
        "type": "well log",
    },
)
def incremental_to_gold():
    """Build the gold-layer well table from the silver stream.

    Reads ``all_wells_silver`` as a stream, then:

    * keeps only rows whose ``well_id`` is not null,
    * removes duplicates on ``(well_id, measurement_time)``,
    * stamps each row with ``gold_ingestion_time``,
    * drops the ``silver_ingestion_time`` and ``_rescued_data`` columns,
    * adds a ``quality_check`` column that is ``"valid"`` when
      ``DEPTH >= 0`` and ``"Not passed"`` otherwise (including null DEPTH).

    Returns:
        The cleaned streaming DataFrame that backs ``all_wells_gold``.
    """
    silver = dlt.read_stream("all_wells_silver")

    # Data cleaning
    # NOTE(review): dropDuplicates on a stream without a watermark keeps state
    # for every key ever seen. If measurement_time is an event-time timestamp,
    # consider adding withWatermark before this step -- confirm the column type.
    cleaned = (
        silver.filter(col("well_id").isNotNull())
        .dropDuplicates(["well_id", "measurement_time"])
        .withColumn("gold_ingestion_time", current_timestamp())
        # Column names are passed as strings: PySpark releases before 3.4
        # raise TypeError when several Column objects are given to drop().
        .drop("silver_ingestion_time", "_rescued_data")
    )

    # Data validations
    # NOTE(review): the "valid_depth" drop-expectation above presumably removes
    # every row that would be tagged "Not passed" here -- confirm this column
    # is still wanted in the gold table.
    depth_ok = col("DEPTH") >= 0
    return cleaned.withColumn(
        "quality_check",
        when(depth_ok, "valid").otherwise("Not passed"),
    )