#### Rename and dedup columns (bronze to silver)

In [0]:
from pyspark.sql.functions import *
import dlt

In [0]:
#@dlt.expect_or_drop("no_duplicates", "ROW_NUMBER() OVER (PARTITION BY measurement_time ORDER BY well_id) = 1")
@dlt.table(
    name="all_wells_silver",
    comment="Table that contains all wells in silver layer",
    table_properties={"layer": "silver", "type": "well log"},
)
def all_wells_silver():
    """Build the silver well-log table from the ``all_wells_bronze`` stream.

    Steps, in order:

    1. Columns that are known aliases of the same log are coalesced into a
       single column carrying the standard name.
    2. ``WELL_NAME`` is exposed as ``well_id`` (first column) and
       ``RECORDED_IN`` as ``measurement_time`` (last selected column),
       converted with ``to_timestamp``.
    3. A ``silver_ingestion_time`` column is added from ``current_timestamp``.
    4. ``bronze_ingestion_time``, ``_rescued_data`` and ``file_mod_time``
       are dropped.

    Returns:
        The transformed streaming DataFrame.
    """
    bronze = dlt.read_stream("all_wells_bronze")
    source_names = bronze.schema.names

    # Alias found in the raw logs -> standard column name.
    aliases = {
        "DEPT": "DEPTH", "Gamma": "GR",
        "RILD": "ILD", "RILM": "ILM",
    }

    # One coalesced column per standard name, visited in the order the
    # standard names first appear in the alias map.
    merged = []
    merged_targets = set()
    for target in dict.fromkeys(aliases.values()):
        candidates = [
            col(alias)
            for alias, standard in aliases.items()
            if standard == target and alias in source_names
        ]
        # A column already carrying the standard name joins the merge last,
        # so the aliases take priority in the coalesce.
        if target in source_names:
            candidates.append(col(target))
        if not candidates:
            # Neither the alias nor the standard column is present.
            continue
        merged.append(coalesce(*candidates).alias(target))
        merged_targets.add(target)

    # Everything that is neither an alias nor an already merged standard
    # column is carried over untouched.
    passthrough = [
        col(name)
        for name in source_names
        if not (name in aliases or name in merged_targets)
    ]

    deduped = bronze.select(merged + passthrough)

    # Same layout for every log: well_id first, measurement_time last.
    middle = [
        col(name)
        for name in deduped.columns
        if name not in ("WELL_NAME", "RECORDED_IN")
    ]
    standardized = deduped.select(
        [
            col("WELL_NAME").alias("well_id"),
            *middle,
            col("RECORDED_IN").alias("measurement_time"),
        ]
    )

    with_timestamps = standardized.withColumn(
        "measurement_time", to_timestamp("measurement_time")
    ).withColumn("silver_ingestion_time", current_timestamp())

    return with_timestamps.drop(
        "bronze_ingestion_time", "_rescued_data", "file_mod_time"
    )