In [0]:
from pyspark.sql.functions import *

In [0]:
# Storage locations for this bronze -> silver step:
#   input_path       - Delta folder the stream reads from
#   output_path      - Delta folder the stream writes to
#   checkpoints_path - checkpoint folder used by the streaming write
input_path = "/Volumes/well_logs_nrt/bronze/deltalake_folders_volume/all_wells_bronze"
output_path = "/Volumes/well_logs_nrt/silver/deltalake_folders_volume/all_wells_silver"
checkpoints_path = "/Volumes/well_logs_nrt/autoloader/checkpoints_volume/all_wells_silver_checkpoints"

# Open the bronze Delta folder as a streaming source.
df = (
    spark.readStream
    .format("delta")
    .load(input_path)
)

## Merge duplicated columns: input columns that are aliases of the same
## standard name are collapsed into one column carrying that standard name.

# Alias column name -> standard column name.
column_map = {
    "DEPT": "DEPTH", "Gamma": "GR",
    "RILD": "ILD", "RILM": "ILM"
}

# Read the column list once and reuse it, so the loops below are plain Python
# membership tests instead of repeated schema look-ups on the DataFrame.
df_cols = df.columns

# Group the aliases by the standard name they map to. Dict insertion order
# keeps targets, and the aliases of each target, in the order of column_map.
aliases_by_target = {}
for alias_name, target_name in column_map.items():
    aliases_by_target.setdefault(target_name, []).append(alias_name)

renamed_cols = []
already_mapped = set()
for target_name, alias_names in aliases_by_target.items():
    # Aliases present in the input come first, then the standard-named column
    # itself when it exists; coalesce returns the first non-null of them per row.
    present = [name for name in alias_names if name in df_cols]
    if target_name in df_cols:
        present.append(target_name)
    if not present:
        # Neither an alias nor the standard column exists in this input.
        continue
    renamed_cols.append(coalesce(*(col(name) for name in present)).alias(target_name))
    already_mapped.add(target_name)

# Carry over every remaining column unchanged: skip the aliases (keys of
# column_map) and the standard-named columns already folded into a coalesce.
renamed_cols.extend(
    col(name) for name in df_cols
    if name not in column_map and name not in already_mapped
)

# Final DataFrame
df_dedup = df.select(renamed_cols)
df_dedup_cols = df_dedup.columns

# Apply the same naming standard to every log: WELL_NAME becomes well_id and is
# placed first, RECORDED_IN becomes measurement_time and is placed last, and
# all other columns keep their name and relative order in between.

column_names = (
    [col("WELL_NAME").alias("well_id")]
    + [col(name) for name in df_dedup_cols if name not in ("WELL_NAME", "RECORDED_IN")]
    + [col("RECORDED_IN").alias("measurement_time")]
)

# Columns removed from the silver output.
dropped_cols = ("bronze_ingestion_time", "_rescued_data", "file_mod_time")

df_renamed = (
    df_dedup.select(*column_names)
    # Convert measurement_time to a timestamp column.
    .withColumn("measurement_time", to_timestamp("measurement_time"))
    # Stamp each row with the time it is processed by this step.
    .withColumn("silver_ingestion_time", current_timestamp())
    .drop(*dropped_cols)
)

# Append the standardized rows to the silver Delta folder.
# availableNow processes everything currently pending in the source and then
# stops; it replaces the deprecated trigger(once=True).
silver_query = (
    df_renamed.writeStream.format("delta")
    .option("checkpointLocation", checkpoints_path)
    .option("mergeSchema", "true")
    .outputMode("append")
    .trigger(availableNow=True)
    .start(output_path)
)

# start() returns immediately while the query runs in the background. Block
# here until it terminates so that later cells read a fully written table and
# a failed query raises in this cell instead of going unnoticed.
silver_query.awaitTermination()

In [0]:
# Verification: read the silver Delta folder back as a batch DataFrame and show
# the rows of a single well, sorted by well_id and then depth (both ascending).
silver_well_logs = (
    spark.read.format("delta")
    .load("/Volumes/well_logs_nrt/silver/deltalake_folders_volume/all_wells_silver")
    .where("well_id = 'F02-01_logs'")
    .orderBy(col("well_id").asc(), col("depth").asc())
)
display(silver_well_logs)