In [0]:
from pyspark.sql.functions import *
from pyspark.sql.window import Window

In [0]:
input_path = "/Volumes/well_logs_nrt/gold/deltalake_folders_volume/all_wells_gold"

# (label written to log_name, source column) for every log curve to unpivot.
# The stack() row count is derived from this list so the count and the
# argument list cannot drift apart when a curve is added or removed.
# NOTE(review): the labels 'Perm' and 'Fluvialfacie' differ from their column
# names (PERM, Fluvialfacies). They are left unchanged because they are emitted
# as data; confirm whether that spelling is intended.
log_columns = [
    ("GR", "GR"),
    ("ILD", "ILD"),
    ("ILM", "ILM"),
    ("CALI", "CALI"),
    ("DPHI", "DPHI"),
    ("DT", "DT"),
    ("DT2", "DT2"),
    ("NPHI", "NPHI"),
    ("RHOB", "RHOB"),
    ("SFLU", "SFLU"),
    ("SP", "SP"),
    ("TEN", "TEN"),
    ("POROSITY", "POROSITY"),
    ("Perm", "PERM"),
    ("Fluvialfacie", "Fluvialfacies"),
    ("NetGross", "NetGross"),
]
unpivotExpr = "stack({count}, {pairs}) as (log_name, values)".format(
    count=len(log_columns),
    pairs=", ".join(f"'{label}', {column}" for label, column in log_columns),
)

df = spark.read.format("delta").load(input_path)

# Wide -> long: one row per (well_id, DEPTH, log_name) with the reading in `values`.
df_unpivoted = df.selectExpr("well_id", "DEPTH", unpivotExpr)

# Keep only usable readings: drop NULLs and the -999.25 value that this
# notebook treats as invalid. No sort is applied here because the later
# groupBy/window steps do not depend on row order.
df_cleaned = df_unpivoted.where(
    col("values").isNotNull() & (col("values") != -999.25)
)

#display(df_unpivoted)
#display(df_unpivoted.groupBy("well_id").count())
# Sorting is applied only to the displayed view.
display(df_cleaned.orderBy("well_id", "DEPTH", "log_name"))

In [0]:
# Per-well, per-log summary statistics over the valid readings.
# `min`, `max`, `avg`, `stddev`, `count` are the Spark SQL functions brought in
# by the star import at the top of the notebook, not the Python builtins.

# A reading is valid when it is non-NULL and not the -999.25 value that the
# cleaning step also filters out. Compared numerically, as in that step.
valid_value_cond = col("values").isNotNull() & (col("values") != -999.25)
# Yields the reading when valid and NULL otherwise, so the aggregates below
# (which skip NULLs) only see valid readings.
valid_value = when(valid_value_cond, col("values"))

df_agg = (df_unpivoted
         .groupBy("well_id", "log_name")
         .agg(
              min(valid_value).alias("min_value"),
              max(valid_value).alias("max_value"),
              avg(valid_value).alias("avg_value"),
              stddev(valid_value).alias("stddev_value"),
              # Non-NULL readings, including -999.25 ones.
              count("values").alias("total_values_count"),
              # Non-NULL readings excluding -999.25 ones.
              count(valid_value).alias("valid_values_count"))
         # Drop logs with no valid reading at all. valid_values_count can never
         # exceed total_values_count, so this single check covers both counts.
         .where(col("valid_values_count") != 0)
         # Filter first, then sort the (smaller) result.
         .orderBy("well_id", "log_name")
)

# Identify the min and max valid depth from each log.

# Depth-ordered windows per (well, log). window_spec_min is used by the
# depth_diff step later in this cell. window_spec_max is no longer needed for
# the min/max lookup below but stays defined in case other cells reference it.
window_spec_min = Window.partitionBy("well_id", "log_name").orderBy(col("DEPTH").asc())
window_spec_max = Window.partitionBy("well_id", "log_name").orderBy(col("DEPTH").desc())

# df_cleaned only holds valid readings, so the smallest / largest DEPTH per
# (well, log) is the shallowest / deepest valid depth. Plain aggregates replace
# the row_number()-and-filter lookup; they also skip NULL DEPTH values.
# `min` / `max` are the Spark SQL functions from the star import at the top of
# the notebook, not the Python builtins.
df_min_depth = (
    df_cleaned.groupBy("well_id", "log_name")
              .agg(min("DEPTH").alias("min_depth_valid"))
)

df_max_depth = (
    df_cleaned.groupBy("well_id", "log_name")
              .agg(max("DEPTH").alias("max_depth_valid"))
)

# Distance from each valid sample to the previous valid sample in the same
# window partition. The first sample of a partition has no predecessor, so
# lag() returns NULL there and depth_diff is NULL.
previous_depth = lag("DEPTH", 1).over(window_spec_min)
df_with_diff = df_cleaned.withColumn("depth_diff", col("DEPTH") - previous_depth)

# Add up those distances per well and log (`sum` is the Spark SQL function from
# the star import; NULL diffs are skipped).
# NOTE(review): assuming window_spec_min partitions by (well_id, log_name) and
# orders by DEPTH ascending, the distances add up to deepest minus shallowest
# valid depth, so stretches of invalid samples between two valid ones are
# counted as well. Confirm that this is the intended "valid interval". The
# unit "meters" comes from the column name only.
interval_total = sum("depth_diff").alias("valid_interval_meters")
df_valid_depth = df_with_diff.groupBy("well_id", "log_name").agg(interval_total)

# Assemble the final per-well, per-log table: start from the minimum depth
# and inner-join the remaining frames one after another on the shared keys.
join_keys = ["well_id", "log_name"]

df_final = df_min_depth
for right_side in (df_max_depth, df_valid_depth, df_agg):
    df_final = df_final.join(right_side, on=join_keys, how="inner")

display(df_final)

In [0]:
# Classify every sample into a zone type from its GR reading.
# Cut-offs for the classification (units presumably API; confirm).
GR_SAND_MAX = 20
GR_SHALY_SAND_MAX = 60

gr_value = col("GR")

df_gr_zonation = (df
                  .select("well_id", "DEPTH", "GR")
                  # Drop NULL GR and the -999.25 value that the cleaning cell
                  # treats as invalid. Without this, -999.25 satisfies
                  # GR <= 20 and is labelled "sand", and NULL falls through
                  # to "shale".
                  .where(gr_value.isNotNull() & (gr_value != -999.25))
                  # when() branches are tried in order, so the second branch
                  # only sees GR values above GR_SAND_MAX.
                  .withColumn("zone_type", when(gr_value <= GR_SAND_MAX, "sand")
                                           .when(gr_value <= GR_SHALY_SAND_MAX, "shaly Sand")
                                           .otherwise("shale"))
)

# Depth-ordered window per (well, zone type), used by the next cell.
# This rebinds the same name that the per-log cell uses for its
# (well_id, log_name) window.
window_spec_min = Window.partitionBy("well_id", "zone_type").orderBy(col("DEPTH").asc())

# Sorting is applied only to the displayed view.
display(df_gr_zonation.orderBy("well_id", "DEPTH"))

In [0]:
# Shallowest sample of each zone type within each well.
# The window is defined locally, so this cell does not depend on which earlier
# cell last assigned the shared `window_spec_min` name.
zone_depth_window = Window.partitionBy("well_id", "zone_type").orderBy(col("DEPTH").asc())

# Note: this rebinds `df_min_depth`, which the per-log cell also assigns.
df_min_depth = (
    df_gr_zonation.withColumn("row_num", row_number().over(zone_depth_window))
            .where(col("row_num") == 1)
            # zone_type is included so each row says which zone its minimum
            # depth belongs to. DEPTH (same value as min_depth_valid) is kept
            # so the columns the frame already had are unchanged.
            .select("well_id", "zone_type", "DEPTH", col("DEPTH").alias("min_depth_valid"))
)
#display(df_gr_zonation.groupBy("zone_type").count())