In [0]:
# Build a binary purchase label: one row per user_id.
from pyspark.sql import functions as F

events = spark.table("workspace.ecommerce.events_delta")

# 1 on rows whose event_type is "purchase", 0 on every other row; taking the
# per-user maximum therefore yields 1 as soon as a user has one purchase row.
is_purchase = F.when(F.col("event_type") == "purchase", 1).otherwise(0)

label_df = events.groupBy("user_id").agg(F.max(is_purchase).alias("purchased"))

In [0]:
# Load the user feature table and render it for a quick visual inspection.
FEATURES_TABLE = "workspace.ecommerce.user_features_silver"
features_df = spark.table(FEATURES_TABLE)
display(features_df)


In [0]:
# Attach the label to the features. The inner join keeps only user_ids that
# appear in both features_df and label_df.
training_data = features_df.join(label_df, on="user_id", how="inner")


In [0]:
# Random 80/20 train/test split with a fixed seed.
# NOTE(review): training_data is a lazy join over an aggregation, so it may be
# recomputed for each split; confirm the two splits are disjoint and stable
# across actions (e.g. by splitting a materialized copy of the data).
split_weights = [0.8, 0.2]
train, test = training_data.randomSplit(weights=split_weights, seed=42)

In [0]:
# Validate the label distribution.
# Raw class counts over the full joined dataset.
training_data.groupBy("purchased").count().show()

# Class share within the full joined dataset.
# The previous version referenced bare `col` / `sum` (only `F` is imported in
# this notebook) and called `.over()` without a window spec, so it could not
# run. Dividing by a precomputed row count mirrors the test-split check below
# and needs no extra imports.
training_total = training_data.count()
training_data.groupBy("purchased") \
    .count() \
    .withColumn("Percentage", F.col("count") / training_total) \
    .show()

# Class share within the held-out test split.
test_total = test.count()
test.groupBy("purchased") \
    .count() \
    .withColumn("percentage", F.col("count") / test_total) \
    .show()

In [0]:
# Optional: persist the joined training dataset as a Delta table,
# replacing any existing contents of the target table.
(
    training_data.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("workspace.ecommerce.ml_training_gold")
)

---------Validations-----------------

In [0]:
# label_df has one row per user_id group from events, so these two numbers
# are meant to be compared side by side. A notebook cell only renders its last
# expression, so both values are printed explicitly instead of relying on the
# implicit output (which silently dropped the first count).
distinct_event_users = events.select("user_id").distinct().count()
label_rows = label_df.count()
print(f"distinct user_id values in events: {distinct_event_users}")
print(f"rows in label_df:                  {label_rows}")

In [0]:
# Users with at least one purchase event vs. users labelled purchased = 1.
# Both values are printed because a notebook cell only renders its last
# expression; previously the first count was computed and then discarded.
purchasing_users = (
    events.filter("event_type = 'purchase'")
    .select("user_id")
    .distinct()
    .count()
)
positive_labels = label_df.filter("purchased = 1").count()
print(f"distinct purchasing users in events: {purchasing_users}")
print(f"label_df rows with purchased = 1:    {positive_labels}")

In [0]:
# Row counts on both sides of the join and of its result. training_data is an
# inner join on user_id, so it only contains users present in both inputs.
# All three values are printed; as bare expressions only the last one was shown.
feature_rows = features_df.count()
label_rows = label_df.count()
joined_rows = training_data.count()
print(f"features_df rows:   {feature_rows}")
print(f"label_df rows:      {label_rows}")
print(f"training_data rows: {joined_rows}")

In [0]:
# Number of joined rows whose label is missing.
training_data.where(F.col("purchased").isNull()).count()

In [0]:
# Key cardinality on both sides of the join, label distribution, and a
# missing-label check. The scalar results are printed explicitly because a
# notebook cell only renders its last expression; the first two counts were
# previously computed and then discarded.
feature_users = features_df.select("user_id").distinct().count()
label_users = label_df.select("user_id").distinct().count()
print(f"distinct user_id values in features_df: {feature_users}")
print(f"distinct user_id values in label_df:    {label_users}")

label_df.groupBy("purchased").count().show()

null_labels = training_data.filter("purchased IS NULL").count()
print(f"training_data rows with NULL purchased: {null_labels}")

In [0]:
# Mean of two features within each label class.
# NOTE(review): if purchase_count is derived from the same purchase events
# that define `purchased`, it leaks the label into the features; confirm how
# the feature table builds it.
(
    training_data
    .groupBy("purchased")
    .avg("total_events", "purchase_count")
    .show()
)

In [0]:
# Schema of the joined dataset, followed by the size of each split.
training_data.printSchema()

# The split sizes are printed explicitly: as bare expressions only the last
# count was rendered and the train/test counts were computed and discarded.
train_rows = train.count()
test_rows = test.count()
total_rows = training_data.count()
print(f"train rows:         {train_rows}")
print(f"test rows:          {test_rows}")
print(f"train + test:       {train_rows + test_rows}")
print(f"training_data rows: {total_rows}")

In [0]:
# Print the first 10 joined rows without truncating wide column values.
training_data.show(n=10, truncate=False)

In [0]:
# Label counts per split, train first and then test.
for split_df in (train, test):
    split_df.groupBy("purchased").count().show()

In [0]:
# Show the Delta history of the saved training table.
gold_history = spark.sql("DESCRIBE HISTORY workspace.ecommerce.ml_training_gold")
gold_history.show()