In [0]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [0]:
# Read the ML-ready Gold-layer table into a DataFrame
training_data = spark.read.table("workspace.ecommerce.ml_training_gold")

In [0]:
# Replace nulls in the two spend columns with 0.0 before modelling
spend_null_defaults = {
    "total_spent": 0.0,
    "avg_purchase_price": 0.0,
}
training_data = training_data.na.fill(spend_null_defaults)

In [0]:
# Pack the four numeric input columns into a single "features" vector.
# NOTE(review): "purchase_count" is an input while "purchased" is the label;
# presumably the two are closely related — confirm this is not label leakage.
baseline_inputs = [
    "total_events",
    "purchase_count",
    "total_spent",
    "avg_purchase_price",
]
assembler = VectorAssembler(
    inputCols=baseline_inputs,
    outputCol="features",
    handleInvalid="skip",  # rows with invalid (null/NaN) inputs are dropped
)

# Keep only the assembled vector and the label column
assembled = assembler.transform(training_data)
data = assembled.select("features", "purchased")

In [0]:
# 80/20 random train/test split with a fixed seed
train, test = data.randomSplit(weights=[0.8, 0.2], seed=42)

In [0]:
# Logistic-regression baseline: fit on the training split, score the test split
lr = LogisticRegression().setFeaturesCol("features").setLabelCol("purchased")
lr_model = lr.fit(train)
lr_predictions = lr_model.transform(test)

In [0]:
# RandomForest
# Train a 100-tree forest (max depth 10) on the training split and score the
# test split. The seed is pinned explicitly, matching the seed used for the
# train/test split, so repeated runs build the same forest instead of relying
# on the library's default seed.
rf = RandomForestClassifier(
    labelCol="purchased",
    featuresCol="features",
    numTrees=100,
    maxDepth=10,
    seed=42,
)
rf_model = rf.fit(train)
rf_predictions = rf_model.transform(test)

In [0]:
# Score both models on the test split using area under the ROC curve
evaluator = BinaryClassificationEvaluator(
    labelCol="purchased",
    metricName="areaUnderROC",
)

for caption, scored in (
    ("Logistic AUC:", lr_predictions),
    ("RandomForest AUC:", rf_predictions),
):
    print(caption, evaluator.evaluate(scored))

In [0]:
# Show the mean purchase_count for each value of the "purchased" label
purchase_count_by_label = training_data.groupBy("purchased").agg({"purchase_count": "avg"})
purchase_count_by_label.show()

In [0]:
from pyspark.ml.feature import VectorAssembler

# Assembler over three input columns ("purchase_count" is not included here)
reduced_inputs = ["total_events", "total_spent", "avg_purchase_price"]
assembler = VectorAssembler(
    inputCols=reduced_inputs,
    outputCol="features",
    handleInvalid="skip",  # rows with invalid (null/NaN) inputs are dropped
)

In [0]:
# Re-assemble the features with the current assembler, keep vector + label,
# then redo the seeded 80/20 split
reassembled = assembler.transform(training_data)
data = reassembled.select("features", "purchased")
train, test = data.randomSplit(weights=[0.8, 0.2], seed=42)

In [0]:
# Refit the logistic-regression model on the current split
lr = LogisticRegression().setLabelCol("purchased").setFeaturesCol("features")
lr_model = lr.fit(train)
lr_predictions = lr_model.transform(test)

# Report area under the ROC curve on the test split
evaluator = BinaryClassificationEvaluator(
    labelCol="purchased",
    metricName="areaUnderROC",
)
lr_auc = evaluator.evaluate(lr_predictions)
print("Logistic AUC:", lr_auc)

In [0]:
# Train a 100-tree forest (max depth 10) on the current training split.
# The seed is pinned explicitly, matching the seed used for the train/test
# split, so repeated runs build the same forest instead of relying on the
# library's default seed.
rf = RandomForestClassifier(
    labelCol="purchased",
    featuresCol="features",
    numTrees=100,
    maxDepth=10,
    seed=42,
)

rf_model = rf.fit(train)
rf_predictions = rf_model.transform(test)

# Uses the evaluator object created in an earlier cell
print("RF AUC:", evaluator.evaluate(rf_predictions))

In [0]:
# Repeats the RandomForest train/score/evaluate step on the current split.
# The seed is pinned explicitly, matching the seed used for the train/test
# split, so repeated runs build the same forest instead of relying on the
# library's default seed.
rf = RandomForestClassifier(
    labelCol="purchased",
    featuresCol="features",
    numTrees=100,
    maxDepth=10,
    seed=42,
)

rf_model = rf.fit(train)
rf_predictions = rf_model.transform(test)

# Uses the evaluator object created in an earlier cell
print("RF AUC:", evaluator.evaluate(rf_predictions))

In [0]:
# Time-based split on last_event_time: rows before 2019-11-01 go to train,
# rows on or after that date go to test.
# These frames are filtered straight from training_data, i.e. they are not
# passed through the assembler in this cell.
train = training_data.where("last_event_time < '2019-11-01'")
test = training_data.where("last_event_time >= '2019-11-01'")

In [0]:
%sql
-- Create the volume "my_volume" only if it does not already exist.
-- NOTE(review): the name is unqualified, so it presumably resolves against
-- the session's current catalog and schema — confirm.
CREATE VOLUME IF NOT EXISTS my_volume;

In [0]:
# List the volumes visible to the session
volumes_listing = spark.sql("SHOW VOLUMES ")
volumes_listing.show()
# Volume path: /Volumes/workspace/default/my_volume

In [0]:
# Disabled: .show() returns None, so this assignment would leave gold_df as None.
#gold_df = spark.read.table("workspace.ecommerce.ml_training_gold").show()

In [0]:
# Print the column names and types of the Gold training table
spark.table("workspace.ecommerce.ml_training_gold").printSchema()

In [0]:
# Reload the Gold table, then zero-fill nulls in the two spend columns
gold_raw = spark.read.table("workspace.ecommerce.ml_training_gold")
gold_df = gold_raw.na.fill(
    {
        "total_spent": 0.0,
        "avg_purchase_price": 0.0,
    }
)

In [0]:
from pyspark.ml.feature import VectorAssembler

# Assembler over five input columns, including cart_count and view_count
behavioural_inputs = [
    "total_events",
    "cart_count",
    "view_count",
    "total_spent",
    "avg_purchase_price",
]
assembler = VectorAssembler(
    inputCols=behavioural_inputs,
    outputCol="features",
    handleInvalid="skip",  # rows with invalid (null/NaN) inputs are dropped
)

In [0]:
# Add the "features" vector column, then keep only the vector and the label
model_data = assembler.transform(gold_df)
kept_columns = ["features", "purchased"]
final_data = model_data.select(*kept_columns)

In [0]:
# 80/20 random train/test split with a fixed seed
train, test = final_data.randomSplit(weights=[0.8, 0.2], seed=42)

In [0]:
from pyspark.ml.classification import RandomForestClassifier

# Train a 100-tree forest (max depth 10) on the training split.
# The seed is pinned explicitly, matching the seed used for the train/test
# split, so repeated runs build the same forest instead of relying on the
# library's default seed.
rf = RandomForestClassifier(
    labelCol="purchased",
    featuresCol="features",
    numTrees=100,
    maxDepth=10,
    seed=42,
)

rf_model = rf.fit(train)

In [0]:
# Apply the fitted random-forest model to the test split
predictions = rf_model.transform(test)

In [0]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Report area under the ROC curve for the random-forest predictions
evaluator = BinaryClassificationEvaluator(
    labelCol="purchased",
    metricName="areaUnderROC",
)

test_auc = evaluator.evaluate(predictions)
print("AUC:", test_auc)