In [0]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [0]:
# Read the `ml_training_gold` table; every later cell derives from gold_df.
gold_df = (
    spark.read
    .table("workspace.ecommerce.ml_training_gold")
)

In [0]:
# Derive the binary target column `purchased`:
#   1 when purchase_count > 0, otherwise 0.
# A NULL purchase_count does not satisfy the condition, so it falls through
# to the otherwise() branch and is labelled 0.
from pyspark.sql.functions import when

made_purchase = gold_df["purchase_count"] > 0
labeled_df = gold_df.withColumn("purchased", when(made_purchase, 1).otherwise(0))

In [0]:
# Replace NULLs in the two spend columns with 0.0 so they can be assembled
# into the feature vector.
# NOTE(review): presumably these are NULL for rows without any purchase --
# confirm against how the gold table is built.
labeled_df = labeled_df.fillna(
    dict.fromkeys(("total_spent", "avg_purchase_price"), 0.0)
)

In [0]:
# Chronological hold-out: rows whose first_event_time is before 2019-11-01
# go to `train`, rows on/after that date go to `test`.
# Rows with a NULL first_event_time satisfy neither predicate and are
# therefore absent from both splits.
train, test = (
    labeled_df.filter(condition)
    for condition in (
        "first_event_time < '2019-11-01'",
        "first_event_time >= '2019-11-01'",
    )
)

In [0]:
# VectorAssembler is already imported in the notebook's first (imports) cell,
# so it is not re-imported here.

# Input columns packed, in this order, into the `features` vector.
# NOTE(review): the label `purchased` is derived from purchase_count in the
# labelling cell. `total_spent` and `avg_purchase_price` look like
# purchase-derived aggregates (and `total_events` may count purchases too);
# if so they leak the label into the features and inflate AUC -- confirm
# against the gold table definition before trusting the scores below.
feature_columns = [
    "total_events",
    "cart_count",
    "view_count",
    "total_spent",
    "avg_purchase_price",
]

# handleInvalid="skip" makes transform() filter out rows that contain invalid
# values (e.g. NULL/NaN) in any input column instead of raising an error.
assembler = VectorAssembler(
    inputCols=feature_columns,
    outputCol="features",
    handleInvalid="skip",
)

In [0]:
# Assemble the feature vector for each split and keep only the two columns
# the estimators below consume: the vector and the label.
model_columns = ["features", "purchased"]

train_data = assembler.transform(train).select(*model_columns)
test_data = assembler.transform(test).select(*model_columns)

In [0]:
# --- Baseline logistic regression (all hyperparameters left at defaults) ---

# Scorer reused by the later cells. No metricName is passed, so the
# evaluator's default metric applies (areaUnderROC per the PySpark docs).
evaluator = BinaryClassificationEvaluator(labelCol="purchased")

lr_baseline = LogisticRegression(featuresCol="features", labelCol="purchased")
lr_model = lr_baseline.fit(train_data)            # fit on the training split
lr_predictions = lr_model.transform(test_data)    # score the hold-out split
lr_auc = evaluator.evaluate(lr_predictions)

print("Baseline Logistic Regression AUC:", lr_auc)

In [0]:
# Logistic regression with default hyperparameters, fit on train_data and
# scored on test_data.
# NOTE(review): this uses the same estimator configuration as `lr_baseline`
# in the previous cell and overwrites `evaluator`, `lr_model` and
# `lr_predictions` with equivalent objects -- the re-fit looks redundant.
evaluator = BinaryClassificationEvaluator(labelCol="purchased")

lr = LogisticRegression(featuresCol="features", labelCol="purchased")
lr_model = lr.fit(train_data)
lr_predictions = lr_model.transform(test_data)

print("Logistic AUC:", evaluator.evaluate(lr_predictions))

In [0]:
# --- Baseline random forest (default numTrees / maxDepth) ---
# Random forests bootstrap rows and subsample features, so training is
# stochastic. A fixed seed makes the reported AUC reproducible across kernel
# restarts (Restart & Run All yields the same number).
rf_baseline = RandomForestClassifier(
    labelCol="purchased",
    featuresCol="features",
    seed=42,
)

# Fit on the training split, score the hold-out split.
rf_model = rf_baseline.fit(train_data)

rf_predictions = rf_model.transform(test_data)

# `evaluator` is the BinaryClassificationEvaluator created in the LR cells.
rf_auc = evaluator.evaluate(rf_predictions)

print("Baseline Random Forest AUC:", rf_auc)

In [0]:
# Larger random forest: 100 trees, depth limit 10.
# A fixed seed keeps the result reproducible between runs, since the
# bootstrap / feature subsampling in a random forest is stochastic.
# Note: this cell overwrites `rf_model` and `rf_predictions` from the
# baseline random-forest cell.
rf = RandomForestClassifier(
    labelCol="purchased",
    featuresCol="features",
    numTrees=100,
    maxDepth=10,
    seed=42,
)

rf_model = rf.fit(train_data)
rf_predictions = rf_model.transform(test_data)

print("RF AUC:", evaluator.evaluate(rf_predictions))

In [0]:
# Manual random-forest grid: numTrees in {50, 100} x maxDepth in {5, 10}.
# Every configuration uses the same fixed seed so that AUC differences come
# from the hyperparameters rather than from run-to-run sampling noise.
# NOTE(review): each configuration is scored on test_data, so picking the
# best row from this output uses the test split for model selection; a
# validation split or cross-validation on train_data would avoid that.
for trees in [50, 100]:
    for depth in [5, 10]:

        rf = RandomForestClassifier(
            labelCol="purchased",
            featuresCol="features",
            numTrees=trees,
            maxDepth=depth,
            seed=42,
        )

        model = rf.fit(train_data)
        predictions = model.transform(test_data)

        auc = evaluator.evaluate(predictions)
        print(f"RF -> Trees: {trees}, Depth: {depth}, AUC: {auc}")

In [0]:
# Manual logistic-regression grid over regParam x elasticNetParam
# (3 x 3 = 9 fits). Each model is fit on train_data and its AUC is printed
# for test_data.
# NOTE(review): the scores are computed on test_data, so selecting the best
# combination from this output uses the test split for tuning.
lr_grid = [
    (reg_value, mix_value)
    for reg_value in (0.01, 0.1, 1.0)
    for mix_value in (0.0, 0.5, 1.0)
]

for reg, elastic in lr_grid:
    lr = LogisticRegression(
        featuresCol="features",
        labelCol="purchased",
        elasticNetParam=elastic,
        regParam=reg,
    )
    model = lr.fit(train_data)
    predictions = model.transform(test_data)
    auc = evaluator.evaluate(predictions)
    print(f"LR -> Reg: {reg}, Elastic: {elastic}, AUC: {auc}")