In [0]:
import mlflow
import os
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Temp location used while staging Spark models; set through the environment
# so it is in place before any model is logged. Presumably read by
# mlflow.spark when no explicit dfs_tmpdir is passed -- confirm against the
# installed MLflow version.
os.environ["MLFLOW_DFS_TMP"] = "/Volumes/workspace/default/my_volume/"

# Send every run started in this notebook to one workspace experiment.
mlflow.set_experiment(experiment_name="/Workspace/ecommerce_purchase_prediction")

In [0]:
# Ad-hoc re-log of whichever fitted `model` is currently in the kernel.
# `model` is only assigned by the training cells further down, so on a fresh
# kernel (Restart & Run All) there is nothing to log yet: skip instead of
# failing with NameError.
if "model" in globals():
    # Logging outside a `with mlflow.start_run()` block implicitly opens a run
    # that stays active. Remember whether that is about to happen so the run
    # can be closed again and does not collide with the named runs below.
    implicit_run = mlflow.active_run() is None
    try:
        mlflow.spark.log_model(
            model,
            "model",
            dfs_tmpdir="/Volumes/workspace/default/my_volume/"
        )
    except Exception:
        # Close the run this cell opened, marked as failed, then re-raise.
        if implicit_run:
            mlflow.end_run(status="FAILED")
        raise
    else:
        # Leave a run that was already active untouched; only close our own.
        if implicit_run:
            mlflow.end_run()
else:
    print("No fitted `model` in scope yet; run a training cell first.")

In [0]:
from pyspark.ml.feature import VectorAssembler

# Read the training table that feeds both models.
gold_df = spark.read.table("workspace.ecommerce.ml_training_gold")

# Combine the five input columns into a single `features` vector column.
# handleInvalid="skip" asks the assembler to drop rows it cannot assemble
# rather than raise.
assembler = VectorAssembler(
    handleInvalid="skip",
    outputCol="features",
    inputCols=[
        "total_events",
        "cart_count",
        "view_count",
        "total_spent",
        "avg_purchase_price",
    ],
)
assembled_df = assembler.transform(gold_df)

# Keep only the feature vector and the `purchased` label column.
final_df = assembled_df.select(["features", "purchased"])

# 80/20 train/test split with a fixed seed.
train_data, test_data = final_df.randomSplit(weights=[0.8, 0.2], seed=42)

In [0]:
# Print the column names and types of the assembled frame, i.e. the table's
# columns plus the `features` column added by the assembler.
# To inspect the raw table schema instead, run:
#   spark.sql("SELECT * FROM workspace.ecommerce.ml_training_gold").printSchema()
assembled_df.printSchema()
   

In [0]:
# Evaluator that scores predictions against the `purchased` label using the
# evaluator's default metric; the random-forest cell reuses it.
evaluator = BinaryClassificationEvaluator(labelCol="purchased")

with mlflow.start_run(run_name="Logistic_Baseline"):
    # Baseline model: logistic regression with no hyperparameters overridden.
    lr = LogisticRegression(featuresCol="features", labelCol="purchased")
    model = lr.fit(train_data)

    # Score the held-out split.
    predictions = model.transform(test_data)
    auc = evaluator.evaluate(predictions)

    # Record the configuration (values read back from the estimator) and the
    # resulting metric on the active run.
    for param_key, param_value in (
        ("model_type", "LogisticRegression"),
        ("regParam", lr.getRegParam()),
        ("elasticNetParam", lr.getElasticNetParam()),
    ):
        mlflow.log_param(param_key, param_value)
    mlflow.log_metric("AUC", auc)

    # Store the fitted model as a run artifact, staged via the volume path.
    mlflow.spark.log_model(
        model, "model", dfs_tmpdir="/Volumes/workspace/default/my_volume/"
    )

    print("Logged Logistic Baseline AUC:", auc)
   

In [0]:
from pyspark.ml.classification import RandomForestClassifier

with mlflow.start_run(run_name="RF_Tuned"):

    # Random forest with hand-picked size and depth. The seed is set
    # explicitly (same value as the train/test split) instead of relying on
    # the library default, so the run's sampling is pinned by the notebook.
    rf = RandomForestClassifier(
        labelCol="purchased",
        featuresCol="features",
        numTrees=100,
        maxDepth=10,
        seed=42
    )

    model = rf.fit(train_data)
    predictions = model.transform(test_data)

    # `evaluator` is the BinaryClassificationEvaluator created in the
    # logistic-regression cell.
    auc = evaluator.evaluate(predictions)

    # Log parameters: read them back from the estimator (as the logistic cell
    # does) so the logged values cannot drift from the constructor arguments.
    mlflow.log_param("model_type", "RandomForest")
    mlflow.log_param("numTrees", rf.getNumTrees())
    mlflow.log_param("maxDepth", rf.getMaxDepth())
    mlflow.log_param("seed", rf.getSeed())

    # Log metric
    mlflow.log_metric("AUC", auc)

    # Log model: pass the staging directory explicitly, consistent with the
    # other log_model calls in this notebook.
    mlflow.spark.log_model(
        model,
        "model",
        dfs_tmpdir="/Volumes/workspace/default/my_volume/"
    )

    print("Logged RF Tuned AUC:", auc)

In [0]:
# Explicitly end whatever MLflow run is still active. The training cells open
# their runs with `with mlflow.start_run(...)`, so this is presumably a safety
# net for a run left open by a log call made outside such a block.
# NOTE(review): confirm this cell is still needed.
mlflow.end_run()