In [0]:
import mlflow

# MLflow run artifact that holds the trained Spark ML model.
model_uri = "runs:/74fd4c9a1e564cb7b735d6c2a28ce597/model"

# Volume path handed to MLflow as its staging directory while loading the model.
model_tmpdir = "/Volumes/workspace/default/my_volume/"

model = mlflow.spark.load_model(model_uri, dfs_tmpdir=model_tmpdir)

In [0]:

# Gold-layer table read as the input for scoring.
gold_table_name = "workspace.ecommerce.ml_training_gold"
gold_df = spark.read.table(gold_table_name)

In [0]:
from pyspark.sql.functions import when

# Binary "purchased" flag: 1 when purchase_count > 0, otherwise 0.
# A null purchase_count makes the condition null, so it also falls through to 0.
has_purchase = gold_df["purchase_count"] > 0
labeled_df = gold_df.withColumn("purchased", when(has_purchase, 1).otherwise(0))

In [0]:
from pyspark.ml.feature import VectorAssembler

# Columns packed into the "features" vector, in this order.
# NOTE(review): presumably this must match the columns/order used at training time — confirm.
feature_columns = [
    "total_events",
    "cart_count",
    "view_count",
    "total_spent",
    "avg_purchase_price",
]

# handleInvalid="skip" drops any row with a null/NaN in one of the feature columns,
# so those rows receive no prediction downstream.
assembler = VectorAssembler(
    inputCols=feature_columns,
    outputCol="features",
    handleInvalid="skip",
)

inference_df = assembler.transform(labeled_df)

In [0]:
# Score the assembled rows with the loaded model; later cells read the
# "probability" column from this result.
predictions = model.transform(inference_df)

In [0]:
from pyspark.ml.functions import vector_to_array
from pyspark.sql.functions import col

# "probability" is an ML vector column, and Column.getItem() cannot index into a
# vector directly (Spark raises an analysis error). Converting it to array<double>
# with the built-in vector_to_array makes element access work without a Python UDF.
# Index 1 is the positive (purchase) class.
scored_df = predictions.select(
    "user_id",
    vector_to_array(col("probability")).getItem(1).alias("purchase_probability"),
)

In [0]:
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType

def extract_probability(v):
    """Return the positive-class probability from a probability vector.

    Args:
        v: An indexable probability vector (index 1 is the positive class),
            or None when the source cell is null.

    Returns:
        The value at index 1 as a Python float, or None when ``v`` is None.
    """
    # Spark passes None to a Python UDF for null cells; return null instead of
    # raising TypeError and failing the whole job.
    if v is None:
        return None
    return float(v[1])  # index 1 is positive class

# Wrap the Python extractor as a Spark UDF that yields a double column.
prob_udf = udf(extract_probability, returnType=DoubleType())

# Keep only the user key and the extracted positive-class probability.
purchase_probability = prob_udf(col("probability")).alias("purchase_probability")
scored_df = predictions.select("user_id", purchase_probability)

In [0]:
# Persist the scores as a Delta table; "overwrite" mode replaces any existing contents.
predictions_writer = scored_df.write.format("delta").mode("overwrite")
predictions_writer.saveAsTable("workspace.ecommerce.predictions_gold")

In [0]:
%sql
-- Preview up to 10 rows of the scored output table.
SELECT * FROM workspace.ecommerce.predictions_gold LIMIT 10;