In [0]:
# Load the customer feature table that the rest of the notebook trains on,
# then render it for a quick visual check.
customer_features_table = "ecommerce_ai.silver_customer_features"
df = spark.table(customer_features_table)
df.display()


In [0]:
# Column holding the target value passed to the classifier and the evaluator.
label_col = "churn_label"

# Input columns that the VectorAssembler packs into the "features" vector.
feature_cols = [
    "recency_days", "purchase_frequency",
    "total_spend", "avg_order_value",
]


In [0]:
from pyspark.ml.feature import VectorAssembler

# Combine the individual feature columns into the single "features" vector
# column that the downstream estimator is configured to read.
assembler = VectorAssembler(outputCol="features", inputCols=feature_cols)

# Keep only what modelling needs: the assembled vector and the label.
assembled_df = assembler.transform(df)
ml_df = assembled_df.select("features", label_col)
ml_df.display()


In [0]:
# 80/20 train/test split; the seed is fixed so the split is requested the same way on every run.
split_weights = [0.8, 0.2]
train_df, test_df = ml_df.randomSplit(split_weights, seed=42)


In [0]:
from pyspark.ml.classification import LogisticRegression

# Baseline classifier: only the input/label columns are set, every other
# hyperparameter is left at the library default.
lr_settings = {"featuresCol": "features", "labelCol": label_col}
lr = LogisticRegression(**lr_settings)

# Fit on the training split only; the test split is reserved for scoring.
lr_model = lr.fit(train_df)


In [0]:
# Score the held-out test split with the fitted model.
predictions = lr_model.transform(test_df)

# Show the true label next to the predicted class and class probabilities.
# The label column is referenced through `label_col` (not a repeated string
# literal) so this cell stays consistent with the training and evaluation
# cells if the label column is ever renamed.
predictions.select(
    label_col, "prediction", "probability"
).display()


In [0]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Evaluator configured for area under the ROC curve against the label column.
evaluator = BinaryClassificationEvaluator(metricName="areaUnderROC", labelCol=label_col)

# Score the test-split predictions; the bare name on the last line makes the
# notebook render the metric value as the cell output.
roc_auc = evaluator.evaluate(predictions)
roc_auc


In [0]:
import mlflow
import mlflow.spark

# Select the MLflow experiment that subsequent runs in this notebook log to.
experiment_path = "/IDC_Codebasics_Hackathon/Customer_Churn_Prediction"
mlflow.set_experiment(experiment_path)


In [0]:
%sql
-- Provision the catalog, schema and volume used as MLflow scratch storage.
-- Every statement uses IF NOT EXISTS, so re-running this cell is harmless.
-- Order matters: the schema lives in the catalog, the volume lives in the schema.
CREATE CATALOG IF NOT EXISTS ecommerce_ai;
CREATE SCHEMA IF NOT EXISTS ecommerce_ai.ml;

-- Volume whose /Volumes/ecommerce_ai/ml/mlflow_volume path is used as MLFLOW_DFS_TMP.
CREATE VOLUME IF NOT EXISTS ecommerce_ai.ml.mlflow_volume;


In [0]:
import os

# Point MLflow's DFS temp location at the volume path so Spark model logging
# has a writable staging directory. Must be set before the model is logged.
mlflow_tmp_dir = "/Volumes/ecommerce_ai/ml/mlflow_volume"
os.environ["MLFLOW_DFS_TMP"] = mlflow_tmp_dir


In [0]:
import mlflow
import mlflow.spark

# Make sure the run below lands in the churn-prediction experiment.
mlflow.set_experiment("/IDC_Codebasics_Hackathon/Customer_Churn_Prediction")

# Descriptive parameters recorded with the baseline run.
baseline_params = {
    "model_type": "LogisticRegression",
    "features": feature_cols,
}

# The context manager closes the run when the block exits, even on error.
with mlflow.start_run(run_name="Logistic_Regression_Baseline"):
    for param_name, param_value in baseline_params.items():
        mlflow.log_param(param_name, param_value)
    mlflow.log_metric("roc_auc", roc_auc)

    # Store the fitted Spark ML model as an artifact of this run.
    mlflow.spark.log_model(lr_model, artifact_path="churn_model")


In [0]:
from mlflow.models.signature import infer_signature

# Signature inference only needs the column schema, so work from a small
# Spark-side sample instead of collecting the entire training set (twice)
# onto the driver with toPandas().
signature_sample_df = train_df.limit(100)

input_df = signature_sample_df.select("features").toPandas()
# NOTE(review): the output schema is inferred from the "probability" column,
# as in the original cell — confirm this matches what the served model returns.
output_df = lr_model.transform(signature_sample_df).select("probability").toPandas()

signature = infer_signature(input_df, output_df)

# Log inside an explicit run. Calling log_model with no active run makes
# MLflow open an anonymous run implicitly and leave it active, so later
# logging calls in the notebook would silently attach to it.
with mlflow.start_run(run_name="Logistic_Regression_Baseline_With_Signature"):
    mlflow.spark.log_model(
        lr_model,
        artifact_path="churn_model",
        signature=signature
    )
