
# Churn Prediction Model

This notebook defines a function that takes an RFM DataFrame, trains a logistic regression model to predict churn, saves the fitted model, and evaluates it on a held-out test split.

In [0]:
# Import libraries

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.sql import DataFrame
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
import mlflow
import mlflow.spark

In [0]:
# Read the RFM table (stored in Delta format) that the churn model is trained on.
rfm_data = spark.read.load(
    "/Volumes/portfolio/cltv_schema/portfolio_crm/rfm_data",
    format="delta",
)

In [0]:
def lr_model(rfm_data: DataFrame, churn_recency: int = 90, model_path: str = "/Volumes/portfolio/cltv_schema/portfolio_crm/churn_model", seed: int = 42):
    """Train, persist and evaluate a logistic-regression churn classifier.

    A row is labelled as churned (``Churn = 1``) when its ``Recency`` is
    strictly greater than ``churn_recency``, otherwise ``Churn = 0``. The
    model is fitted on an 80% split, written to ``model_path`` and then
    scored on the remaining 20%; the metrics are printed.

    Args:
        rfm_data: DataFrame containing ``Recency``, ``Frequency`` and
            ``Monetary`` columns.
        churn_recency: Recency threshold above which a row counts as churned.
            Expressed in the same unit as the ``Recency`` column
            (presumably days -- confirm against the RFM pipeline).
        model_path: Destination of the fitted pipeline. Any model already
            stored there is overwritten.
        seed: Seed for the train/test split, so the split (and therefore the
            reported metrics) is repeatable for the same input data.

    Returns:
        The fitted pipeline model (VectorAssembler followed by
        LogisticRegression).
    """
    rfm = rfm_data.withColumn("Churn", (rfm_data["Recency"] > churn_recency).cast("integer"))

    # Recency is deliberately NOT used as a feature: the label is a pure
    # threshold on Recency, so including it leaks the target into the inputs
    # and produces a trivially (and misleadingly) near-perfect classifier.
    features = ["Frequency", "Monetary"]
    assembler = VectorAssembler(inputCols=features, outputCol="features")

    lr = LogisticRegression(featuresCol="features", labelCol="Churn")

    pipeline = Pipeline(stages=[assembler, lr])

    # A fixed seed keeps the split stable between runs; without it every
    # execution trains and evaluates on different rows.
    train, test = rfm.randomSplit([0.8, 0.2], seed=seed)
    model = pipeline.fit(train)

    model.write().overwrite().save(model_path)

    # Evaluate on the held-out split only.
    preds = model.transform(test)
    roc_auc = BinaryClassificationEvaluator(labelCol="Churn", metricName="areaUnderROC").evaluate(preds)
    pr_auc = BinaryClassificationEvaluator(labelCol="Churn", metricName="areaUnderPR").evaluate(preds)
    accuracy = MulticlassClassificationEvaluator(labelCol="Churn", metricName="accuracy").evaluate(preds)
    print(f"ROC AUC: {roc_auc}, PR AUC: {pr_auc}, Accuracy: {accuracy}")

    return model

In [0]:
# Train, save and evaluate the churn model with the default threshold and
# model path; the fitted pipeline is left as the cell's output.
lr_model(rfm_data=rfm_data)