In [2]:
import mlflow
import mlflow.spark
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml import Pipeline
from pyspark.sql import functions as F
from pyspark.ml.functions import vector_to_array

# =========================================================
# 1. SETUP & DATA LOADING
# =========================================================
mlflow.set_experiment("Retailer_Credit_Risk_Model")

print("=" * 60)
print("STARTING MODEL TRAINING (WITH FULL METRICS)")
print("=" * 60)

df = spark.table("gold_ml_features")

# Filter for active users only (Training Data)
train_data = df.filter(F.col("is_default").isNotNull())
full_dataset = df

print(f"Active Training Set: {train_data.count():,}")

# =========================================================
# 2. FEATURE ENGINEERING (No Leakage)
# =========================================================
feature_cols = [
    # SPENDING BEHAVIOR
    "total_order_value",
    "avg_order_value",
    "order_value_std",
    "total_orders",
    
    # RISK INDICATORS (Safe ones)
    "credit_utilization", 
    "account_age_days",    
    "shop_type_encoded"    
]

assembler = VectorAssembler(inputCols=feature_cols, outputCol="features", handleInvalid="keep")

rf = RandomForestClassifier(
    labelCol="is_default", 
    featuresCol="features", 
    numTrees=100, 
    maxDepth=10,
    seed=42
)

pipeline = Pipeline(stages=[assembler, rf])

# =========================================================
# 3. TRAINING & EVALUATION
# =========================================================
(train_set, test_set) = train_data.randomSplit([0.8, 0.2], seed=42)

print("\nStarting Training...")

with mlflow.start_run(run_name="RandomForest_Detailed_Metrics") as run:
    
    model = pipeline.fit(train_set)
    predictions = model.transform(test_set)
    
    # --- METRIC 1: AUC (Discrimination Power) ---
    evaluator_auc = BinaryClassificationEvaluator(labelCol="is_default", metricName="areaUnderROC")
    auc = evaluator_auc.evaluate(predictions)
    
    # --- METRIC 2: PRECISION, RECALL, F1, ACCURACY ---
    # We use MulticlassClassificationEvaluator for these specific metrics
    evaluator_multi = MulticlassClassificationEvaluator(labelCol="is_default")
    
    accuracy = evaluator_multi.evaluate(predictions, {evaluator_multi.metricName: "accuracy"})
    precision = evaluator_multi.evaluate(predictions, {evaluator_multi.metricName: "weightedPrecision"})
    recall = evaluator_multi.evaluate(predictions, {evaluator_multi.metricName: "weightedRecall"})
    f1 = evaluator_multi.evaluate(predictions, {evaluator_multi.metricName: "f1"})
    
    print("\n" + "-"*30)
    print("üèÜ MODEL PERFORMANCE REPORT")
    print("-"*30)
    print(f"AUC Score:  {auc:.2%}")
    print(f"Accuracy:   {accuracy:.2%}")
    print(f"Precision:  {precision:.2%} (Reliability of 'Defaulter' label)")
    print(f"Recall:     {recall:.2%}    (How many Defaulters we caught)")
    print(f"F1 Score:   {f1:.2%}        (Balance between Precision/Recall)")
    print("-"*30)
    
    # --- METRIC 3: CONFUSION MATRIX (Raw Counts) ---
    # 0=Good, 1=Bad
    print("\nConfusion Matrix (How many did we get right?):")
    predictions.groupBy("is_default", "prediction").count().show()

    # Log everything to MLflow
    mlflow.log_metric("AUC", auc)
    mlflow.log_metric("Accuracy", accuracy)
    mlflow.log_metric("Precision", precision)
    mlflow.log_metric("Recall", recall)
    mlflow.log_metric("F1", f1)
    
    mlflow.spark.log_model(model, "credit_risk_model")
    print("‚úì Metrics logged to MLflow.")

# =========================================================
# 4. SCORING & SEGMENTATION
# =========================================================
print("\nScoring Full Dataset...")

raw_predictions = model.transform(full_dataset)

final_scored = raw_predictions \
    .withColumn("risk_probability", 
                F.when(F.col("probability").isNotNull(),
                       vector_to_array("probability")[1])
                 .otherwise(None)) \
    .withColumn("predicted_risk_segment", 
    F.when((F.col("total_orders") == 0) | (F.col("total_orders").isNull()), "Unrated / New")
     .when(F.col("risk_probability") > 0.6, "High Risk")
     .when(F.col("risk_probability") > 0.3, "Medium Risk")
     .otherwise("Low Risk")
    ) \
    .select("retailer_id", "total_orders", "risk_probability", "predicted_risk_segment")

# =========================================================
# 5. SAVE
# =========================================================
final_scored.write.format("delta").mode("overwrite").saveAsTable("ML_Retailer_Scores_Final")

print("\nFinal Segment Breakdown:")
display(final_scored.groupBy("predicted_risk_segment").count().orderBy("count"))

StatementMeta(, f00030af-633d-404f-98fd-8fdff5a059e1, 4, Finished, Available, Finished)

STARTING MODEL TRAINING (WITH FULL METRICS)
Active Training Set: 4,667

Starting Training...

------------------------------
üèÜ MODEL PERFORMANCE REPORT
------------------------------
AUC Score:  82.50%
Accuracy:   97.86%
Precision:  95.77% (Reliability of 'Defaulter' label)
Recall:     97.86%    (How many Defaulters we caught)
F1 Score:   96.81%        (Balance between Precision/Recall)
------------------------------

Confusion Matrix (How many did we get right?):
+----------+----------+-----+
|is_default|prediction|count|
+----------+----------+-----+
|         1|       0.0|   19|
|         0|       0.0|  870|
+----------+----------+-----+

‚úì Metrics logged to MLflow.





Scoring Full Dataset...

Final Segment Breakdown:


SynapseWidget(Synapse.DataFrame, 5d4aff8f-fbc8-460a-a426-49cf76567cc1)

In [3]:
# üïµÔ∏è‚Äç‚ôÇÔ∏è DETECTIVE MODE
# Let's look at the "Unrated" people who somehow have money
ghost_check = final_scored.filter(
    (F.col("predicted_risk_segment") == "Unrated / New") & 
    (F.col("total_orders") > 0)  # Should be impossible based on your code!
)

print(f"Number of 'Impossible' rows: {ghost_check.count()}")

if ghost_check.count() > 0:
    print("‚ö†Ô∏è Found rows where logic failed! Sample:")
    ghost_check.show(5)
else:
    print("‚úÖ Logic is perfect. The DataFrame is clean.")
    print("If Dashboard is wrong, the table on disk wasn't updated.")

StatementMeta(, f00030af-633d-404f-98fd-8fdff5a059e1, 5, Finished, Available, Finished)

Number of 'Impossible' rows: 0
‚úÖ Logic is perfect. The DataFrame is clean.
If Dashboard is wrong, the table on disk wasn't updated.


In [3]:
scores = spark.table("ml_retailer_scores_final")

StatementMeta(, 77f7dc32-b60d-4f76-a5b5-a568c6eebb05, 5, Finished, Available, Finished)

In [4]:
display(scores)

StatementMeta(, 77f7dc32-b60d-4f76-a5b5-a568c6eebb05, 6, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 87496b12-be63-4cf8-a3fe-0c9293735aad)