In [0]:
from pyspark.ml import PipelineModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.ml.classification import (
    LogisticRegressionModel, 
    RandomForestClassificationModel, 
    DecisionTreeClassificationModel, 
    GBTClassificationModel
)
from pyspark.sql.functions import col

# Restore the persisted slicing pipeline, read the prepared test set from its
# Delta location, and pass the test rows through the slicer so they carry the
# columns the saved classifiers are scored on further down.
slicer_model = PipelineModel.load("/FileStore/models/slicer_top10")
test_ready = spark.read.load("/FileStore/data/test_ready", format="delta")
test_topk = slicer_model.transform(test_ready)

# Shared evaluator: compares the "prediction" column against the "label"
# column and reports Spark's "f1" metric (the weighted F-measure over classes).
evaluator = (
    MulticlassClassificationEvaluator()
    .setLabelCol("label")
    .setPredictionCol("prediction")
    .setMetricName("f1")
)

# One table describing every saved classifier: display name, storage path,
# and the model class whose `load` can deserialize that path. Both lookup
# dicts below are derived from it, so their keys always match.
_saved_models = [
    (
        "Logistic Regression",
        "/FileStore/models/lr_top10_model_grid",
        LogisticRegressionModel,
    ),
    (
        "Random Forest",
        "/FileStore/models/rf_top10_model_grid",
        RandomForestClassificationModel,
    ),
    (
        "Decision Tree",
        "/FileStore/models/dt_top10_model_grid",
        DecisionTreeClassificationModel,
    ),
    (
        "GBT",
        "/FileStore/models/gbt_top10_weighted_model",
        GBTClassificationModel,
    ),
]

# Display name -> storage path
model_paths = {label: location for label, location, _ in _saved_models}

# Display name -> model class used to load that path
model_classes = {label: loader for label, _, loader in _saved_models}

# Collects one (model name, score) entry per evaluated model
f1_scores = []

# Evaluate each saved model on the sliced test set.
#
# For every entry in `model_paths` this loads the model with the matching
# class from `model_classes`, scores `test_topk`, prints the overall weighted
# F1 plus F1 / precision / recall for label 1.0 (treated here as the minority
# class), and appends (model name, label-1.0 F1) to `f1_scores`.
for model_name, path in model_paths.items():
    print(f"\n🔍 Evaluating {model_name}...")

    model_class = model_classes[model_name]
    model = model_class.load(path)

    preds = model.transform(test_topk)

    # The predictions are consumed twice below (once by the evaluator, once by
    # MulticlassMetrics). Without caching, each consumer re-runs the whole
    # scoring plan, so keep just the two columns both of them need and cache
    # that narrow frame for the duration of this iteration.
    scored = preds.select("prediction", "label").cache()
    try:
        # Overall weighted F1
        f1 = evaluator.evaluate(scored)

        # Minority class F1, Precision, Recall.
        # MulticlassMetrics expects an RDD of (prediction, label) pairs of
        # floats, in that order.
        preds_rdd = scored.rdd.map(lambda r: (float(r[0]), float(r[1])))
        metrics = MulticlassMetrics(preds_rdd)

        # Read every metric while the cached frame is still available.
        f1_minority = metrics.fMeasure(1.0)
        precision_minority = metrics.precision(1.0)
        recall_minority = metrics.recall(1.0)
    finally:
        # Release the cached predictions even if evaluation failed, so the
        # cached frames do not pile up across models.
        scored.unpersist()

    print(f"{model_name} - Overall F1: {f1:.4f}")
    print(f"{model_name} - Minority class -> F1: {f1_minority:.4f}, Precision: {precision_minority:.4f}, Recall: {recall_minority:.4f}")

    # The final ranking is based on the minority-class F1, not the overall one.
    f1_scores.append((model_name, f1_minority))

# Rank the collected (model name, score) entries best-first, keeping the same
# list object, then print one aligned line per model. The sort is stable, so
# models with equal scores stay in the order they were evaluated.
f1_scores[:] = sorted(f1_scores, key=lambda entry: entry[1], reverse=True)

print("\n📊 Final model comparison (sorted by F1-score):")
for name, score in f1_scores:
    print(f"{name:20} ➤  F1-score: {score:.4f}")



🔍 Evaluating Logistic Regression...
Logistic Regression - Overall F1: 0.9462
Logistic Regression - Minority class -> F1: 0.0000, Precision: 0.0000, Recall: 0.0000

🔍 Evaluating Random Forest...
Random Forest - Overall F1: 0.9462
Random Forest - Minority class -> F1: 0.0000, Precision: 0.0000, Recall: 0.0000

🔍 Evaluating Decision Tree...
Decision Tree - Overall F1: 0.9462
Decision Tree - Minority class -> F1: 0.0000, Precision: 0.0000, Recall: 0.0000

🔍 Evaluating GBT...
GBT - Overall F1: 0.3050
GBT - Minority class -> F1: 0.0842, Precision: 0.0439, Recall: 0.9997

📊 Final model comparison (sorted by F1-score):
GBT                  ➤  F1-score: 0.0842
Logistic Regression  ➤  F1-score: 0.0000
Random Forest        ➤  F1-score: 0.0000
Decision Tree        ➤  F1-score: 0.0000
