In [0]:
from pyspark.ml import PipelineModel
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql.functions import col
from pyspark.sql import functions as F
from pyspark.sql.types import DoubleType

# Load the persisted slicer pipeline and the prepared train/validation sets
slicer_model = PipelineModel.load("/FileStore/models/slicer_top10")
train_ready, val_ready = (
    spark.read.format("delta").load(path)
    for path in ("/FileStore/data/train_ready", "/FileStore/data/val_ready")
)

# Run both splits through the slicer pipeline
train_topk, val_topk = (
    slicer_model.transform(split) for split in (train_ready, val_ready)
)

# Light balancing: keep every label == 1 row and an ~80% sample (without
# replacement, fixed seed) of the remaining rows
is_minority = col("label") == 1
minority_df = train_topk.where(is_minority)
majority_df = train_topk.where(~is_minority)
train_balanced = majority_df.sample(
    withReplacement=False, fraction=0.8, seed=42
).union(minority_df)

# Aggregate the per-label row counts once. The result is tiny, so it is cached
# and reused for both the printed table and the weight computation instead of
# running the same groupBy over the whole training pipeline twice.
label_counts_df = train_balanced.groupBy("label").count().cache()

# Show the class distribution after balancing
print("\nDistribuição após balanceamento:")
label_counts_df.show()

# Compute the per-class weights: total rows divided by the rows of each label,
# so labels with fewer rows receive proportionally larger weights
label_counts = label_counts_df.collect()
label_counts_df.unpersist()  # the collected rows are all that is needed from here on
label_dict = {row["label"]: row["count"] for row in label_counts}
total = sum(label_dict.values())
class_weights = {label: total / count for label, count in label_dict.items()}

# UDF that maps a label value to its class weight
def get_weight(label):
    """Look up ``label`` in the module-level ``class_weights`` dict.

    Returns the stored weight converted to ``float``; raises ``KeyError`` if
    the label has no entry.
    """
    weight = class_weights[label]
    return float(weight)

weight_udf = F.udf(get_weight, returnType=DoubleType())

# Attach the per-row weight column that the estimator reads via weightCol
label_weight = weight_udf(col("label"))
train_weighted = train_balanced.withColumn("classWeightCol", label_weight)

# Logistic Regression (replaces the earlier RF model)
lr_params = {
    "labelCol": "label",
    "featuresCol": "features",
    "weightCol": "classWeightCol",  # very important: applies the class weights
    "maxIter": 100,
    "regParam": 0.01,
    "elasticNetParam": 0.0,  # 0.0 = pure L2 (ridge) penalty; can be tuned
}
lr = LogisticRegression(**lr_params)

# Fit on the weighted training data
model = lr.fit(train_weighted)

# Score the validation set
val_preds = model.transform(val_topk)

# Função para aplicar threshold
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType

def apply_threshold(df, threshold):
    """Add an ``adjusted_prediction`` column by thresholding ``probability[1]``.

    Args:
        df: DataFrame with a ``probability`` vector column; the element at
            index 1 is compared against ``threshold``.
        threshold: Cut-off value. Rows whose ``probability[1]`` is strictly
            greater than it get 1.0, all other rows get 0.0.

    Returns:
        ``df`` with an additional double column ``adjusted_prediction``.
    """
    # Imported locally so the function is self-contained. vector_to_array
    # (Spark >= 3.0) turns the ML vector into a native array column, so the
    # comparison runs as a built-in expression instead of a Python UDF that
    # is rebuilt on every call and evaluated row by row in Python.
    from pyspark.ml.functions import vector_to_array

    positive_prob = vector_to_array(col("probability"))[1]
    return df.withColumn(
        "adjusted_prediction",
        F.when(positive_prob > threshold, F.lit(1.0)).otherwise(F.lit(0.0)),
    )

def _prediction_label_rdd(df):
    """Return an RDD of ``(adjusted_prediction, label)`` float pairs from ``df``."""
    return df.select("adjusted_prediction", "label").rdd.map(
        lambda r: (float(r[0]), float(r[1]))
    )


# Only "probability" and "label" are needed for thresholding and metrics.
# Cache that projection so each threshold below is evaluated against the
# materialised scores instead of re-running the whole lineage behind
# val_preds once per threshold.
val_scored = val_preds.select("probability", "label").cache()

# Grid search for the threshold with the best class-1 F1
# NOTE(review): the threshold is selected and the final metrics are reported
# on the same validation data, so the reported numbers are optimistic.
best_f1 = 0
best_threshold = 0.5

print("\n🔍 Threshold Search (para F1 classe 1):")
for t in [x / 100.0 for x in range(5, 95, 5)]:  # 0.05, 0.10, ..., 0.90
    adjusted_df = apply_threshold(val_scored, t)
    rdd = _prediction_label_rdd(adjusted_df)
    metrics = MulticlassMetrics(rdd)
    f1_class1 = metrics.fMeasure(1.0)
    print(f"Threshold = {t:.2f} | F1 (classe 1): {f1_class1:.4f}")
    # Strict ">" keeps the lowest threshold when several tie on F1
    if f1_class1 > best_f1:
        best_f1 = f1_class1
        best_threshold = t

print(f"\n✅ Melhor Threshold Encontrado: {best_threshold:.2f} com F1 da classe 1 = {best_f1:.4f}")

# Apply the best threshold to the full validation predictions (lazy; kept so
# the complete DataFrame stays available under its original name)
val_preds_adjusted = apply_threshold(val_preds, best_threshold)

# Final evaluation, computed from the cached scores
final_rdd = _prediction_label_rdd(apply_threshold(val_scored, best_threshold))
metrics = MulticlassMetrics(final_rdd)

print("\nConfusion Matrix (com melhor threshold):")
print(metrics.confusionMatrix().toArray())

print("\n🎯 Métricas finais com melhor threshold:")
print(f"Precision classe 1: {metrics.precision(1.0):.4f}")
print(f"Recall classe 1:    {metrics.recall(1.0):.4f}")
print(f"F1 classe 1:        {metrics.fMeasure(1.0):.4f}")

# Release the cache only after every metric has been read
val_scored.unpersist()

# Persist the fitted model, overwriting any copy already at this path
model_writer = model.write().overwrite()
model_writer.save("/FileStore/models/lr_top10_weighted_model")


Distribuição após balanceamento:
+-----+-------+
|label|  count|
+-----+-------+
|  0.0|2699450|
|  1.0| 125441|
+-----+-------+


🔍 Threshold Search (para F1 classe 1):




Threshold = 0.05 | F1 (classe 1): 0.0696




Threshold = 0.10 | F1 (classe 1): 0.0696
Threshold = 0.15 | F1 (classe 1): 0.0696
Threshold = 0.20 | F1 (classe 1): 0.0696
Threshold = 0.25 | F1 (classe 1): 0.0696
Threshold = 0.30 | F1 (classe 1): 0.0696
Threshold = 0.35 | F1 (classe 1): 0.0696
Threshold = 0.40 | F1 (classe 1): 0.0696
Threshold = 0.45 | F1 (classe 1): 0.0702
Threshold = 0.50 | F1 (classe 1): 0.0727
Threshold = 0.55 | F1 (classe 1): 0.0645
Threshold = 0.60 | F1 (classe 1): 0.0282
Threshold = 0.65 | F1 (classe 1): 0.0095
Threshold = 0.70 | F1 (classe 1): 0.0025
Threshold = 0.75 | F1 (classe 1): 0.0004
Threshold = 0.80 | F1 (classe 1): 0.0001
Threshold = 0.85 | F1 (classe 1): 0.0000
Threshold = 0.90 | F1 (classe 1): 0.0000

✅ Melhor Threshold Encontrado: 0.50 com F1 da classe 1 = 0.0727

Confusion Matrix (com melhor threshold):
[[371688. 342233.]
 [ 12772.  13926.]]

🎯 Métricas finais com melhor threshold:
Precision classe 1: 0.0391
Recall classe 1:    0.5216
F1 classe 1:        0.0727
