In [None]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

# Base directory holding the project's datasets
# (looks like a Google Drive mount under /content/drive — confirm the drive is mounted first)
base_path = "/content/drive/MyDrive/Eixo_05/dados/"

# Load the previously saved TF-IDF featurization from Parquet
TFIDFfeaturizedData = spark.read.parquet(f"{base_path}TFIDFfeaturizedData")

# Tag the DataFrame object with a label so later output can report which
# featurization was used. This is a plain Python attribute on this object only.
TFIDFfeaturizedData.name = "TFIDFfeaturizedData"

# count() is an action: it forces the read and confirms the data is reachable
print("OK -> Dados carregados:", TFIDFfeaturizedData.count())


OK -> Dados carregados: 50000 50000 50000


In [10]:
# === MÉTRICAS (acurácia, erro, matriz de confusão) ===

from pyspark.ml.classification import LogisticRegression, LinearSVC
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Work with the featurization that the previous stage identified as the best one
ds = TFIDFfeaturizedData

# Fixed seed so every model is compared on the same 80/20 train/test scenario
train, test = ds.randomSplit(weights=[0.8, 0.2], seed=42)

# One evaluator per metric reported by the notebook (accuracy and F1)
eval_acc, eval_f1 = [
    MulticlassClassificationEvaluator(metricName=metric)
    for metric in ("accuracy", "f1")
]


In [11]:
def train_eval(name, est, train, test, features_name=None):
    """Fit an estimator, score it on the test split and print a short report.

    The report contains accuracy, error rate (1 - accuracy), F1-score and a
    confusion matrix built by grouping the predictions by
    ``(label, prediction)``.

    Relies on the notebook-level evaluators ``eval_acc`` and ``eval_f1``.

    Args:
        name: Display name of the model, used in the report header.
        est: Estimator exposing ``fit(train)``; the fitted model must expose
            ``transform(test)``.
        train: Dataset passed to ``est.fit``.
        test: Dataset passed to ``model.transform``.
        features_name: Label of the featurization shown in the header. When
            omitted, falls back to the ``name`` attribute of the notebook-level
            ``ds`` variable, or to ``"features"`` if that is unavailable.

    Returns:
        dict with keys ``"name"``, ``"accuracy"``, ``"error"`` and ``"f1"`` so
        that callers can compare models programmatically.
    """
    if features_name is None:
        # Resolve the label up front, and tolerate a missing `ds`, so that a
        # hidden-state problem cannot surface only after the expensive fit.
        features_name = getattr(globals().get("ds"), "name", "features")

    model = est.fit(train)

    # The predictions are scanned three times (two evaluators + confusion
    # matrix), so cache them once...
    preds = model.transform(test).cache()
    try:
        acc = eval_acc.evaluate(preds)
        f1 = eval_f1.evaluate(preds)
        err = 1.0 - acc

        print(f"\n=== {name} ({features_name}) ===")
        print(f"Acurácia : {acc*100:.2f}%")
        print(f"Taxa erro: {err*100:.2f}%")
        print(f"F1-score : {f1:.4f}")

        print("Matriz de confusão (label x prediction):")
        (preds.groupBy("label", "prediction")
              .count()
              .orderBy("label", "prediction")
              .show(truncate=False))
    finally:
        # ...and always release the cache, even if evaluation fails, so that
        # cached predictions do not pile up across models.
        preds.unpersist()

    return {"name": name, "accuracy": acc, "error": err, "f1": f1}

# Models and hyperparameters to compare
modelos = [
    (
        "LogisticRegression",
        LogisticRegression(
            featuresCol="features",
            labelCol="label",
            maxIter=30,
            regParam=0.01,
            elasticNetParam=0.0,
        ),
    ),
    (
        "LinearSVC",
        LinearSVC(
            featuresCol="features",
            labelCol="label",
            maxIter=50,
            regParam=0.01,
        ),
    ),
]

# Train and report each candidate on the same train/test split
for nome, est in modelos:
    train_eval(name=nome, est=est, train=train, test=test)



=== LogisticRegression (TFIDFfeaturizedData) ===
Acurácia : 89.16%
Taxa erro: 10.84%
F1-score : 0.8916
Matriz de confusão (label x prediction):
+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|0.0  |0.0       |4312 |
|0.0  |1.0       |583  |
|1.0  |0.0       |490  |
|1.0  |1.0       |4516 |
+-----+----------+-----+


=== LinearSVC (TFIDFfeaturizedData) ===
Acurácia : 90.27%
Taxa erro: 9.73%
F1-score : 0.9027
Matriz de confusão (label x prediction):
+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|0.0  |0.0       |4354 |
|0.0  |1.0       |541  |
|1.0  |0.0       |422  |
|1.0  |1.0       |4584 |
+-----+----------+-----+

