In [1]:
# Ambiente (Java + PySpark + SparkSession)

# Fecha Spark anterior (se houver)
try:
    spark.stop()
except:
    pass

# Java + PySpark estáveis para Python 3.12
!apt-get update -qq
!apt-get install -y openjdk-17-jdk-headless -qq
!pip -q install -U pyspark==3.5.1

import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-17-openjdk-amd64"
os.environ["PATH"]  = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]

from pyspark.sql import SparkSession
spark = (SparkSession.builder
         .appName("eixo05-preprocess")
         .getOrCreate())
print("Spark OK ->", spark.version)


W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Spark OK -> 3.5.1


In [2]:
# Mount Drive + make sure a live Spark session exists (nothing is reinstalled)

from google.colab import drive
drive.mount('/content/drive', force_remount=False)

from pyspark.sql import SparkSession
# Checking only that the name `spark` exists is not enough: the variable may
# still point to a session that was already stopped. getOrCreate() returns the
# running session when there is one and builds a new one otherwise.
spark = SparkSession.builder.getOrCreate()

# Root folder (on the mounted Drive) that holds the featurized parquet datasets.
base_path = "/content/drive/MyDrive/Eixo_05/dados/"


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Load the featurized datasets

# Each parquet folder under base_path is named after the variable that holds it;
# the same string is stored on the DataFrame as a `name` tag.
_feature_frames = []
for _feat_name in ("HTFfeaturizedData", "TFIDFfeaturizedData", "W2VfeaturizedData"):
    _frame = spark.read.parquet(base_path + _feat_name)
    _frame.name = _feat_name
    _feature_frames.append(_frame)

HTFfeaturizedData, TFIDFfeaturizedData, W2VfeaturizedData = _feature_frames

# Row count of each dataset, printed in the same order as they were loaded.
_row_counts = [_frame.count() for _frame in _feature_frames]
print("Contagens:", *_row_counts)


Contagens: 50000 50000 50000


In [4]:
# Helpers + definindo modelos (LR e SVM)

from pyspark.ml.classification import LogisticRegression, LinearSVC, OneVsRest
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Shared accuracy evaluator; the column names are the library defaults, spelled
# out here so the expected schema of the scored DataFrame is visible.
evaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction",
    labelCol="label",
    metricName="accuracy",
)

def fit_with_cv(estimator, grid, train, folds=2, parallelism=2, seed=42):
    """Tune ``estimator`` over ``grid`` with k-fold cross-validation.

    Candidates are scored with the module-level ``evaluator``.

    Args:
        estimator: Spark ML estimator to tune.
        grid: list of param maps (e.g. from ``ParamGridBuilder().build()``).
        train: DataFrame used for cross-validation and the final fit.
        folds: number of cross-validation folds.
        parallelism: number of candidate models fitted concurrently.
        seed: seed for the fold assignment, so repeated runs pick the same
            folds and therefore report comparable metrics.

    Returns:
        The fitted ``CrossValidatorModel``.
    """
    cv = CrossValidator(
        estimator=estimator,
        estimatorParamMaps=grid,
        evaluator=evaluator,
        numFolds=folds,
        parallelism=parallelism,
        # Without an explicit seed the fold split is not guaranteed to repeat
        # across kernel restarts, which makes the results non-reproducible.
        seed=seed,
    )
    return cv.fit(train)

def train_and_eval(dataset):
    """Train Logistic Regression and Linear SVC on one featurization.

    The dataset is split 80/20 with a fixed seed. Each model is tuned with
    ``fit_with_cv`` on the training part and scored on the held-out part with
    the module-level ``evaluator``.

    Args:
        dataset: Spark DataFrame with ``features`` and ``label`` columns. If it
            carries a ``name`` attribute, that value tags the result rows;
            otherwise ``"features"`` is used.

    Returns:
        Spark DataFrame with columns ``Classifier``, ``Featurization`` and
        ``Accuracy`` (evaluator score times 100, rounded to 4 decimals), one
        row per model.
    """
    train, test = dataset.randomSplit([0.8, 0.2], seed=42)

    # Both splits are scanned repeatedly (every fold and grid point, for two
    # models), so keep them cached instead of recomputing them from the source.
    train.cache()
    test.cache()
    try:
        # Number of classes decides between binary LinearSVC and OneVsRest.
        n_classes = dataset.select("label").distinct().count()

        # ----- Logistic Regression -----
        lr = LogisticRegression(featuresCol="features", labelCol="label")
        lr_grid = (ParamGridBuilder()
                   .addGrid(lr.maxIter, [30])
                   .addGrid(lr.regParam, [0.0, 0.01])
                   .addGrid(lr.elasticNetParam, [0.0, 0.5])
                   .build())
        lr_model = fit_with_cv(lr, lr_grid, train)
        lr_acc = evaluator.evaluate(lr_model.transform(test)) * 100.0

        # ----- Linear SVC (SVM) -----
        svc = LinearSVC(featuresCol="features", labelCol="label")
        if n_classes > 2:
            # LinearSVC is binary-only: wrap it in OneVsRest for multiclass.
            svm_est = OneVsRest(featuresCol="features", labelCol="label", classifier=svc)
            svm_grid = ParamGridBuilder().build()  # no grid, to keep it simple
            svm_model = fit_with_cv(svm_est, svm_grid, train)
        else:
            svm_grid = (ParamGridBuilder()
                        .addGrid(svc.maxIter, [50])
                        .addGrid(svc.regParam, [0.1, 0.01])
                        .build())
            svm_model = fit_with_cv(svc, svm_grid, train)
        svm_acc = evaluator.evaluate(svm_model.transform(test)) * 100.0
    finally:
        # Release the cached splits even when training fails.
        train.unpersist()
        test.unpersist()

    feat_name = getattr(dataset, "name", "features")
    rows = [
        ("LogisticRegression", feat_name, round(lr_acc, 4)),
        ("LinearSVC",         feat_name, round(svm_acc, 4)),
    ]
    return spark.createDataFrame(rows, ["Classifier", "Featurization", "Accuracy"])


In [5]:


# Run every featurization and show the best combination

# Evaluate each dataset in turn, displaying its table as soon as it is ready.
_per_dataset = []
for ds in (HTFfeaturizedData, TFIDFfeaturizedData, W2VfeaturizedData):
    r = train_and_eval(ds)
    r.show(truncate=False)
    _per_dataset.append(r)

# Stack the individual tables into one, in evaluation order.
results = _per_dataset[0]
for _extra in _per_dataset[1:]:
    results = results.union(_extra)

# Highest accuracy first; keep only the top row.
best = results.orderBy("Accuracy", ascending=False).limit(1)
print("\n=== MELHOR COMBO ===")
best.show(truncate=False)


+------------------+-----------------+--------+
|Classifier        |Featurization    |Accuracy|
+------------------+-----------------+--------+
|LogisticRegression|HTFfeaturizedData|87.7588 |
|LinearSVC         |HTFfeaturizedData|89.0819 |
+------------------+-----------------+--------+

+------------------+-------------------+--------+
|Classifier        |Featurization      |Accuracy|
+------------------+-------------------+--------+
|LogisticRegression|TFIDFfeaturizedData|88.3446 |
|LinearSVC         |TFIDFfeaturizedData|90.2535 |
+------------------+-------------------+--------+

+------------------+-----------------+--------+
|Classifier        |Featurization    |Accuracy|
+------------------+-----------------+--------+
|LogisticRegression|W2VfeaturizedData|87.1124 |
|LinearSVC         |W2VfeaturizedData|86.9306 |
+------------------+-----------------+--------+


=== MELHOR COMBO ===
+----------+-------------------+--------+
|Classifier|Featurization      |Accuracy|
+----------+---