In [11]:
# 1) Mount Google Drive so the featurized datasets are reachable from the runtime.
from google.colab import drive
drive.mount('/content/drive')

# 2) Reuse the active SparkSession, or start one if none exists.
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

# 3) Folder on Drive that holds the featurized datasets.
base_path = "/content/drive/MyDrive/Eixo_05/dados/"

# 4) Load the three featurized datasets (Parquet), in the order HTF, TF-IDF, W2V.
HTFfeaturizedData, TFIDFfeaturizedData, W2VfeaturizedData = (
    spark.read.parquet(base_path + folder)
    for folder in ("HTFfeaturizedData", "TFIDFfeaturizedData", "W2VfeaturizedData")
)

# Friendly label on each DataFrame; the training code prints it next to the accuracy.
HTFfeaturizedData.name, TFIDFfeaturizedData.name, W2VfeaturizedData.name = (
    "HTFfeaturizedData",
    "TFIDFfeaturizedData",
    "W2VfeaturizedData",
)

# Sanity check: print the row count of each dataset.
print(
    "Contagens:",
    *(
        dataset.count()
        for dataset in (HTFfeaturizedData, TFIDFfeaturizedData, W2VfeaturizedData)
    ),
)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Contagens: 50000 50000 50000


In [12]:

from pyspark.ml.classification import *
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Function that trains and evaluates the model

def TreinaAvaliaModelo(spark, classifier, features, classes, train, test, seed=11):
    """Tune a classifier with cross-validation and report its test accuracy.

    Only ``LogisticRegression`` is supported. Its ``maxIter`` is tuned over
    ``[10, 15, 20]`` with 2-fold cross-validation on ``train``; the selected
    model is then scored on ``test``.

    Args:
        spark: SparkSession used to build the one-row result DataFrame.
        classifier: Estimator instance to tune; its class name selects the
            training branch.
        features: Not read by this function; kept so existing callers that
            pass it positionally keep working.
        classes: Not read by this function; kept for the same reason.
        train: DataFrame used for cross-validation and fitting. If it has a
            ``name`` attribute, that value is used as the label in the
            printed summary line.
        test: DataFrame the fitted model is evaluated on.
        seed: Seed for the cross-validation fold assignment. Defaults to 11,
            the value the notebook uses for the train/test split. Pass
            ``None`` to leave ``CrossValidator``'s own default in place.

    Returns:
        A DataFrame with columns ``Classifier`` and ``Result`` and a single
        row: the classifier's class name and the accuracy in percent,
        formatted as a string with two decimals.

    Raises:
        ValueError: If ``classifier`` is not a ``LogisticRegression``.
    """
    model_type = type(classifier).__name__

    # Reject unsupported estimators up front; add new classifier branches here.
    if model_type != "LogisticRegression":
        raise ValueError(f"Classificador não suportado neste snippet: {model_type}")

    # One evaluator for both model selection and the final score, so the
    # cross-validated model is chosen on the same metric that gets reported.
    accuracy_evaluator = MulticlassClassificationEvaluator(metricName="accuracy")

    # Hyper-parameter grid explored by the cross-validator.
    param_grid = (ParamGridBuilder()
                  .addGrid(classifier.maxIter, [10, 15, 20])
                  .build())

    # Only forward a seed when one was requested, so None keeps the library default.
    cv_options = {} if seed is None else {"seed": seed}
    cross_validator = CrossValidator(
        estimator=classifier,
        estimatorParamMaps=param_grid,
        evaluator=accuracy_evaluator,
        numFolds=2,
        **cv_options,
    )

    # Fit on the training split, then predict on the held-out split.
    fitted_model = cross_validator.fit(train)
    predictions = fitted_model.transform(test)

    # Accuracy as a percentage.
    accuracy = accuracy_evaluator.evaluate(predictions) * 100

    # Label for the summary line; falls back to 'features' when the training
    # DataFrame carries no ``name`` attribute.
    feat_name = getattr(train, 'name', 'features')
    print(f"{model_type} | {feat_name} -> Accuracy: {accuracy:.2f}%")

    # One-row table used to compare classifiers; the score is kept as a string.
    return spark.createDataFrame(
        [(model_type, f"{accuracy:.2f}")],
        schema=['Classifier', 'Result'],
    )




In [13]:
# Função para criar o modelo de Machine Learning

def cria_modelos_ml(spark, HTFfeaturizedData, TFIDFfeaturizedData, W2VfeaturizedData):
    """Train and evaluate every classifier on each featurized dataset.

    Each dataset is split 70/30 (seed 11). Every classifier in the internal
    list is trained and scored through ``TreinaAvaliaModelo``, and the
    per-dataset results table is shown.

    Args:
        spark: SparkSession forwarded to ``TreinaAvaliaModelo``.
        HTFfeaturizedData: First featurized DataFrame to evaluate.
        TFIDFfeaturizedData: Second featurized DataFrame to evaluate.
        W2VfeaturizedData: Third featurized DataFrame to evaluate.

    Returns:
        The results DataFrame (columns ``Classifier``, ``Result``) of the LAST
        dataset processed. Tables for earlier datasets are displayed but not
        kept.
    """
    # Single classifier for now; extend this list to compare more.
    classifiers = [LogisticRegression()]

    # Datasets to evaluate, in display order.
    featureDF_list = [HTFfeaturizedData, TFIDFfeaturizedData, W2VfeaturizedData]

    results = None
    for featureDF in featureDF_list:
        # Train/test split.
        train, test = featureDF.randomSplit([0.7, 0.3], seed=11)

        # Carry the dataset's friendly label over to the training split (when
        # one was set) so the training summary line can print it.
        feature_name = getattr(featureDF, "name", None)
        if feature_name is not None:
            train.name = feature_name

        # TreinaAvaliaModelo never reads its ``features`` argument, so pass the
        # column name instead of collecting every feature vector to the driver.
        features = "features"

        # Number of distinct labels (output classes).
        classes = featureDF.select("label").distinct().count()

        # Accumulate one result row per classifier for this dataset.
        results = None
        for classifier in classifiers:
            new_result = TreinaAvaliaModelo(
                spark,
                classifier,
                features,
                classes,
                train,
                test,
            )
            results = new_result if results is None else results.union(new_result)

        # Show the results for this featurization.
        results.show(truncate=False)

    # Only the last dataset's results are returned.
    return results


In [14]:
# Run the full train/evaluate loop over the three featurized datasets.
results = cria_modelos_ml(
    spark=spark,
    HTFfeaturizedData=HTFfeaturizedData,
    TFIDFfeaturizedData=TFIDFfeaturizedData,
    W2VfeaturizedData=W2VfeaturizedData,
)


LogisticRegression | HTFfeaturizedData -> Accuracy: 69.46%
+------------------+------+
|Classifier        |Result|
+------------------+------+
|LogisticRegression|69.46 |
+------------------+------+

LogisticRegression | TFIDFfeaturizedData -> Accuracy: 69.46%
+------------------+------+
|Classifier        |Result|
+------------------+------+
|LogisticRegression|69.46 |
+------------------+------+

LogisticRegression | W2VfeaturizedData -> Accuracy: 83.93%
+------------------+------+
|Classifier        |Result|
+------------------+------+
|LogisticRegression|83.93 |
+------------------+------+



In [15]:
# Add a numeric copy of the string score, then keep the single highest-scoring row.
results_with_acc = results.withColumn("Acc", results["Result"].cast("double"))
best = results_with_acc.sort("Acc", ascending=False).limit(1)
best.show(truncate=False)


+------------------+------+-----+
|Classifier        |Result|Acc  |
+------------------+------+-----+
|LogisticRegression|83.93 |83.93|
+------------------+------+-----+

