In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, make_scorer
from sklearn.base import BaseEstimator, ClassifierMixin
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import torch
import random
import os
import datetime

In [None]:
# Base directory: two levels above the current working directory.
BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), '..', '..'))

# Input CSV files, one per language.
DATASET_DIR = os.path.join(BASE_DIR, 'dataset')
DATASET_PT_CSV = os.path.join(DATASET_DIR, 'dataset_pt.csv')
DATASET_EN_CSV = os.path.join(DATASET_DIR, 'dataset_en.csv')

# Pre-trained BERT checkpoints stored on disk, one per language.
PRETRAINED_MODEL_DIR = os.path.join(BASE_DIR, 'model', 'pre-trained')
MODEL_PATH_EN = os.path.join(PRETRAINED_MODEL_DIR, 'bert-base-cased')
MODEL_PATH_PT = os.path.join(PRETRAINED_MODEL_DIR, 'bert-base-portuguese-cased')

# Destination directory for the results CSV.
OUTPUT_DIR = os.path.join(BASE_DIR, 'data', 'output')

# Use the GPU when one is available, otherwise fall back to the CPU.
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)
print("Using device:", device)

In [None]:
# Seed shared by every random number generator seeded at the end of this cell.
SEED = 42
# Number of repetitions (train/test splits) run per language and period.
N_REP = 1

# Experiment axes iterated by the evaluation loop.
LANGUAGES = ["pt", "en"]
PERIODS = ["before_2016", "after_2016", "full"]

# Hyperparameter candidates explored by the grid search.
param_grid = dict(
    batch_size=[8, 32],
    learning_rate=[1e-4, 5e-5],
    epochs=[3, 10, 20],
)

# Metrics handed to the grid search as its scoring dictionary.
scorer = dict(
    f1_macro=make_scorer(f1_score, average="macro"),
    accuracy=make_scorer(accuracy_score),
)

# Seed NumPy, the standard library RNG and PyTorch with the same value.
np.random.seed(SEED)
random.seed(SEED)
torch.manual_seed(SEED)

In [4]:
def load_dataset(language: str) -> "tuple[pd.DataFrame, str]":
    """Load the dataset CSV and the pre-trained model path for a language.

    Args:
        language: ``"pt"`` selects the Portuguese dataset and model path;
            any other value selects the English ones.

    Returns:
        A ``(df, model_path)`` tuple: the DataFrame read from the language's
        CSV file and the path of the matching pre-trained model directory.
    """
    if language == "pt":
        csv_path, model_path = DATASET_PT_CSV, MODEL_PATH_PT
    else:
        # Every non-"pt" value falls back to English.
        csv_path, model_path = DATASET_EN_CSV, MODEL_PATH_EN
    return pd.read_csv(csv_path), model_path

In [5]:
def filter_by_period(df: pd.DataFrame, period: str) -> pd.DataFrame:
    """Return the rows of ``df`` that belong to the requested period.

    The ``data`` column is parsed with the ``%Y%m%d`` format; values that
    cannot be parsed become ``NaT``. The input frame is not modified: the
    parsed dates live only in the returned frame.

    Args:
        df: Frame with a ``data`` column holding dates.
        period: ``"before_2016"`` keeps rows dated strictly before
            2016-01-01, ``"after_2016"`` keeps rows dated on or after it,
            and any other value returns every row.

    Returns:
        A new DataFrame whose ``data`` column is datetime-typed. Rows with an
        unparseable date are excluded from the two dated periods (comparisons
        with ``NaT`` are false) but kept in the full period.
    """
    # Build a new frame instead of assigning into ``df`` so that the caller's
    # DataFrame (reused across several periods) is never mutated.
    dated = df.assign(
        data=pd.to_datetime(df['data'], format='%Y%m%d', errors='coerce')
    )

    cutoff_date = pd.to_datetime('2016-01-01')

    if period == "before_2016":
        return dated[dated["data"] < cutoff_date]
    if period == "after_2016":
        return dated[dated["data"] >= cutoff_date]
    return dated  # full

In [6]:
class BertClassifier(BaseEstimator, ClassifierMixin):
    """scikit-learn style estimator that fine-tunes a BERT sequence classifier.

    Wraps ``BertTokenizer``, ``BertForSequenceClassification`` and the
    ``Trainer`` API behind ``fit``/``predict`` so the model can be used with
    tools such as ``GridSearchCV``.

    Parameters:
        model_name: Name or path passed to ``from_pretrained`` for both the
            tokenizer and the model.
        learning_rate: Learning rate given to ``TrainingArguments``.
        batch_size: Per-device training batch size.
        epochs: Number of training epochs.
        num_labels: Number of output classes of the classification head.
        max_length: Length every text is truncated/padded to.
        training_args_kwargs: Optional dict of extra ``TrainingArguments``
            keyword arguments. They override the fixed defaults used by
            ``fit`` (output/logging directories, weight decay, ...), but never
            ``learning_rate``, ``batch_size`` or ``epochs``, which always come
            from the estimator's own parameters.
    """

    def __init__(self, model_name, learning_rate=5e-5, batch_size=16, epochs=3,
                 num_labels=3, max_length=512, training_args_kwargs=None):
        self.model_name = model_name
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.epochs = epochs
        self.num_labels = num_labels
        self.max_length = max_length
        self.training_args_kwargs = training_args_kwargs if training_args_kwargs is not None else {}
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        # Populated by fit().
        self.model = None
        self.trainer = None
        # Not read by any method of this class.
        self.output_dir = None

    def _tokenize(self, X, y=None):
        """Tokenize texts into a ``Dataset``.

        Every text is truncated and padded to ``max_length``. The returned
        dataset has ``input_ids`` and ``attention_mask`` columns, plus a
        ``labels`` column when ``y`` is given.
        """
        encodings = self.tokenizer(
            list(X),
            truncation=True,
            padding="max_length",
            max_length=self.max_length
        )
        dataset_dict = {"input_ids": encodings["input_ids"], "attention_mask": encodings["attention_mask"]}
        if y is not None:
            dataset_dict["labels"] = list(y)
        return Dataset.from_dict(dataset_dict)

    def fit(self, X, y):
        """Fine-tune a freshly loaded pre-trained model on ``(X, y)``.

        Args:
            X: Iterable of texts.
            y: Iterable of labels, one per text.

        Returns:
            ``self``, with ``model``, ``trainer`` and ``classes_`` set.
        """
        train_dataset = self._tokenize(X, y)

        # scikit-learn's scoring utilities read ``classes_`` on fitted
        # classifiers, so expose the labels seen during training.
        self.classes_ = np.unique(y)

        self.model = BertForSequenceClassification.from_pretrained(self.model_name, num_labels=self.num_labels)

        # Fixed defaults, overridable through ``training_args_kwargs``.
        args_kwargs = {
            "output_dir": "./results",
            "weight_decay": 0.01,
            "logging_dir": "./logs",
            "save_strategy": "no",
            "report_to": "none",
            "disable_tqdm": True,
        }
        # ``or {}`` covers the parameter being reset to None via set_params.
        args_kwargs.update(self.training_args_kwargs or {})
        # Applied last so the tunable hyperparameters always reflect the
        # estimator's parameters (and therefore the grid-search candidates).
        args_kwargs.update(
            learning_rate=self.learning_rate,
            per_device_train_batch_size=self.batch_size,
            num_train_epochs=self.epochs,
        )
        training_args = TrainingArguments(**args_kwargs)

        self.trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
        )

        self.trainer.train()
        return self

    def predict(self, X):
        """Return the predicted class index (argmax of the outputs) per text.

        Raises:
            NotFittedError: If called before ``fit``. The exception derives
                from both ``ValueError`` and ``AttributeError``.
        """
        if self.trainer is None:
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                "This BertClassifier instance is not fitted yet. "
                "Call 'fit' before using 'predict'."
            )
        test_dataset = self._tokenize(X)
        preds = self.trainer.predict(test_dataset)
        return np.argmax(preds.predictions, axis=1)


In [None]:
# Create the output directory before the long-running search so that a
# missing or unwritable path fails immediately instead of after training.
os.makedirs(OUTPUT_DIR, exist_ok=True)

results = []

# Map the original labels (-1, 0, 1) to the class indices 0, 1, 2.
label_mapping = {-1: 0, 0: 1, 1: 2}

for lang in LANGUAGES:
    dataset, model = load_dataset(lang)

    for period in PERIODS:
        df = filter_by_period(dataset, period)
        X = df["texto_ata"].values
        y = df["decisao_n+1"]

        y_encoded = y.map(label_mapping)

        y_encoded_array = y_encoded.values

        for i in range(N_REP):

            # Stratified 80/20 split; the seed changes with the repetition index.
            X_train, X_test, y_train, y_test = train_test_split(X, y_encoded_array,train_size=0.8, test_size=0.2, random_state=SEED+i, stratify=y_encoded_array)

            clf = BertClassifier(model_name=model)

            # 5-fold grid search on the training split; the best candidate by
            # macro F1 is refit on the whole training split.
            grid = GridSearchCV(
                estimator=clf,
                param_grid=param_grid,
                scoring=scorer,
                refit="f1_macro",
                cv=5,
                n_jobs=1
            )

            grid.fit(X_train, y_train)
            best_model = grid.best_estimator_

            # Evaluate the refit model on the held-out test split.
            y_pred = best_model.predict(X_test)
            acc = accuracy_score(y_test, y_pred)
            f1 = f1_score(y_test, y_pred, average="macro")

            results.append({
                "language": lang,
                "period": period,
                "repetition": i + 1,
                "best_params": grid.best_params_,
                "test_accuracy": acc,
                "test_f1_macro": f1
            })

results_df = pd.DataFrame(results)

# Timestamp the file name (minute resolution).
data_str = datetime.datetime.now().strftime("%Y%m%d%H%M")
file_name = f"results_{N_REP}_{data_str}.csv"

output_path = os.path.join(OUTPUT_DIR, file_name)
results_df.to_csv(output_path, index=False)
print(f"Resultados salvos em: {output_path}")