In [None]:
# -*- coding: utf-8 -*-
"""
BERTimbau — pipeline padronizado ao BERTweetBR
- Mesma máscara de emojis/emoticons (anti-atalho)
- Split por leak_key (sem vazamento)
- Tokenização + padding dinâmico
- Class weights + label smoothing na loss
- Early stopping (F1), load_best_model_at_end
- Salvamento de métricas e matriz de confusão
"""

!pip -q install "transformers>=4.41,<4.47" datasets accelerate \
                scikit-learn emoji unidecode imbalanced-learn

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m80.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m590.6/590.6 kB[0m [31m41.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.8/235.8 kB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m69.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import os, re, json, random
import numpy as np
import pandas as pd
from typing import Dict
import torch
from torch import nn

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight

from datasets import Dataset, DatasetDict

from transformers import (
    AutoTokenizer, AutoConfig, AutoModelForSequenceClassification,
    DataCollatorWithPadding, TrainingArguments, Trainer, EarlyStoppingCallback, set_seed
)

from emoji import replace_emoji, demojize
from unidecode import unidecode


In [None]:
# Colab Drive (optional)
# ----------------------------
from datetime import datetime

# Best effort: outside Colab the import fails, inside Colab the mount itself may
# fail. Either way the notebook keeps running, but the cause is now printed
# instead of being discarded.
try:
    from google.colab import drive
    drive.mount('/content/drive', force_remount=True)
except Exception as e:
    print("Aviso: não foi possível montar o Drive automaticamente. Se estiver fora do Colab, ignore.")
    print(f"Motivo: {type(e).__name__}: {e}")


Mounted at /content/drive


In [None]:
# --- Column names, pretrained checkpoint and random seed ---
TEXT_COL   = "tweet_text"
LABEL_COL  = "sentiment"
MODEL_NAME = "neuralmind/bert-base-portuguese-cased"
SEED       = 42

# --- Paths: each run writes to its own timestamped folder under BASE_DIR ---
BASE_DIR  = "/content/drive/MyDrive/versaochat"
CSV_PATH  = "/content/drive/MyDrive/versaochat/twitter-sentiment-pt-BR-md-2-l.csv"  # <- adjust if needed
RUN_TS    = datetime.now().strftime("%Y%m%d-%H%M%S")
OUT_DIR   = f"{BASE_DIR}/bertimbau-finetuned-sentiment-{RUN_TS}"
os.makedirs(OUT_DIR, exist_ok=True)

# --- Seed every random number generator used by the notebook ---
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
set_seed(SEED)

# --- Device selection ---
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Device:", device)
print("OUT_DIR:", OUT_DIR)

Device: cuda
OUT_DIR: /content/drive/MyDrive/versaochat/bertimbau-finetuned-sentiment-20250820-161230


In [None]:
# 2) Carregamento do CSV
# ============================
def load_csv(path: str) -> pd.DataFrame:
    """Read a CSV file into a DataFrame.

    The file is first read with pandas' default encoding; if that raises
    ``UnicodeDecodeError`` it is read again as latin-1.

    Args:
        path: location of the CSV file.

    Returns:
        The parsed DataFrame.
    """
    try:
        frame = pd.read_csv(path)
    except UnicodeDecodeError:
        # Second attempt: latin-1.
        frame = pd.read_csv(path, encoding="latin-1")
    return frame

# Load the raw CSV and check that the two required columns exist.
df = load_csv(CSV_PATH).copy()
assert TEXT_COL in df.columns and LABEL_COL in df.columns, f"Esperado colunas: {TEXT_COL}, {LABEL_COL}"

# Drop rows missing text or label, then collapse whitespace runs in the text.
df = df.dropna(subset=[TEXT_COL, LABEL_COL]).copy()
df[TEXT_COL] = df[TEXT_COL].astype(str).str.replace(r"\s+", " ", regex=True).str.strip()

# Integer labels -> 'labels' column (the name the Trainer uses).
# pandas reads an integer column that contains even one missing value as float,
# so after the dropna above a float column made only of whole numbers is
# accepted and converted instead of being rejected.
label_values = df[LABEL_COL]
if pd.api.types.is_float_dtype(label_values) and (label_values % 1 == 0).all():
    label_values = label_values.astype(int)
if not pd.api.types.is_integer_dtype(label_values):
    raise ValueError(f"A coluna {LABEL_COL} deve ser int (0/1).")
df["labels"] = label_values.astype(int)

present_labels = sorted(int(v) for v in df["labels"].unique())
num_labels = len(present_labels)
assert num_labels in (2,3), f"num_labels inesperado: {num_labels}"
# The model config (id2label built from range(num_labels)) and the
# cross-entropy loss both index classes as 0..num_labels-1, so fail here with a
# clear message rather than later inside training.
if present_labels != list(range(num_labels)):
    raise ValueError(
        f"Os rótulos em {LABEL_COL} devem ser 0..{num_labels - 1}; encontrados: {present_labels}"
    )
print("Contagem de classes:", df["labels"].value_counts().to_dict())

Contagem de classes: {0: 10001, 1: 9999}


In [None]:
# 3) Emoji/emoticon mask (anti-shortcut)
# ============================
# Text emoticons, matched case-insensitively. The two patterns are applied one
# after the other (positive first), not merged into a single alternation.
_POS = re.compile(
    r'(:\)|:-\)|=\)|:\]|:D|:-D|;\)|;-\)|:P|:-P|<3+)',
    re.IGNORECASE,
)
_NEG = re.compile(
    r'(:\(|:-\(|=\(|:\[|:\'\(|D:)',
    re.IGNORECASE,
)

def mask_emotes_and_emojis(s: str) -> str:
    """Replace emojis and text emoticons with neutral placeholder words.

    Every emoji becomes ``EMOJI``; every match of ``_POS`` and then of ``_NEG``
    becomes ``EMOTE``. Whitespace runs are then collapsed to a single space and
    the result is stripped.

    Args:
        s: input text (converted with ``str`` first).

    Returns:
        The masked text.
    """
    masked = replace_emoji(str(s), replace=" EMOJI ")
    # Positive and negative emoticons share the same placeholder.
    for emoticon_re in (_POS, _NEG):
        masked = emoticon_re.sub(" EMOTE ", masked)
    return re.sub(r"\s+", " ", masked).strip()

df["text_masked"] = df[TEXT_COL].apply(mask_emotes_and_emojis)


In [None]:
# 4) Leak-free split by groups (leak_key)
# ============================
_url_re   = re.compile(r"http\S+|www\.\S+", flags=re.IGNORECASE)
_mention  = re.compile(r"@\w+")
_ws       = re.compile(r"\s+")

def norm_for_leak_key(s: str) -> str:
    """Build the normalized key used to group near-identical tweets.

    URLs and @mentions are replaced by a space, emojis are demojized (pt),
    the text is transliterated with ``unidecode`` and lower-cased, and
    whitespace runs are collapsed.

    Args:
        s: tweet text.

    Returns:
        The normalized, stripped key.
    """
    text = s.strip()
    for noise_re in (_url_re, _mention):
        text = noise_re.sub(" ", text)
    text = demojize(text, language="pt")  # extra safety
    text = unidecode(text).lower()
    return _ws.sub(" ", text).strip()

df["leak_key"] = df["text_masked"].apply(norm_for_leak_key)

# One representative row per leak_key; the split is drawn over keys so that all
# rows sharing a key end up in the same partition.
rep = df[["leak_key", "labels"]].drop_duplicates(subset=["leak_key"]).copy()
rep_temp, rep_test = train_test_split(
    rep,
    test_size=0.10,
    random_state=SEED,
    stratify=rep["labels"],
)
rep_train, rep_val = train_test_split(
    rep_temp,
    test_size=0.1111,
    random_state=SEED,
    stratify=rep_temp["labels"],
)

# Map every key to the partition it was assigned to, then label each row.
key2split = {
    key: split_name
    for split_name, part in (("train", rep_train), ("val", rep_val), ("test", rep_test))
    for key in part["leak_key"]
}
df["split"] = df["leak_key"].map(key2split)
assert df["split"].notna().all()

def _inter(a,b): return len(set(a).intersection(set(b)))
# Keys present in each partition; every pairwise overlap is expected to be 0.
lk_tr, lk_va, lk_te = (
    df.loc[df["split"] == split_name, "leak_key"]
    for split_name in ("train", "val", "test")
)
print("Leak check (esperado 0,0,0):", _inter(lk_tr, lk_va), _inter(lk_tr, lk_te), _inter(lk_va, lk_te))

Leak check (esperado 0,0,0): 0 0 0


In [None]:
# 5) HF dataset (masked text)
# ============================
# Each partition keeps only the masked text (renamed to "text") and the labels,
# with a fresh 0..n-1 index.
ds = DatasetDict({
    hf_name: Dataset.from_pandas(
        df.loc[df["split"] == split_tag, ["text_masked", "labels"]]
          .rename(columns={"text_masked": "text"})
          .reset_index(drop=True)
    )
    for hf_name, split_tag in (("train", "train"), ("validation", "val"), ("test", "test"))
})


In [None]:
# 6) Tokenizer + tokenization
# ============================
# No `normalization` option here (the original author notes it does not exist for BERTimbau).
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_batch(batch):
    """Tokenize the "text" field of a batch, truncating to 128 tokens.

    No padding is applied here; padding is left to the data collator.
    """
    return tokenizer(
        batch["text"],
        max_length=128,
        truncation=True,
        padding=False,
    )

# Pads each batch at collation time.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
ds_tok = ds.map(tokenize_batch, batched=True, desc="Tokenizando (mascarado)")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/43.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/647 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Tokenizando (mascarado):   0%|          | 0/15980 [00:00<?, ? examples/s]

Tokenizando (mascarado):   0%|          | 0/2008 [00:00<?, ? examples/s]

Tokenizando (mascarado):   0%|          | 0/2012 [00:00<?, ? examples/s]

In [None]:
# 7) Model + Trainer (class weights, early stopping)
# ============================
# Class names are simply the stringified class ids.
config = AutoConfig.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels,
    id2label=dict((idx, str(idx)) for idx in range(num_labels)),
    label2id=dict((str(idx), idx) for idx in range(num_labels)),
)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, config=config)
model = model.to(device)

# Class weights ("balanced"), computed from the training labels only.
y_train = np.array(ds_tok["train"]["labels"])
classes_sorted = np.unique(y_train)
class_weights = torch.tensor(
    compute_class_weight(class_weight="balanced", classes=classes_sorted, y=y_train),
    dtype=torch.float,
).to(device)

# Trainer with cross-entropy + class weights + consistent label smoothing
class WeightedTrainer(Trainer):
    """Trainer whose loss is cross-entropy with class weights and label smoothing.

    Args:
        *args, **kwargs: forwarded unchanged to ``Trainer.__init__``.
        class_weights: optional tensor passed as ``weight`` to the
            cross-entropy; ``None`` means unweighted.
        label_smoothing: smoothing factor passed to the cross-entropy.
            Defaults to 0.05, the value that used to be hard-coded; keep it
            equal to the BERTweetBR pipeline (or 0.0 in both).
    """

    def __init__(self, *args, class_weights=None, label_smoothing=0.05, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights
        self.label_smoothing = label_smoothing

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the weighted, label-smoothed cross-entropy for one batch.

        Args:
            model: the model to run (may be a wrapper around the real model).
            inputs: batch dict; must contain "labels".
            return_outputs: when true, return ``(loss, outputs)``.
            **kwargs: extra arguments passed by the Trainer; ignored.

        Returns:
            The loss, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits  = outputs.get("logits")

        weight = self.class_weights
        if weight is not None:
            # Keep the weights on the same device as the logits.
            weight = weight.to(logits.device)

        # Read the class count from the logits rather than `model.config`:
        # when the Trainer hands over a wrapped model (e.g. DataParallel) the
        # wrapper has no `.config` attribute.
        loss = nn.functional.cross_entropy(
            logits.view(-1, logits.size(-1)),
            labels.view(-1),
            weight=weight,
            label_smoothing=self.label_smoothing,
        )
        return (loss, outputs) if return_outputs else loss

args = TrainingArguments(
    output_dir=OUT_DIR,
    # `eval_strategy` is the argument name since transformers 4.41, the minimum
    # version this notebook installs; `evaluation_strategy` is its deprecated alias.
    eval_strategy="epoch",
    save_strategy="epoch",            # must match the evaluation schedule for load_best_model_at_end
    logging_strategy="steps",
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1",  # "f1" returned by compute_metrics, with the "eval_" prefix
    greater_is_better=True,

    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    weight_decay=0.01,
    num_train_epochs=6,
    warmup_ratio=0.06,
    lr_scheduler_type="linear",
    gradient_accumulation_steps=1,
    max_grad_norm=1.0,

    fp16=torch.cuda.is_available(),
    dataloader_num_workers=2,
    report_to="none",
    seed=SEED,
    save_total_limit=2               # optional, avoids keeping many checkpoints
)

def compute_metrics(eval_pred):
    """Turn a ``(logits, labels)`` pair into accuracy and weighted P/R/F1.

    The predicted class is the argmax over the last axis of the logits.
    Precision, recall and F1 are weighted averages, with ``zero_division=0``.

    Returns:
        Dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)
    precision, recall, f1, _support = precision_recall_fscore_support(
        labels,
        predicted,
        average="weighted",
        zero_division=0,
    )
    metrics = {"accuracy": accuracy_score(labels, predicted)}
    metrics["precision"] = precision
    metrics["recall"] = recall
    metrics["f1"] = f1
    return metrics

# Stop once the tracked metric has not improved for 2 consecutive evaluations.
early_stopping = EarlyStoppingCallback(
    early_stopping_patience=2,
    early_stopping_threshold=0.0,  # optional, explicit
)

trainer = WeightedTrainer(
    model=model,
    args=args,
    class_weights=class_weights,
    train_dataset=ds_tok["train"],
    eval_dataset=ds_tok["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  super().__init__(*args, **kwargs)


In [None]:
# 8) Training
# ============================
train_result = trainer.train()
best_checkpoint = trainer.state.best_model_checkpoint
print("Melhor checkpoint:", best_checkpoint)

# ============================
# 9) Evaluation + Reports + Saving
# ============================
print("\n== VAL (mascarado) ==")
val_metrics = trainer.evaluate(ds_tok["validation"])
val_metrics = {k: float(v) for k, v in val_metrics.items()}  # plain floats, JSON-serializable
print({k: round(v, 4) for k, v in val_metrics.items()})

print("\n== TEST (mascarado) ==")
test_metrics = trainer.evaluate(ds_tok["test"])
test_metrics = {k: float(v) for k, v in test_metrics.items()}
print({k: round(v, 4) for k, v in test_metrics.items()})

pred_test = trainer.predict(ds_tok["test"])
y_true = pred_test.label_ids
y_pred = np.argmax(pred_test.predictions, axis=-1)

# Pass the full label set explicitly: without it, a class that never shows up
# in y_true/y_pred makes `target_names` mismatch in the report and yields a
# confusion matrix smaller than num_labels x num_labels.
all_labels = list(range(num_labels))

print("\nClassification report (TEST):")
print(classification_report(
    y_true,
    y_pred,
    labels=all_labels,
    target_names=[str(i) for i in all_labels],
    zero_division=0,
))
cm = confusion_matrix(y_true, y_pred, labels=all_labels)
print("Confusion matrix:\n", cm)

# Save the metrics as JSON and the confusion matrix as CSV
for file_name, payload in (("metrics_val.json", val_metrics), ("metrics_test.json", test_metrics)):
    with open(os.path.join(OUT_DIR, file_name), "w") as f:
        json.dump(payload, f, indent=2)

# Rows are the true classes, columns the predicted ones.
cm_df = pd.DataFrame(
    cm,
    index=[f"true_{i}" for i in range(num_labels)],
    columns=[f"pred_{i}" for i in range(num_labels)],
)
cm_df.to_csv(os.path.join(OUT_DIR, "confusion_matrix_test.csv"), index=True)

# Save model + tokenizer
from pathlib import Path
Path(OUT_DIR).mkdir(parents=True, exist_ok=True)
tokenizer.save_pretrained(OUT_DIR)
model.save_pretrained(OUT_DIR)

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4207,0.40909,0.829681,0.829765,0.829681,0.829641
2,0.3495,0.394483,0.849602,0.849724,0.849602,0.849561
3,0.2604,0.513095,0.836155,0.83628,0.836155,0.836108
4,0.1872,0.54937,0.837151,0.837334,0.837151,0.837091


Melhor checkpoint: /content/drive/MyDrive/versaochat/bertimbau-finetuned-sentiment-20250820-161230/checkpoint-1998

== VAL (mascarado) ==


{'eval_loss': 0.3945, 'eval_accuracy': 0.8496, 'eval_precision': 0.8497, 'eval_recall': 0.8496, 'eval_f1': 0.8496, 'eval_runtime': 2.5179, 'eval_samples_per_second': 797.496, 'eval_steps_per_second': 25.021, 'epoch': 4.0}

== TEST (mascarado) ==
{'eval_loss': 0.3894, 'eval_accuracy': 0.8439, 'eval_precision': 0.8454, 'eval_recall': 0.8439, 'eval_f1': 0.8438, 'eval_runtime': 2.7134, 'eval_samples_per_second': 741.51, 'eval_steps_per_second': 23.218, 'epoch': 4.0}

Classification report (TEST):
              precision    recall  f1-score   support

           0       0.82      0.87      0.85       995
           1       0.87      0.81      0.84      1017

    accuracy                           0.84      2012
   macro avg       0.85      0.84      0.84      2012
weighted avg       0.85      0.84      0.84      2012

Confusion matrix:
 [[870 125]
 [189 828]]
