In [None]:
# Show which Python executable and version this notebook kernel is running.
import sys
print(sys.executable)
print(sys.version)

/venv/main/bin/python
3.12.11 | packaged by conda-forge | (main, Jun  4 2025, 14:45:31) [GCC 13.3.0]


In [None]:
# ============================================================
# 0. Instalação (se precisar)
# ============================================================
#!pip install -q transformers datasets accelerate
#!pip install -U datasets
#!pip install  transformers

# ============================================================
# 1. Imports e configurações básicas
# ============================================================
import torch
import torch.nn as nn
import torch.nn.functional as F

from datasets import load_dataset
from transformers import (
    AutoModelForMaskedLM,
    AutoTokenizer,
    AutoConfig,
    BertForMaskedLM,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer,
)

# Model and data configuration.
# TEACHER_NAME: checkpoint loaded as the distillation teacher.
# TINY_BASE_NAME: checkpoint whose config defines the student architecture.
TEACHER_NAME = "neuralmind/bert-base-portuguese-cased"
TINY_BASE_NAME = "prajjwal1/bert-tiny"
BRWAC_SLICE = "train"   # e.g. "train[:10]" for a quick test, "train" for everything
# Maximum sequence length (in tokens) used when tokenizing.
MAX_LENGTH = 128

# ============================================================
# 2. Device
# ============================================================
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# ============================================================
# 3. Carregar e preparar corpus brWaC
# ============================================================

def load_and_filter_brwac(
    split: str = BRWAC_SLICE,
    min_words: int = 10,
    max_words: int = 1000
):
    """
    Load a slice of the brWaC corpus and keep only texts of moderate length.

    Args:
        split: dataset slice handed to ``load_dataset`` (e.g. "train[:10]" or "train").
        min_words: exclusive lower bound on the whitespace-separated word count.
        max_words: exclusive upper bound on the whitespace-separated word count.

    Returns:
        The filtered dataset.
    """
    print(f"Carregando brWaC ({split})...")

    def _has_valid_length(example):
        # Both bounds are exclusive: min_words < word count < max_words.
        return min_words < len(example["text"].split()) < max_words

    filtered = load_dataset("nlpufg/brwac", split=split).filter(_has_valid_length)
    print("Ap√≥s filtro:", filtered)
    return filtered

def train_test_split_dataset(ds, test_size: float = 0.2, seed: int = 42):
    """
    Split a dataset into train and test partitions and report their sizes.

    Args:
        ds: object exposing ``train_test_split(test_size=..., seed=...)``.
        test_size: fraction forwarded to ``train_test_split``.
        seed: seed forwarded to ``train_test_split``.

    Returns:
        Tuple ``(train, test)`` taken from the "train" and "test" keys of the split.
    """
    print("\nCriando split train/test...")
    parts = ds.train_test_split(test_size=test_size, seed=seed)
    train_part, test_part = parts["train"], parts["test"]

    for label, part in (("Tamanho treino:", train_part), ("Tamanho teste :", test_part)):
        print(label, len(part))
    return train_part, test_part

# ============================================================
# 4. Modelos: Teacher (BERTimbau) e Student (BERT-tiny PT)
# ============================================================

def create_teacher(teacher_name: str = TEACHER_NAME):
    """
    Load the teacher masked-LM and its tokenizer, and freeze the teacher.

    Args:
        teacher_name: checkpoint name passed to ``from_pretrained``.

    Returns:
        Tuple ``(teacher, teacher_tok)``; the teacher is on ``device``, in eval
        mode, with ``requires_grad`` disabled on every parameter.
    """
    teacher = AutoModelForMaskedLM.from_pretrained(teacher_name)
    teacher = teacher.to(device)
    teacher_tok = AutoTokenizer.from_pretrained(teacher_name)

    # The teacher only provides targets: eval mode, no gradients.
    teacher.eval()
    for param in teacher.parameters():
        param.requires_grad = False

    print("Teacher carregado:", teacher_name)
    print("Teacher hidden size:", teacher.config.hidden_size)
    return teacher, teacher_tok

def create_student(
    tiny_base_name: str,
    teacher,
    teacher_tok
):
    """
    Build a randomly initialised student that shares the teacher's vocabulary.

    The architecture comes from the config of ``tiny_base_name``; only
    ``vocab_size`` is overridden with ``len(teacher_tok)``. A bias-free linear
    layer ``proj_teacher`` (teacher hidden size -> student hidden size) is
    attached to the student so teacher hidden states can be compared in the
    student's space.

    Args:
        tiny_base_name: checkpoint whose config defines the student architecture.
        teacher: teacher model; only ``teacher.config.hidden_size`` is read.
        teacher_tok: teacher tokenizer, reused as the student tokenizer.

    Returns:
        Tuple ``(student, student_tok)`` where ``student_tok`` is ``teacher_tok``.
    """
    config = AutoConfig.from_pretrained(tiny_base_name)
    # The student predicts over the teacher's vocabulary.
    config.vocab_size = len(teacher_tok)

    student = BertForMaskedLM(config).to(device)

    teacher_dim = teacher.config.hidden_size
    student_dim = student.config.hidden_size
    # Projection of teacher representations into the student's hidden size.
    student.proj_teacher = nn.Linear(teacher_dim, student_dim, bias=False).to(device)

    print("Student criado com arquitetura do:", tiny_base_name)
    print("Student hidden size:", student.config.hidden_size)
    print("Vocab size (PT-BR):", config.vocab_size)

    return student, teacher_tok

# ============================================================
# 5. Tokeniza√ß√£o PT (usando tokenizer do student = teacher_tok)
# ============================================================

def tokenize_fn(batch, tokenizer, max_length: int = MAX_LENGTH):
    """
    Tokenize the "text" column of a batch with the given tokenizer.

    Every sequence is truncated and padded to exactly ``max_length`` tokens.
    """
    options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": max_length,
    }
    return tokenizer(batch["text"], **options)

def tokenize_datasets(ds_train, ds_test, tokenizer):
    """
    Tokenize the train and test datasets and switch both to torch format.

    Each dataset is mapped in batches through ``tokenize_fn`` and its "text"
    column is dropped. The first tokenized training example is printed.

    Returns:
        Tuple ``(ds_train_tok, ds_test_tok)``.
    """
    print("Tokenizando treino e teste...")

    def _encode(batch):
        return tokenize_fn(batch, tokenizer)

    tokenized = [
        split.map(_encode, batched=True, remove_columns=["text"])
        for split in (ds_train, ds_test)
    ]
    for split in tokenized:
        split.set_format("torch")
    train_tok, test_tok = tokenized

    print("Exemplo tokenizado:")
    print(train_tok[0])

    return train_tok, test_tok

# ============================================================
# 6. DataCollator para MLM
# ============================================================

def create_data_collator(tokenizer):
    """
    Build the masked-language-modeling collator (mlm enabled, probability 0.15).
    """
    collator_options = {
        "tokenizer": tokenizer,
        "mlm": True,
        "mlm_probability": 0.15,
    }
    return DataCollatorForLanguageModeling(**collator_options)

# ============================================================
# 7. Trainer customizado (MLM + KD + Cosine)
# ============================================================
class DistillTrainer(Trainer):
    """
    Trainer that distils a frozen teacher into the student being trained.

    The loss of each batch is the weighted sum of three terms:
      - MLM loss: cross-entropy of the student logits against the batch labels
        (positions labelled -100 are ignored)
      - KD loss: KL divergence between the temperature-softened student and
        teacher distributions, over positions where ``attention_mask == 1``
      - Cosine loss: cosine embedding loss between the attention-mask-weighted
        mean of the student's last hidden states and that of the teacher's,
        after the teacher states go through ``model.proj_teacher``

    The three components are averaged between two regular training logs and
    added to that log entry (keys ``loss_mlm``, ``loss_kd``, ``loss_cos``)
    instead of being logged on every forward pass.

    Args:
        teacher: model called with ``input_ids``, ``attention_mask`` and
            ``output_hidden_states=True``; run under ``torch.no_grad()``.
        temperature: softmax temperature used by the KD term.
        alpha_mlm: weight of the MLM term.
        alpha_kd: weight of the KD term.
        alpha_cos: weight of the cosine term.
    """

    def __init__(
        self,
        *args,
        teacher=None,
        temperature: float = 2.0,
        alpha_mlm: float = 1.0,
        alpha_kd: float = 0.5,
        alpha_cos: float = 0.1,
        **kwargs,
    ):
        super().__init__(*args, **kwargs)
        self.teacher = teacher
        self.temperature = temperature
        self.alpha_mlm = alpha_mlm
        self.alpha_kd = alpha_kd
        self.alpha_cos = alpha_cos

        # Running sums of the loss components since the last training log.
        self._component_sums = {}
        self._component_count = 0

    def _accumulate_components(self, **components):
        """Add detached loss components to the running sums (no host sync)."""
        for name, value in components.items():
            value = value.detach().float()
            if name in self._component_sums:
                self._component_sums[name] = self._component_sums[name] + value
            else:
                self._component_sums[name] = value
        self._component_count += 1

    def log(self, logs, *args, **kwargs):
        """
        Attach the averaged loss components to regular training log entries.

        Entries that carry a "loss" key get the mean of each component
        accumulated since the previous such entry; other entries pass through
        unchanged. Extra positional/keyword arguments are forwarded so the
        override works with Trainer versions whose ``log`` takes more arguments.
        """
        if "loss" in logs and self._component_count > 0:
            for name, total in self._component_sums.items():
                logs[name] = round((total / self._component_count).item(), 4)
            self._component_sums = {}
            self._component_count = 0
        return super().log(logs, *args, **kwargs)

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """
        Compute the combined MLM + KD + cosine loss for one batch.

        ``**kwargs`` absorbs extra arguments that recent Transformers versions
        pass (e.g. ``num_items_in_batch``).

        Args:
            model: the student; must expose ``proj_teacher``.
            inputs: batch with "input_ids", "attention_mask" and "labels".
            return_outputs: when True, also return the student outputs.

        Returns:
            The scalar loss, or ``(loss, student_outputs)`` if ``return_outputs``.

        Raises:
            ValueError: if the trainer was built without a teacher.
        """
        if self.teacher is None:
            raise ValueError(
                "DistillTrainer requires a `teacher` model to compute the distillation loss."
            )

        # Read (rather than pop) the labels so the caller's batch is left untouched.
        labels = inputs["labels"].to(model.device)
        input_ids = inputs["input_ids"].to(model.device)
        attention_mask = inputs["attention_mask"].to(model.device)

        # ---- STUDENT FORWARD ----
        outputs_student = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True,
        )
        student_logits = outputs_student.logits
        student_hiddens = outputs_student.hidden_states[-1]

        # ---- MLM LOSS ----
        loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
        mlm_loss = loss_fct(
            student_logits.view(-1, student_logits.size(-1)),
            labels.view(-1),
        )

        # ---- TEACHER FORWARD ---- (no gradient)
        with torch.no_grad():
            t_out = self.teacher(
                input_ids=input_ids,
                attention_mask=attention_mask,
                output_hidden_states=True,
            )
            teacher_logits = t_out.logits
            teacher_hiddens_raw = t_out.hidden_states[-1]

        # ---- PROJECTION teacher -> student space ----
        # Done outside no_grad so the projection layer itself is trained.
        teacher_hiddens = model.proj_teacher(teacher_hiddens_raw)

        # ---- KD LOSS ----
        T = self.temperature
        s_log_probs = F.log_softmax(student_logits / T, dim=-1)
        t_probs = F.softmax(teacher_logits / T, dim=-1)

        # Keep only positions where attention_mask == 1.
        mask_flat = attention_mask.view(-1) == 1
        s_flat = s_log_probs.view(-1, student_logits.size(-1))[mask_flat]
        t_flat = t_probs.view(-1, teacher_logits.size(-1))[mask_flat]

        # The T^2 factor rescales the KD term after dividing the logits by T.
        kd_loss = F.kl_div(
            s_flat,
            t_flat,
            reduction="batchmean",
        ) * (T * T)

        # ---- COSINE LOSS ----
        # Cast the mask to a floating dtype so pooling does not depend on how
        # clamp() promotes an integer tensor against a float minimum. Token
        # counts are whole numbers, so a floor of 1 only guards the empty case.
        mask = attention_mask.unsqueeze(-1).to(student_hiddens.dtype)
        mask_sum = mask.sum(dim=1).clamp(min=1.0)
        student_mean = (student_hiddens * mask).sum(dim=1) / mask_sum
        teacher_mean = (teacher_hiddens * mask).sum(dim=1) / mask_sum

        # Target +1 asks for the two pooled vectors to point the same way.
        cos_target = student_mean.new_ones(student_mean.size(0))
        cos_loss = nn.CosineEmbeddingLoss()(student_mean, teacher_mean, cos_target)

        # ---- TOTAL ----
        loss = (
            self.alpha_mlm * mlm_loss +
            self.alpha_kd  * kd_loss +
            self.alpha_cos * cos_loss
        )

        # Accumulate instead of calling self.log() here: logging on every
        # forward pass adds one log_history entry per step and forces three
        # device-to-host syncs per batch.
        if model.training:
            self._accumulate_components(
                loss_mlm=mlm_loss,
                loss_kd=kd_loss,
                loss_cos=cos_loss,
            )

        if return_outputs:
            return loss, outputs_student
        return loss
# ============================================================
# 8. TrainingArguments (compat√≠vel com seu outro notebook)
# ============================================================

def create_training_args():
    """
    Build the TrainingArguments used for the distillation run.

    No evaluation/save strategy is set, so the Trainer defaults apply.
    Mixed precision (fp16) is requested only when CUDA is available, so the
    CPU fallback chosen for ``device`` elsewhere in this notebook still gets a
    valid configuration.
    """
    return TrainingArguments(
        output_dir="./bert_tiny_pt_distill",
        overwrite_output_dir=True,

        num_train_epochs=3,
        per_device_train_batch_size=8,
        learning_rate=5e-5,
        weight_decay=0.01,

        logging_steps=2000,
        save_total_limit=2,
        # fp16 mixed precision needs a GPU; keep it off on CPU-only machines.
        fp16=torch.cuda.is_available(),
        report_to="none",
        # Keep dataset columns as-is; compute_loss reads the batch keys itself.
        remove_unused_columns=False,
    )

# ============================================================
# 9. Pipeline de treinamento
# ============================================================

# 9.1 Load the filtered brWaC dataset and create the train/test split
ds_brwac = load_and_filter_brwac(split=BRWAC_SLICE)
ds_train, ds_test = train_test_split_dataset(ds_brwac)

# 9.2 Create the teacher and its (Portuguese) tokenizer
teacher, teacher_tok = create_teacher(TEACHER_NAME)

# 9.3 Create the student (BERT-tiny architecture) with the teacher's vocabulary
#     and the teacher->student projection layer
student, student_tok = create_student(
    tiny_base_name=TINY_BASE_NAME,
    teacher=teacher,
    teacher_tok=teacher_tok,
)

# 9.4 Tokenize the datasets with the student's tokenizer
ds_train_tok, ds_test_tok = tokenize_datasets(ds_train, ds_test, student_tok)

# 9.5 Create the MLM data collator
data_collator = create_data_collator(student_tok)

# 9.6 TrainingArguments
training_args = create_training_args()

# 9.7 Custom trainer (DistillTrainer) with the loss weights used for this run
trainer = DistillTrainer(
    model=student,
    args=training_args,
    train_dataset=ds_train_tok,
    eval_dataset=ds_test_tok,
    data_collator=data_collator,
    teacher=teacher,
    temperature=2.0,
    alpha_mlm=1.0,
    alpha_kd=0.5,
    alpha_cos=0.1,
)

# ============================================================
# 10. Training and saving
# ============================================================
print("\nIniciando treino...\n")
trainer.train()

# Save the distilled model and its tokenizer to the same directory.
# NOTE(review): the saved checkpoint also carries `proj_teacher.weight`, which
# a plain BertForMaskedLM load reports as unused (see the later cell's output).
out_dir = "./bert_tiny_pt_final_distilled"
trainer.save_model(out_dir)
student_tok.save_pretrained(out_dir)

print("Modelo salvo em:", out_dir)


Device: cuda
Carregando brWaC (train)...
Ap√≥s filtro: Dataset({
    features: ['text'],
    num_rows: 3247418
})

Criando split train/test...
Tamanho treino: 2597934
Tamanho teste : 649484


Some weights of the model checkpoint at neuralmind/bert-base-portuguese-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Teacher carregado: neuralmind/bert-base-portuguese-cased
Teacher hidden size: 768
Student criado com arquitetura do: prajjwal1/bert-tiny
Student hidden size: 128
Vocab size (PT-BR): 29794
Tokenizando treino e teste...


Map:   0%|          | 0/649484 [00:00<?, ? examples/s]

Exemplo tokenizado:
{'input_ids': tensor([  101,  1602, 16560, 18878,   125, 13969, 22308,   221,  4944,   133,
          192, 22320, 22319,   135,   533,  2934, 13039,   735,   143,  1376,
          538,   532,  3232,  1564,   117,  1016,   271,  1376,   260, 18878,
        13969, 22308,   119,   133,   192, 22320, 22319,   135,   530,  3983,
         3189,   370,  3369,   532,  2465, 20561,   291,  4895,   173,  4944,
          117,   449,   346,  3189,  3598,   578,   785,   596,   122,  7030,
          311,  2650,   214,   202,  5897,   122,  6009,   214,  9603,  4944,
          120, 13969, 22308,   117,  4816,   397,   179,  3983,   121, 22325,
          230,  2389,  4419,   202, 18726, 20264, 12987,  9218, 22281,   113,
         7343, 14972,   114,   291,  7167, 13065,  9218, 22281,   119,  5653,
          117,   179,   376,  1821,  1117,  1529,   125,  2465,   122,  3206,
          125,  2075,  6176,   119,   133,   192, 22320, 22319,   135,  4534,
          328,   179, 18878, 1

Step,Training Loss
100,25.8394
200,24.4093
300,23.1992
400,22.1629
500,21.421
600,20.5718
700,19.9999
800,19.5655
900,19.2691
1000,18.9452


In [None]:
# ============================================================
#  Avalia√ß√£o MLM: BERT-tiny-PT distilado vs Tiny EN vs BERTimbau
# ============================================================

from transformers import (
    AutoModelForMaskedLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
)
from torch.utils.data import DataLoader
import torch
import re

# Use the GPU when CUDA is available, otherwise the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

# ============================================================
# 0) Make sure ds_test exists and has a 'text' column
# ============================================================
# This cell reuses the ds_test created by the training cell above.
assert "ds_test" in globals(), "ds_test n√£o encontrado: rode antes o c√≥digo de treino que cria ds_test."
assert "text" in ds_test.column_names, "ds_test precisa ter a coluna 'text'."


# ============================================================
#  Fun√ß√£o para avaliar MLM loss (por token mascarado)
# ============================================================
def evaluate_mlm_loss(model, dataloader):
    """
    Return the average MLM loss per masked token over a dataloader.

    The model is put in eval mode and run without gradients. Each batch loss
    is weighted by that batch's number of masked positions (labels != -100);
    batches with no masked position are ignored. Returns NaN when no masked
    token was seen at all.
    """
    model.eval()

    # Device of the model, taken from its first parameter.
    target_device = next(model.parameters()).device

    loss_sum = 0.0
    token_count = 0

    with torch.no_grad():
        for raw_batch in dataloader:
            # Move tensors to the model's device; leave other values untouched.
            batch = {}
            for key, value in raw_batch.items():
                if isinstance(value, torch.Tensor):
                    value = value.to(target_device)
                batch[key] = value

            result = model(
                input_ids=batch["input_ids"],
                attention_mask=batch["attention_mask"],
                labels=batch["labels"],
            )

            n_masked = (batch["labels"] != -100).sum().item()
            if n_masked == 0:
                continue

            # The batch loss is treated as a mean over masked tokens, so it is
            # multiplied back by their count before accumulating.
            loss_sum += result.loss.item() * n_masked
            token_count += n_masked

    if token_count > 0:
        return loss_sum / token_count
    return float("nan")


# ============================================================
# Monta DataLoader de MLM para qualquer modelo/tokenizer
# ============================================================
def build_mlm_loader(
    ds,
    tokenizer,
    max_length: int = 128,
    batch_size: int = 8,
    text_clean_fn=None,
):
    """
    Build an MLM DataLoader for a dataset with a 'text' column.

    Args:
        ds: dataset with a 'text' column.
        tokenizer: tokenizer of the model being evaluated.
        max_length: length every sequence is truncated/padded to.
        batch_size: DataLoader batch size.
        text_clean_fn: optional callable applied to each text before tokenizing.

    Returns:
        A DataLoader over the "input_ids" / "attention_mask" columns, collated
        by a DataCollatorForLanguageModeling with mlm_probability 0.15.
    """
    def _encode(batch):
        texts = batch["text"]
        if text_clean_fn is not None:
            texts = [text_clean_fn(t) for t in texts]
        return tokenizer(
            texts,
            truncation=True,
            padding="max_length",
            max_length=max_length,
        )

    tokenized = ds.map(_encode, batched=True)

    mlm_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm_probability=0.15,
    )

    # Drop every column that is not a model input before batching.
    model_inputs = ["input_ids", "attention_mask"]
    extra_columns = [name for name in tokenized.column_names if name not in model_inputs]

    return DataLoader(
        tokenized.remove_columns(extra_columns),
        batch_size=batch_size,
        collate_fn=mlm_collator,
    )


# ============================================================
# 1) EVALUATION - distilled BERT-tiny-PT (the new model)
# ============================================================
# NOTE(review): each model below is evaluated with its own tokenizer, so the
# masked-token losses are computed over different vocabularies/tokenizations.
print("\nAvaliando modelo distilado (Portugu√™s)...")

# Directory written by the training cell.
student_dir = "./bert_tiny_pt_final_distilled"

student = AutoModelForMaskedLM.from_pretrained(student_dir).to(device)
student_tok = AutoTokenizer.from_pretrained(student_dir)

loader_student = build_mlm_loader(ds_test, student_tok)

loss_pt = evaluate_mlm_loss(student, loader_student)
print(f"BERT-tiny-PT (distilado): MLM loss = {loss_pt:.4f}")


# ============================================================
# 2) EVALUATION - original BERT-tiny (English)
# ============================================================
print("\nAvaliando modelo original (ingl√™s)...")

tok_tiny_orig = AutoTokenizer.from_pretrained("prajjwal1/bert-tiny")
tiny_orig = AutoModelForMaskedLM.from_pretrained("prajjwal1/bert-tiny").to(device)

loader_tiny_orig = build_mlm_loader(ds_test, tok_tiny_orig)

loss_orig = evaluate_mlm_loss(tiny_orig, loader_tiny_orig)
print(f"BERT-tiny original: MLM loss = {loss_orig:.4f}")


# ============================================================
# 3) EVALUATION - BERTimbau-base
# ============================================================
print("\nAvaliando BERTimbau-base (teacher)...")

tok_imbau = AutoTokenizer.from_pretrained("neuralmind/bert-base-portuguese-cased")

def clean_for_imbau(text: str) -> str:
    """
    Strip angle-bracket tags and collapse whitespace before tokenizing.

    Anything matching ``<...>`` (e.g. <END>, <DOC>, HTML tags) becomes a space,
    runs of whitespace collapse to one space, and the result is trimmed. An
    empty result is replaced by "." so the output is never empty.
    """
    without_tags = re.sub(r"<[^>]+>", " ", text)
    cleaned = re.sub(r"\s+", " ", without_tags).strip()
    return cleaned or "."

bertimbau = AutoModelForMaskedLM.from_pretrained(
    "neuralmind/bert-base-portuguese-cased"
).to(device)

# NOTE(review): only this loader cleans the text (clean_for_imbau); the two
# loaders built earlier in this cell tokenize the raw text.
loader_imbau = build_mlm_loader(
    ds_test,
    tok_imbau,
    text_clean_fn=clean_for_imbau,
)

loss_imbau = evaluate_mlm_loss(bertimbau, loader_imbau)
print(f"BERTimbau-base: MLM loss = {loss_imbau:.4f}")


# ============================================================
# 4) Final comparison
# ============================================================
print("\n====== Compara√ß√£o Final ======")
print(f"BERT-tiny-PT (distilado): {loss_pt:.4f}")
print(f"BERT-tiny original      : {loss_orig:.4f}")
print(f"BERTimbau-base          : {loss_imbau:.4f}")

# Gains are "other loss - distilled loss": positive means the distilled model
# has the lower loss.
print(f"\nGanho tiny-PT vs tiny original: {loss_orig - loss_pt:.4f}")
print(f"Ganho tiny-PT vs BERTimbau   : {loss_imbau - loss_pt:.4f}")


Some weights of the model checkpoint at ./bert_tiny_pt_final_distilled were not used when initializing BertForMaskedLM: ['proj_teacher.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Device: cuda

Avaliando modelo distilado (Portugu√™s)...
BERT-tiny-PT (distilado): MLM loss = 3.3025

Avaliando modelo original (ingl√™s)...
BERT-tiny original: MLM loss = 5.7496

Avaliando BERTimbau-base (teacher)...


Some weights of the model checkpoint at neuralmind/bert-base-portuguese-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BERTimbau-base: MLM loss = 1.7373

BERT-tiny-PT (distilado): 3.3025
BERT-tiny original      : 5.7496
BERTimbau-base          : 1.7373

Ganho tiny-PT vs tiny original: 2.4471
Ganho tiny-PT vs BERTimbau   : -1.5653


# aqui

# Twitter

In [None]:
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
import torch

# ============================================================
# 0) Device
# ============================================================
# Use the GPU when CUDA is available, otherwise the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

# ============================================================
# 1) Load TweetSentBR few-shot
# ============================================================
# Only the dataset's "train" split is used; it is re-split 80/20 with a fixed seed.
# NOTE(review): the recorded run printed 60 train / 15 test examples, so the
# metrics computed below are based on very few samples.
tweet = load_dataset("eduagarcia/tweetsentbr_fewshot")
tweet = tweet["train"].train_test_split(test_size=0.2, seed=42)
train_ds, test_ds = tweet["train"], tweet["test"]

print(f"Tamanho ‚Üí treino: {len(train_ds)} | teste: {len(test_ds)}")
print(train_ds[0])

# ============================================================
# 2) Fun√ß√£o de tokeniza√ß√£o
# ============================================================
def tokenize_fn(batch, tokenizer, max_length: int = 128):
    """
    Tokenize the "sentence" column of a batch.

    Every sequence is truncated and padded to exactly ``max_length`` tokens.
    """
    options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": max_length,
    }
    return tokenizer(batch["sentence"], **options)

# ============================================================
# 3) Tokenizers
# ============================================================
# Path of the distilled Tiny-PT model saved by the training cell
model_path_pt = "./bert_tiny_pt_final_distilled"

# Distilled PT student
tok_pt = AutoTokenizer.from_pretrained(model_path_pt)

# Original Tiny (English)
tok_en = AutoTokenizer.from_pretrained("prajjwal1/bert-tiny")

# BERTimbau
tok_imbau = AutoTokenizer.from_pretrained("neuralmind/bert-base-portuguese-cased")

# ============================================================
# 4) Tokenize train/test once per model (each with its own tokenizer)
# ============================================================
# Distilled Tiny-PT
train_tok_pt = train_ds.map(lambda x: tokenize_fn(x, tok_pt), batched=True)
test_tok_pt  = test_ds.map(lambda x: tokenize_fn(x, tok_pt), batched=True)

# Original Tiny (English)
train_tok_en = train_ds.map(lambda x: tokenize_fn(x, tok_en), batched=True)
test_tok_en  = test_ds.map(lambda x: tokenize_fn(x, tok_en), batched=True)

# BERTimbau
train_tok_imbau = train_ds.map(lambda x: tokenize_fn(x, tok_imbau), batched=True)
test_tok_imbau  = test_ds.map(lambda x: tokenize_fn(x, tok_imbau), batched=True)

# ============================================================
# 5) Normalize labels
# ============================================================
# String label -> class id, and the inverse mapping.
label2id = {"Negative": 0, "Neutral": 1, "Positive": 2}
id2label = {v: k for k, v in label2id.items()}

def encode_labels(example):
    """Return a "labels" field holding the class id of the example's string label."""
    label_id = label2id[example["label"]]
    return {"labels": label_id}

# Add the integer "labels" column to every tokenized split.
train_tok_pt = train_tok_pt.map(encode_labels)
test_tok_pt  = test_tok_pt.map(encode_labels)

train_tok_en = train_tok_en.map(encode_labels)
test_tok_en  = test_tok_en.map(encode_labels)

train_tok_imbau = train_tok_imbau.map(encode_labels)
test_tok_imbau  = test_tok_imbau.map(encode_labels)

# ============================================================
# 6) Remove extra columns
# ============================================================
# Columns kept in the datasets handed to the Trainer.
cols_to_keep = ["input_ids", "attention_mask", "labels"]

def strip_columns(ds):
    """Return ``ds`` without any column that is not listed in ``cols_to_keep``."""
    extras = []
    for column in ds.column_names:
        if column not in cols_to_keep:
            extras.append(column)
    return ds.remove_columns(extras)

# Keep only the columns listed in cols_to_keep in every split.
train_tok_pt = strip_columns(train_tok_pt)
test_tok_pt  = strip_columns(test_tok_pt)

train_tok_en = strip_columns(train_tok_en)
test_tok_en  = strip_columns(test_tok_en)

train_tok_imbau = strip_columns(train_tok_imbau)
test_tok_imbau  = strip_columns(test_tok_imbau)

# ============================================================
# 7) M√©tricas
# ============================================================
def compute_metrics(eval_pred):
    """
    Compute accuracy and macro-averaged F1 from a ``(logits, labels)`` pair.

    Predictions are the argmax of the logits over the last axis.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)
    return {
        "accuracy": accuracy_score(gold, predicted),
        "f1": f1_score(gold, predicted, average="macro"),
    }

# ============================================================
# 8) Helper para treinar + avaliar um modelo
# ============================================================
def train_and_eval(model_name, tokenizer, train_ds, test_ds, output_dir):
    """
    Fine-tune a 3-class sequence classifier and evaluate it on the test set.

    Args:
        model_name: checkpoint name or path passed to ``from_pretrained``.
        tokenizer: tokenizer matching ``model_name``; used by the padding collator.
        train_ds: tokenized training dataset.
        test_ds: tokenized evaluation dataset.
        output_dir: directory for the Trainer outputs.

    Returns:
        The metrics dict returned by ``trainer.evaluate()``.
    """
    # Local import: only needed to probe the installed Trainer's signature.
    import inspect

    print(f"\n===== Treinando e avaliando: {model_name} =====")

    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=3,
    ).to(device)

    # Optional: store the label mapping in the config (helps when reading logs).
    model.config.label2id = label2id
    model.config.id2label = id2label
    model.config.problem_type = "single_label_classification"

    args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        learning_rate=2e-5,
        num_train_epochs=3,
        weight_decay=0.01,
        logging_steps=100,
        save_total_limit=1,
        report_to="none",    # disables W&B / TensorBoard
        # remove_unused_columns=True is the default and is fine here, since
        # only input_ids / attention_mask / labels were kept.
    )

    # Newer Transformers releases take the tokenizer as `processing_class` and
    # deprecate the `tokenizer` keyword; use whichever the installed Trainer has.
    trainer_params = inspect.signature(Trainer.__init__).parameters
    tokenizer_kwarg = "processing_class" if "processing_class" in trainer_params else "tokenizer"

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_ds,
        eval_dataset=test_ds,
        compute_metrics=compute_metrics,
        data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
        **{tokenizer_kwarg: tokenizer},
    )

    trainer.train()
    metrics = trainer.evaluate()
    print("M√©tricas:", metrics)
    return metrics

# ============================================================
# 9) Distilled Tiny-PT (Portuguese)
# ============================================================
metrics_pt = train_and_eval(
    model_name=model_path_pt,
    tokenizer=tok_pt,
    train_ds=train_tok_pt,
    test_ds=test_tok_pt,
    output_dir="./eval_tiny_pt_distilled",
)

# ============================================================
# 10) Original Tiny (English)
# ============================================================
metrics_en = train_and_eval(
    model_name="prajjwal1/bert-tiny",
    tokenizer=tok_en,
    train_ds=train_tok_en,
    test_ds=test_tok_en,
    output_dir="./eval_tiny_en",
)

# ============================================================
# 11) BERTimbau-base
# ============================================================
metrics_imbau = train_and_eval(
    model_name="neuralmind/bert-base-portuguese-cased",
    tokenizer=tok_imbau,
    train_ds=train_tok_imbau,
    test_ds=test_tok_imbau,
    output_dir="./eval_bertimbau",
)

# ============================================================
# 12) Final comparison
# ============================================================
# 'eval_f1' is the macro F1 computed by compute_metrics.
print("\n====== COMPARA√á√ÉO FINAL (F1) ======")
print(f"F1 Tiny-PT (distilado): {metrics_pt['eval_f1']:.4f}")
print(f"F1 Tiny original       : {metrics_en['eval_f1']:.4f}")
print(f"F1 BERTimbau           : {metrics_imbau['eval_f1']:.4f}")

# Gains are "distilled F1 - other F1": positive means the distilled model is better.
print("\n====== GANHOS ======")
print(f"Ganho Tiny-PT vs Tiny original : {metrics_pt['eval_f1'] - metrics_en['eval_f1']:.4f}")
print(f"Ganho Tiny-PT vs BERTimbau     : {metrics_pt['eval_f1'] - metrics_imbau['eval_f1']:.4f}")


Device: cuda
Tamanho ‚Üí treino: 60 | teste: 15
{'id': 76, 'sentence': 'se o lindo USERNAME sair eu nem sei viu', 'label': 'Positive'}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./bert_tiny_pt_final_distilled and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



===== Treinando e avaliando: ./bert_tiny_pt_final_distilled =====


  trainer = Trainer(


Step,Training Loss


M√©tricas: {'eval_loss': 1.1071614027023315, 'eval_accuracy': 0.4, 'eval_f1': 0.19999999999999998, 'eval_runtime': 0.0051, 'eval_samples_per_second': 2948.061, 'eval_steps_per_second': 196.537, 'epoch': 3.0}

===== Treinando e avaliando: prajjwal1/bert-tiny =====


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss


M√©tricas: {'eval_loss': 1.0790365934371948, 'eval_accuracy': 0.3333333333333333, 'eval_f1': 0.35833333333333334, 'eval_runtime': 0.0055, 'eval_samples_per_second': 2747.361, 'eval_steps_per_second': 183.157, 'epoch': 3.0}

===== Treinando e avaliando: neuralmind/bert-base-portuguese-cased =====


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss


M√©tricas: {'eval_loss': 1.0207682847976685, 'eval_accuracy': 0.3333333333333333, 'eval_f1': 0.16666666666666666, 'eval_runtime': 0.0141, 'eval_samples_per_second': 1062.62, 'eval_steps_per_second': 70.841, 'epoch': 3.0}

F1 Tiny-PT (distilado): 0.2000
F1 Tiny original       : 0.3583
F1 BERTimbau           : 0.1667

Ganho Tiny-PT vs Tiny original : -0.1583
Ganho Tiny-PT vs BERTimbau     : 0.0333


# Ignorar essa parte de baixo

# Avaliação de Similaridade Textual (STS) do ASSIN2.

In [None]:
# ============================================================
#  Avalia√ß√£o STS (ASSIN2) ‚Äì Pearson correlation
# ============================================================

import torch
from transformers import AutoTokenizer, AutoModel
from datasets import load_dataset
from scipy.stats import pearsonr
import numpy as np

# ============================================================
# 1) Dataset ASSIN2
# ============================================================
# Load ASSIN2 and keep its "test" split for the STS evaluation below.
assin = load_dataset("assin2")
test_assin = assin["test"]

# Print one record so the available fields are visible in the cell output.
print("Exemplo:", test_assin[0])

# Use the GPU when PyTorch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Device:", device)

# ============================================================
# 2) Função auxiliar — embedding médio
# ============================================================
def embed_sentence(model, tokenizer, sentence, max_length: int = 128):
    """
    Return a sentence embedding: the attention-mask-weighted mean of the
    model's last hidden layer.

    The sentence is tokenized with truncation and padded to ``max_length``.
    Padding positions are excluded from the average; a plain ``mean(dim=1)``
    over the padded sequence would mix the hidden states of the padding
    tokens into the embedding and make it depend on ``max_length``.

    Args:
        model: encoder exposing ``.device`` and returning an object with
            ``last_hidden_state`` when called with the tokenizer output.
        tokenizer: callable producing ``input_ids`` / ``attention_mask``
            tensors (``return_tensors="pt"``) that can be moved with ``.to``.
        sentence: text to embed.
        max_length: truncation / padding length in tokens.

    Returns:
        The pooled embedding with the batch dimension squeezed away.
    """
    enc = tokenizer(
        sentence,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=max_length,
    ).to(model.device)

    with torch.no_grad():
        out = model(**enc, output_hidden_states=False)

    hidden = out.last_hidden_state  # expected [1, seq_len, hidden]
    # 1 for real tokens, 0 for padding; broadcast over the hidden dimension.
    mask = enc["attention_mask"].unsqueeze(-1).to(hidden.dtype)

    summed = (hidden * mask).sum(dim=1)
    # clamp guards against division by zero if the mask were entirely empty.
    token_count = mask.sum(dim=1).clamp(min=1.0)

    emb = summed / token_count
    return emb.squeeze(0)


# ============================================================
# 3) Função de avaliação — correlação de Pearson
# ============================================================
def evaluate_sts(model, tokenizer, dataset):
    """
    Score a model on an STS dataset with Pearson correlation.

    For every record, the premise and the hypothesis are embedded with
    ``embed_sentence`` and compared by cosine similarity. The returned value
    is the Pearson coefficient between those similarities and the
    ``relatedness_score`` field of the records.
    """
    model.eval()
    cosine = torch.nn.functional.cosine_similarity

    predicted_sims = []
    human_scores = []
    for example in dataset:
        premise_vec = embed_sentence(model, tokenizer, example["premise"])
        hypothesis_vec = embed_sentence(model, tokenizer, example["hypothesis"])

        # Embeddings are 1-D here, so the similarity is taken along dim 0.
        predicted_sims.append(cosine(premise_vec, hypothesis_vec, dim=0).item())
        human_scores.append(example["relatedness_score"])

    # pearsonr returns (coefficient, p-value); only the coefficient is used.
    result = pearsonr(predicted_sims, human_scores)
    return result[0]


# ============================================================
# 4) BERT-tiny-PT (distilado)
# ============================================================
def _load_and_score_sts(heading, result_label, checkpoint):
    """Load tokenizer and encoder from `checkpoint`, score them on `test_assin`
    with `evaluate_sts`, print the result and return (tokenizer, model, pearson)."""
    print(f"\nAvaliando {heading}...")
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    encoder = AutoModel.from_pretrained(checkpoint).to(device)
    pearson = evaluate_sts(encoder, tokenizer, test_assin)
    print(f"{result_label} ‚Üí Pearson = {pearson:.4f}")
    return tokenizer, encoder, pearson


# Distilled Portuguese tiny model (or v5_token_bert, if that is the version in use).
model_path_pt = "./bert_tiny_pt_final_distilled"
tok_pt, model_pt, pearson_pt = _load_and_score_sts(
    "BERT-tiny-PT (distilado)",
    "BERT-tiny-PT (distilado)",
    model_path_pt,
)


# ============================================================
# 5) Original BERT-tiny (English)
# ============================================================
tok_en, model_en, pearson_en = _load_and_score_sts(
    "BERT-tiny original (ingl√™s)",
    "BERT-tiny original (ingl√™s)",
    "prajjwal1/bert-tiny",
)


# ============================================================
# 6) BERTimbau-base (teacher)
# ============================================================
tok_imbau, model_imbau, pearson_imbau = _load_and_score_sts(
    "BERTimbau-base (teacher)",
    "BERTimbau-base",
    "neuralmind/bert-base-portuguese-cased",
)


# ============================================================
# 7) Final comparison
# ============================================================
print("\n=========== COMPARA√á√ÉO ASSIN2 (STS ‚Äì Pearson) ===========")
# Labels are left-aligned to the width of the longest one so the colons line up.
for row_label, row_score in (
    ("BERTimbau-base", pearson_imbau),
    ("BERT-tiny-PT (distilado)", pearson_pt),
    ("BERT-tiny original", pearson_en),
):
    print(f"{row_label:<24}: {row_score:.4f}")
print("=========================================================")


Some weights of BertModel were not initialized from the model checkpoint at ./bert_tiny_pt_final_distilled and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Exemplo: {'sentence_pair_id': 0, 'premise': 'O cachorro caramelo est√° assistindo um cachorro castanho que est√° nadando em uma lagoa', 'hypothesis': 'Um cachorro de estima√ß√£o est√° de p√© no banco e est√° olhando outro cachorro, que √© castanho, na lagoa', 'relatedness_score': 3.799999952316284, 'entailment_judgment': 0}
Device: cuda

Avaliando BERT-tiny-PT (distilado)...
BERT-tiny-PT (distilado) ‚Üí Pearson = 0.4400

Avaliando BERT-tiny original (ingl√™s)...
BERT-tiny original (ingl√™s) ‚Üí Pearson = 0.5262

Avaliando BERTimbau-base (teacher)...
BERTimbau-base ‚Üí Pearson = 0.6139

BERTimbau-base          : 0.6139
BERT-tiny-PT (distilado): 0.4400
BERT-tiny original      : 0.5262


# CLASSIFICAÇÃO: “IGUAL / CONTRA / NEUTRA” (RTE) no ASSIN2

In [None]:
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
import torch

# ============================================================
# 0) Device
# ============================================================
# Use the GPU when PyTorch reports one, otherwise the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

# ============================================================
# 1) Load ASSIN2 (RTE task)
# ============================================================
assin = load_dataset("assin2")
train = assin["train"]
# NOTE(review): the original comment described "validation" as the official
# benchmark test set, but the STS cell earlier in this notebook reads a
# separate assin["test"] split — confirm which split is intended here.
test  = assin["validation"]

# Print one training record so the available fields are visible.
print("Exemplo treino:", train[0])

# Task label maps. The dataset already stores integer labels in
# 'entailment_judgment'.
# NOTE(review): in the saved output of this cell the first training record
# has entailment_judgment=1 for a premise that entails its hypothesis, which
# disagrees with "entailment": 0 below — verify the class order and the
# number of classes against the dataset card.
label2id = {"entailment": 0, "contradiction": 1, "neutral": 2}
id2label = {v: k for k, v in label2id.items()}

def encode_labels(example):
    """Copy the record's 'entailment_judgment' into a 'labels' field as an int.

    The dataset is expected to already carry numeric class ids in that column.
    """
    judgment = example["entailment_judgment"]
    return dict(labels=int(judgment))

# Função de tokenização (premissa + hipótese)
def tokenize_batch(batch, tokenizer, max_length: int = 128):
    """Tokenize a batch of (premise, hypothesis) pairs.

    Both text columns are passed to the tokenizer as a sentence pair, with
    truncation enabled and padding up to ``max_length`` tokens.
    """
    premises = batch["premise"]
    hypotheses = batch["hypothesis"]
    options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": max_length,
    }
    return tokenizer(premises, hypotheses, **options)

# ============================================================
# Métricas (accuracy e F1 macro)
# ============================================================
def compute_metrics(pred):
    """Compute accuracy and macro-averaged F1 from a (logits, labels) pair.

    The predicted class of each example is the argmax over the last axis of
    the logits.
    """
    logits, labels = pred
    predicted_classes = np.argmax(logits, axis=-1)

    scores = {}
    scores["accuracy"] = accuracy_score(labels, predicted_classes)
    scores["f1"] = f1_score(labels, predicted_classes, average="macro")
    return scores

# ============================================================
# Helper pra tokenizar e preparar dataset
# ============================================================
def prepare_dataset(tokenizer):
    """Tokenize the module-level `train` / `test` splits for classification.

    Each split is tokenized as premise/hypothesis pairs with `tokenize_batch`,
    gets a `labels` column from `encode_labels`, and is then reduced to the
    model input columns.

    `token_type_ids` is kept whenever the tokenizer produces it: for
    sentence-pair inputs it is what tells a BERT model which tokens belong to
    the premise and which to the hypothesis, so dropping it removes the
    segment information from the input.

    Args:
        tokenizer: tokenizer forwarded to `tokenize_batch`.

    Returns:
        (train_tok, test_tok): the two prepared datasets.
    """
    model_columns = {"input_ids", "token_type_ids", "attention_mask", "labels"}

    def _encode(split):
        encoded = split.map(
            tokenize_batch,
            batched=True,
            fn_kwargs={"tokenizer": tokenizer},
        )
        encoded = encoded.map(encode_labels)
        # Decide what to drop per split instead of assuming that both splits
        # expose exactly the same columns.
        extra = [c for c in encoded.column_names if c not in model_columns]
        return encoded.remove_columns(extra)

    train_tok = _encode(train)
    test_tok = _encode(test)
    return train_tok, test_tok

# ============================================================
# Shared helpers for the three RTE runs
# ============================================================
# Every checkpoint below is loaded with a newly initialized classification
# head (the library warns about it when loading), so calling evaluate()
# without training only measures a random classifier. Each model is therefore
# fine-tuned on the training split before it is evaluated.
RTE_NUM_EPOCHS = 3


def _rte_label_maps(dataset, column="entailment_judgment"):
    """Return (id2label, label2id) for `column`, read from `dataset` itself."""
    names = getattr(dataset.features[column], "names", None)
    if not names:
        # Column carries no class names: fall back to the distinct values
        # found in the data (assumes they are the ids 0..k-1 — TODO confirm).
        names = [str(value) for value in sorted(set(dataset[column]))]
    id_to_name = dict(enumerate(names))
    name_to_id = {name: idx for idx, name in id_to_name.items()}
    return id_to_name, name_to_id


# Taken from the data rather than hard-coded, so the size of the classifier
# head always matches the classes that actually occur in the dataset.
rte_id2label, rte_label2id = _rte_label_maps(train)


def _build_rte_trainer(checkpoint, output_dir, tokenizer, train_ds, eval_ds):
    """Create the classifier, TrainingArguments and Trainer for one checkpoint.

    Returns (model, args, trainer); the caller runs train() and evaluate().
    """
    model = AutoModelForSequenceClassification.from_pretrained(
        checkpoint,
        num_labels=len(rte_id2label),
        id2label=rte_id2label,
        label2id=rte_label2id,
    ).to(device)

    args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,
        num_train_epochs=RTE_NUM_EPOCHS,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        logging_steps=100,
        save_total_limit=1,
        report_to="none",
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_ds,
        eval_dataset=eval_ds,
        compute_metrics=compute_metrics,
        data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    )
    return model, args, trainer


# ============================================================
# 2) BERTimbau (teacher)
# ============================================================
tok_imbau = AutoTokenizer.from_pretrained("neuralmind/bert-base-portuguese-cased")
train_tok_imbau, test_tok_imbau = prepare_dataset(tok_imbau)

model_imbau, args_imbau, trainer_imbau = _build_rte_trainer(
    "neuralmind/bert-base-portuguese-cased",
    "./eval_imbau_rte",
    tok_imbau,
    train_tok_imbau,
    test_tok_imbau,
)

trainer_imbau.train()
metrics_imbau = trainer_imbau.evaluate()
print("\nBERTimbau-base (RTE):", metrics_imbau)

# ============================================================
# 3) Original BERT-tiny (English)
# ============================================================
tok_tiny_en = AutoTokenizer.from_pretrained("prajjwal1/bert-tiny")
train_tok_en, test_tok_en = prepare_dataset(tok_tiny_en)

model_tiny_en, args_en, trainer_en = _build_rte_trainer(
    "prajjwal1/bert-tiny",
    "./eval_tiny_en_rte",
    tok_tiny_en,
    train_tok_en,
    test_tok_en,
)

trainer_en.train()
metrics_tiny_en = trainer_en.evaluate()
print("\nBERT-tiny original (RTE):", metrics_tiny_en)

# ============================================================
# 4) BERT-tiny-PT (distilled in Portuguese with the BERTimbau tokenizer)
# ============================================================
# Adjust here if the final path is different (e.g. "./bert_tiny_pt_final_distilled").
MODEL_PT_PATH = "bert_tiny_pt_final_distilled"

tok_tiny_pt = AutoTokenizer.from_pretrained(MODEL_PT_PATH)
train_tok_pt, test_tok_pt = prepare_dataset(tok_tiny_pt)

model_tiny_pt, args_pt, trainer_pt = _build_rte_trainer(
    MODEL_PT_PATH,
    "./eval_tiny_pt_rte",
    tok_tiny_pt,
    train_tok_pt,
    test_tok_pt,
)

trainer_pt.train()
metrics_tiny_pt = trainer_pt.evaluate()
print("\nBERT-tiny-PT (RTE):", metrics_tiny_pt)

# ============================================================
# 5) Final comparison
# ============================================================
print("\n========== RESULTADOS ASSIN2 (Entailment / RTE) ==========")
print("BERTimbau-base     :", metrics_imbau)
print("BERT-tiny original :", metrics_tiny_en)
print("BERT-tiny-PT       :", metrics_tiny_pt)


Device: cuda
Exemplo treino: {'sentence_pair_id': 1, 'premise': 'Uma crian√ßa risonha est√° segurando uma pistola de √°gua e sendo espirrada com √°gua', 'hypothesis': 'Uma crian√ßa est√° segurando uma pistola de √°gua', 'relatedness_score': 4.5, 'entailment_judgment': 1}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



BERTimbau-base (RTE): {'eval_loss': 1.2766132354736328, 'eval_model_preparation_time': 0.0044, 'eval_accuracy': 0.01, 'eval_f1': 0.012484394506866418, 'eval_runtime': 0.2784, 'eval_samples_per_second': 1795.833, 'eval_steps_per_second': 114.933}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert_tiny_pt_final_distilled and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



BERT-tiny original (RTE): {'eval_loss': 1.1525009870529175, 'eval_model_preparation_time': 0.0003, 'eval_accuracy': 0.478, 'eval_f1': 0.21826484018264838, 'eval_runtime': 0.0815, 'eval_samples_per_second': 6132.239, 'eval_steps_per_second': 392.463}



BERT-tiny-PT (RTE): {'eval_loss': 1.103992223739624, 'eval_model_preparation_time': 0.0003, 'eval_accuracy': 0.5, 'eval_f1': 0.22251891410769917, 'eval_runtime': 0.0829, 'eval_samples_per_second': 6028.932, 'eval_steps_per_second': 385.852}

BERTimbau-base     : {'eval_loss': 1.2766132354736328, 'eval_model_preparation_time': 0.0044, 'eval_accuracy': 0.01, 'eval_f1': 0.012484394506866418, 'eval_runtime': 0.2784, 'eval_samples_per_second': 1795.833, 'eval_steps_per_second': 114.933}
BERT-tiny original : {'eval_loss': 1.1525009870529175, 'eval_model_preparation_time': 0.0003, 'eval_accuracy': 0.478, 'eval_f1': 0.21826484018264838, 'eval_runtime': 0.0815, 'eval_samples_per_second': 6132.239, 'eval_steps_per_second': 392.463}
BERT-tiny-PT       : {'eval_loss': 1.103992223739624, 'eval_model_preparation_time': 0.0003, 'eval_accuracy': 0.5, 'eval_f1': 0.22251891410769917, 'eval_runtime': 0.0829, 'eval_samples_per_second': 6028.932, 'eval_steps_per_second': 385.852}
