In [6]:
# ─────────────────────────────
# 1. Standard library (já vem com o Python)
# ─────────────────────────────
import json
from pathlib import Path
import itertools  # se for gerar folds manualmente, por ex.
import random  # sementes

# ─────────────────────────────
# 2. Terceiros – instalação via pip/venv
# ─────────────────────────────
# ↳ dados
import pandas as pd  # leitura CSV
from datasets import Dataset, DatasetDict, load_dataset

# ↳ modelagem e métricas
# import numpy as np
# import torch
# from transformers import (
#     AutoTokenizer,
#     AutoModelForTokenClassification,
#     Trainer,
#     TrainingArguments,
# )
# from seqeval.metrics import f1_score  # F1 para NER
# from sklearn.model_selection import KFold, train_test_split

# # (opcional) tracking e visualização
# import mlflow  # ou tensorboard
# import matplotlib.pyplot as plt

In [37]:
from collections import Counter
from functools import partial
from typing import List, Optional, Tuple

import numpy as np
from datasets import Dataset, DatasetDict
from sentence_transformers import SentenceTransformer
from seqeval.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import BallTree
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    Trainer,
    TrainingArguments,
)

In [24]:
# Location of the cachaça NER annotations CSV, relative to the working directory.
csv_path = "data/cachacaNER.csv"

In [25]:
# Load the annotations; later cells group these rows by sentence, so each row
# is expected to describe a single token.
df = pd.read_csv(csv_path)

# a) Identify the sentence-ID column: the first candidate name present wins.
_sentence_col_candidates = ("sentence_id", "sentence", "sent_id")
_present = [name for name in _sentence_col_candidates if name in df.columns]
if not _present:
    raise KeyError(
        "Não encontrei coluna de ID de sentença. "
        f"Colunas disponíveis: {list(df.columns)}"
    )
SENT_COL = _present[0]

In [26]:
# Show which sentence-ID column was detected.
SENT_COL

'sentence'

In [28]:
def sent_to_record(sent_id, group):
    """Collapse the rows of one sentence into a single record.

    Args:
        sent_id: Value of the sentence-ID column shared by every row in ``group``.
        group: Rows of that sentence; must expose "token" and "tag" columns
            whose values support ``.tolist()``.

    Returns:
        A dict with the sentence ID (stored under the name held in
        ``SENT_COL``), the token list under "tokens" and the tag list under
        "ner_tags".
    """
    # Keep the sentence ID in the record so splits can be rebuilt from it later.
    record = {SENT_COL: sent_id}
    record["tokens"] = group["token"].tolist()
    record["ner_tags"] = group["tag"].tolist()
    return record


# One record per sentence; sort=False keeps sentences in their order of first
# appearance in the CSV.
_sentence_groups = df.groupby(SENT_COL, sort=False)
records_cachaca = [sent_to_record(key, rows) for key, rows in _sentence_groups]

# Hugging Face dataset holding every sentence of the corpus.
cachaca_full = Dataset.from_list(records_cachaca)

In [29]:
# Inspect the assembled dataset (feature names and row count).
cachaca_full

Dataset({
    features: ['sentence', 'tokens', 'ner_tags'],
    num_rows: 13628
})

In [30]:
# Peek at the first record to sanity-check the token/tag alignment.
records_cachaca[0]

{'sentence': 130,
 'tokens': ['NOME',
  'DA',
  'CACHAÇA',
  ':',
  'Porto',
  'Estrela',
  'Ouro',
  '1',
  'Litro'],
 'ner_tags': ['O',
  'O',
  'O',
  'O',
  'B-NOME_BEBIDA',
  'I-NOME_BEBIDA',
  'B-CLASSIFICACAO_BEBIDA',
  'B-VOLUME',
  'I-VOLUME']}

In [23]:
# NOTE(review): repeats the earlier SENT_COL inspection cell, and its execution
# count (23) is lower than that of the cell that assigns SENT_COL (25), so it
# was run out of order — probably safe to delete.
SENT_COL

'sentence'

In [31]:
# Build the train/test partition. When the CSV carries its own "trainingTest"
# column, follow it; otherwise fall back to a seeded random 80/20 split.
if "trainingTest" in df.columns:
    train_idx = df[df["trainingTest"] == "training"][SENT_COL].unique()
    test_idx = df[df["trainingTest"] == "test"][SENT_COL].unique()

    # Membership tests against a NumPy array scan the whole array every time;
    # sets make the per-record lookup constant-time.
    _train_ids = set(train_idx.tolist())
    _test_ids = set(test_idx.tolist())

    cachaca_train = cachaca_full.select(
        [i for i, rec in enumerate(records_cachaca) if rec[SENT_COL] in _train_ids]
    )
    cachaca_test = cachaca_full.select(
        [i for i, rec in enumerate(records_cachaca) if rec[SENT_COL] in _test_ids]
    )
else:
    # Fallback: random 80/20 split. Pick the splits by name rather than
    # relying on the order of the returned mapping.
    _fallback = cachaca_full.train_test_split(test_size=0.2, seed=42)
    cachaca_train, cachaca_test = _fallback["train"], _fallback["test"]

In [32]:
# Inspect the test partition.
cachaca_test

Dataset({
    features: ['sentence', 'tokens', 'ner_tags'],
    num_rows: 2847
})

In [33]:
# Inspect the training partition.
# NOTE(review): the displayed sizes (9454 train + 2847 test = 12301) add up to
# fewer rows than cachaca_full (13628) — presumably some sentences are labelled
# neither "training" nor "test"; confirm this is intended.
cachaca_train

Dataset({
    features: ['sentence', 'tokens', 'ner_tags'],
    num_rows: 9454
})

In [None]:
def random_splits(
    ds: Dataset, test_size=0.1, dev_size=0.1, seeds: List[int] = range(30)
) -> List[DatasetDict]:
    """Create one random train/dev/test partition of ``ds`` per seed.

    Args:
        ds: Dataset to partition.
        test_size: Share of ``ds`` that goes to the test split.
        dev_size: Share of ``ds`` that goes to the dev split.
        seeds: One shuffling seed per partition (30 partitions by default).

    Returns:
        A list of DatasetDicts with "train", "dev" and "test" splits, in the
        same order as ``seeds``.
    """

    def _partition(seed):
        # First carve off the test split ...
        outer = ds.train_test_split(test_size=test_size, seed=seed)
        # ... then take dev out of the remainder. The fraction is rescaled so
        # that dev ends up as ``dev_size`` of the full dataset.
        inner = outer["train"].train_test_split(
            test_size=dev_size / (1 - test_size), seed=seed
        )
        return DatasetDict(train=inner["train"], dev=inner["test"], test=outer["test"])

    return [_partition(seed) for seed in seeds]

In [None]:
# Dataset the heuristic splits are carved from: the full corpus, the same
# input the experiments cell hands to the other split builders.
ds = cachaca_full

# Heuristic 1 ▸ length: sentences at or above the 95th length percentile are
# held out.
_sentence_tokens = ds["tokens"]
lengths = np.array([len(t) for t in _sentence_tokens])
thr = np.percentile(lengths, 95)
mask_test = lengths >= thr
# Dataset.filter expects a predicate function, not a boolean array, so the
# mask is converted to row indices and applied with select().
heur_len = DatasetDict(
    train=ds.select(np.flatnonzero(~mask_test)),
    # dev doubles as test here (author's open question: keep 80/20?)
    dev=ds.select(np.flatnonzero(mask_test)),
)

# Heuristic 2 ▸ rare words: sentences containing any word seen at most 5 times
# in the corpus (case-insensitive) are held out.
flat = [w.lower() for sent in _sentence_tokens for w in sent]
rare = {w for w, c in Counter(flat).items() if c <= 5}


def has_rare(example):
    """Return True when any token of ``example`` (lower-cased) is in ``rare``."""
    return any(w.lower() in rare for w in example["tokens"])


heur_rare = DatasetDict(
    train=ds.filter(lambda ex: not has_rare(ex)), dev=ds.filter(has_rare)
)

In [None]:
def _farthest_point_order(embeds: np.ndarray, k: int, seed_idx: int) -> List[int]:
    """Greedily pick ``k`` row indices, each as far as possible from those already picked."""
    picked = [seed_idx]
    # min_dist[i] is the distance from row i to its nearest picked row.
    # Picked rows are pinned to -inf so argmax can never return them again.
    min_dist = np.linalg.norm(embeds - embeds[seed_idx], axis=1)
    min_dist[seed_idx] = -np.inf
    while len(picked) < k:
        nxt = int(np.argmax(min_dist))
        picked.append(nxt)
        # Only the newly picked row can lower any row's nearest-picked distance.
        np.minimum(
            min_dist, np.linalg.norm(embeds - embeds[nxt], axis=1), out=min_dist
        )
        min_dist[nxt] = -np.inf
    return picked


def adversarial_split(ds: Dataset, k: Optional[int] = None) -> DatasetDict:
    """Split ``ds`` by greedy farthest-point selection in sentence-embedding space.

    Each sentence is embedded with a SentenceTransformer model. The held-out
    set starts from the sentence closest to the mean embedding and grows by
    repeatedly adding the sentence whose distance to its nearest already
    held-out sentence is largest.

    Args:
        ds: Dataset with a "tokens" column (one list of words per sentence).
        k: Number of sentences to hold out. Defaults to 10% of ``ds``,
            computed from the dataset actually passed in. The seed sentence is
            always held out, so at least one sentence is selected.

    Returns:
        A DatasetDict with "train" and "dev"; "dev" is the held-out set.

    Raises:
        ValueError: If ``ds`` is empty or ``k`` exceeds its size.
    """
    n_rows = len(ds)
    if n_rows == 0:
        raise ValueError("adversarial_split needs a non-empty dataset")
    if k is None:
        # Resolved per call: a default written in the signature would be
        # evaluated once, at definition time, against whatever global `ds`
        # existed then.
        k = int(0.1 * n_rows)
    if k > n_rows:
        raise ValueError(f"k={k} exceeds the dataset size ({n_rows})")

    model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    embeds = np.asarray(
        model.encode([" ".join(toks) for toks in ds["tokens"]], show_progress_bar=True)
    )

    # Seed the held-out set with the most central sentence (closest to the mean).
    seed_idx = int(np.argmin(np.linalg.norm(embeds - embeds.mean(0), axis=1)))

    # Distances are tracked against the held-out rows only. A nearest-neighbour
    # index built over every row would match each query row to itself at
    # distance 0 and make the argmax meaningless.
    idx_test = _farthest_point_order(embeds, k, seed_idx)
    held_out = set(idx_test)
    idx_train = [i for i in range(n_rows) if i not in held_out]

    return DatasetDict(
        train=ds.select(idx_train),
        dev=ds.select(sorted(idx_test)),  # dev doubles as test on purpose (paper)
    )

In [None]:
# Checkpoint identifier passed to the tokenizer and model `from_pretrained` calls.
MODEL = "neuralmind/bert-base-portuguese-cased"

In [None]:
# Label id skipped by the loss; given to special tokens and to every sub-word
# piece after the first one of a word.
IGNORE_INDEX = -100


def tokenize(batch, tokenizer=None, label2id=None):
    """Tokenize pre-split sentences and align word-level tags to sub-word pieces.

    Args:
        batch: Mapping with "tokens" (one list of words per sentence) and,
            when ``label2id`` is given, "ner_tags" (one list of tag strings
            per sentence).
        tokenizer: Fast tokenizer to apply. When omitted, the tokenizer of
            ``MODEL`` is loaded.
        label2id: Mapping from tag string to integer id. When omitted, the
            encoding is returned without a "labels" entry.

    Returns:
        The tokenizer output. With ``label2id`` it also carries "labels": per
        sentence, the tag id on the first piece of each word and
        ``IGNORE_INDEX`` elsewhere.
    """
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(MODEL)
    tok = tokenizer(batch["tokens"], is_split_into_words=True, truncation=True)
    if label2id is None:
        return tok

    aligned = []
    for row, tags in enumerate(batch["ner_tags"]):
        previous_word = None
        label_ids = []
        for word_id in tok.word_ids(batch_index=row):
            if word_id is None or word_id == previous_word:
                # Special token, or a continuation piece of the same word.
                label_ids.append(IGNORE_INDEX)
            else:
                label_ids.append(label2id[tags[word_id]])
            previous_word = word_id
        aligned.append(label_ids)
    tok["labels"] = aligned
    return tok


def compute_metrics(p, id2label=None):
    """Compute entity-level F1 (seqeval) from a (logits, labels) pair.

    Args:
        p: Pair of logits and gold label ids, as passed by the Trainer.
        id2label: Mapping from integer id to tag string.

    Returns:
        ``{"f1": <score>}``.

    Raises:
        ValueError: If ``id2label`` is not provided.
    """
    if id2label is None:
        raise ValueError("compute_metrics needs the id2label mapping")
    logits, labels = p
    preds = np.argmax(logits, -1)

    # Map ids back to tags, dropping every position masked with IGNORE_INDEX.
    true_tags, pred_tags = [], []
    for pred_row, gold_row in zip(preds, labels):
        kept = [(int(g), int(q)) for g, q in zip(gold_row, pred_row) if g != IGNORE_INDEX]
        true_tags.append([id2label[g] for g, _ in kept])
        pred_tags.append([id2label[q] for _, q in kept])
    return {"f1": f1_score(true_tags, pred_tags)}


def run_experiment(dsdict: DatasetDict, seed: int):
    """Fine-tune ``MODEL`` on ``dsdict["train"]`` and return the F1 on ``dsdict["dev"]``.

    Args:
        dsdict: Splits to use. Needs "train" and "dev"; when a "tune" split is
            present it is used for per-epoch evaluation and model selection
            instead of "dev".
        seed: Training seed; also names the output directory.

    Returns:
        The "eval_f1" value from evaluating the selected model on "dev".
    """
    # Derive the label inventory from the data so it always matches the tags
    # actually present in these splits.
    tag_names = sorted(
        {tag for split in dsdict.values() for sent in split["ner_tags"] for tag in sent}
    )
    label2id = {tag: i for i, tag in enumerate(tag_names)}
    id2label = {i: tag for tag, i in label2id.items()}

    tokenizer = AutoTokenizer.from_pretrained(MODEL)
    model = AutoModelForTokenClassification.from_pretrained(
        MODEL, num_labels=len(tag_names), id2label=id2label, label2id=label2id
    )

    encoded = dsdict.map(
        partial(tokenize, tokenizer=tokenizer, label2id=label2id), batched=True
    )
    args = TrainingArguments(
        output_dir=f"runs/seed{seed}",
        # NOTE(review): newer transformers releases rename this argument to
        # `eval_strategy` — confirm against the installed version.
        evaluation_strategy="epoch",
        # load_best_model_at_end requires the save strategy to match the
        # evaluation strategy.
        save_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        num_train_epochs=5,
        seed=seed,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
    )
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=encoded["train"],
        # Use "tune" for model selection when the split provides it.
        eval_dataset=encoded["tune"] if "tune" in encoded else encoded["dev"],
        # Pads inputs and labels so sentences of different lengths can share a batch.
        data_collator=DataCollatorForTokenClassification(tokenizer),
        compute_metrics=partial(compute_metrics, id2label=id2label),
    )
    trainer.train()
    # NOTE(review): splits that also carry "test" are still scored on "dev"
    # here — confirm that is the intended evaluation set.
    return trainer.evaluate(encoded["dev"])["eval_f1"]

In [None]:
### 5.1  standard
# NOTE(review): `standard_split` is not defined in any visible cell —
# presumably a DatasetDict built from cachaca_train / cachaca_test; confirm.
# NOTE(review): this run and the first random split below both use seed 0, and
# run_experiment names its output directory after the seed, so the two runs
# share runs/seed0.
std_f1 = run_experiment(standard_split, seed=0)

### 5.2  30 random seeds
# Each random split is trained with the same seed that generated it.
rand_scores = [
    run_experiment(ds, s) for s, ds in enumerate(random_splits(cachaca_full))
]

### 5.3  heuristic + adversarial
scores_heur_len = run_experiment(heur_len, 111)
scores_heur_rare = run_experiment(heur_rare, 222)
scores_advers = run_experiment(adversarial_split(cachaca_full), 333)

### 5.4  tune-4-way
# NOTE(review): `tune_split` is not defined in any visible cell — it must be
# provided before this line can run.
tune_scores = run_experiment(tune_split(cachaca_full), 42)

In [None]:
# NOTE(review): imports are otherwise grouped in the cells at the top of the
# notebook; consider moving this one there.
import statistics as st

# Report F1 per splitting strategy; the random splits are summarised by their
# mean and sample standard deviation.
print(f"Standard: {std_f1:.3f}")
print(f"Random   mean={st.mean(rand_scores):.3f}  sd={st.stdev(rand_scores):.3f}")
print(
    f"Heur-len {scores_heur_len:.3f} • Heur-rare {scores_heur_rare:.3f} • Advers {scores_advers:.3f}"
)
print(f"Tune-split dev F1 {tune_scores:.3f}")