In [1]:
# ─────────────────────────────
# 1. Standard library (já vem com o Python)
# ─────────────────────────────
import json
from pathlib import Path
import itertools  # se for gerar folds manualmente, por ex.
import random  # sementes

# ─────────────────────────────
# 2. Terceiros – instalação via pip/venv
# ─────────────────────────────
# ↳ dados
import pandas as pd  # leitura CSV
from datasets import Dataset, DatasetDict, load_dataset

# ↳ modelagem e métricas
import numpy as np
# import torch
# from transformers import (
#     AutoTokenizer,
#     AutoModelForTokenClassification,
#     Trainer,
#     TrainingArguments,
# )
# from seqeval.metrics import f1_score  # F1 para NER
# from sklearn.model_selection import KFold, train_test_split

from sklearn.model_selection import KFold

# # (opcional) tracking e visualização
# import mlflow  # ou tensorboard
# import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Directory containing the CleanCoNLL annotation split files. The path is
# relative, so it is resolved against the notebook's current working directory.
CLEAN_PATH = Path("CleanCoNLL/data/cleanconll_annotations")

In [6]:
def load_conll(path: Path) -> Dataset:
    """
    Read a CoNLL-style column file and return it as a ``Dataset``.

    Every data line is split on whitespace; the first column is taken as the
    token and the last column as the NER tag (the file layout is expected to
    be ``token POS CHUNK NER``). Blank lines, including lines that contain
    only whitespace, mark the end of a sentence. Lines starting with ``#``
    are skipped as comments.

    Args:
        path: Location of the file to read; it is opened as UTF-8 text.

    Returns:
        A ``Dataset`` with two columns: ``tokens`` (one list of token strings
        per sentence) and ``ner_tags`` (one list of tag strings per sentence,
        parallel to ``tokens``).
    """
    sents, labels = [], []
    toks, tags = [], []

    with path.open(encoding="utf-8") as f:
        for raw_line in f:
            # Strip *all* trailing whitespace, not just "\n": a line holding
            # only spaces/tabs must count as blank, otherwise split() below
            # returns [] and indexing it raises IndexError.
            line = raw_line.rstrip()

            # 1) Sentence boundary: store the buffered sentence and reset.
            if not line:
                if toks:  # consecutive blank lines must not add empty sentences
                    sents.append(toks)
                    labels.append(tags)
                    toks, tags = [], []
                continue

            # 2) Comment lines (flair-style) start with '#'.
            # NOTE(review): this also drops any data row whose token starts
            # with '#'; confirm the corpus contains no such tokens.
            if line.startswith("#"):
                continue

            # 3) Data row: keep the first and last columns. A row with a
            #    single column yields the same string as token and tag.
            parts = line.split()
            toks.append(parts[0])
            tags.append(parts[-1])

    # Keep the final sentence when the file does not end with a blank line.
    if toks:
        sents.append(toks)
        labels.append(tags)

    return Dataset.from_dict({"tokens": sents, "ner_tags": labels})

In [7]:
# Parse each CleanCoNLL split file into its own Dataset.
clean_train = load_conll(CLEAN_PATH.joinpath("cleanconll_annotations.train"))
clean_dev = load_conll(CLEAN_PATH.joinpath("cleanconll_annotations.dev"))  # optional
clean_test = load_conll(CLEAN_PATH.joinpath("cleanconll_annotations.test"))  # optional

In [8]:
# Display the summary repr of the train split.
clean_train

Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 14903
})

In [9]:
# Display the summary repr of the dev split.
clean_dev

Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 3449
})

In [10]:
# Display the train split summary again (same expression as cell In [8]).
clean_train

Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 14903
})