In [20]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [1]:
%%capture

!pip -q install transformers datasets evaluate accelerate

In [2]:
from datasets import load_dataset, ClassLabel
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer
import evaluate
import numpy as np

# Carregar JSONL diretamente

In [3]:
data_files = {
    "train": "/content/treino.jsonl",  # ajuste o caminho se necessário
    "test":  "/content/teste.jsonl",
}
ds = load_dataset("json", data_files=data_files, split={"train":"train", "test":"test"})

In [4]:
ds

DatasetDict({
    train: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 500
    })
    test: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 100
    })
})

# Mapear rótulos string -> ids (0/1)


In [5]:
label_names = ["suporte", "venda"]
label2id = {lbl:i for i,lbl in enumerate(label_names)}
id2label = {i:lbl for lbl,i in label2id.items()}

def map_labels(example):
    example["label"] = label2id[example["completion"]]
    example["text"] = example["prompt"]
    return example

ds = {k: v.map(map_labels, remove_columns=[c for c in ds[k].column_names if c not in ["text","label"]]) for k, v in ds.items()}

# Tokenizer + modelo (Português)

In [6]:
model_name = "neuralmind/bert-base-portuguese-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    return tokenizer(batch["text"], truncation=True)
tokenized = {k: v.map(tokenize, batched=True) for k, v in ds.items()}

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


# Modelo

In [7]:
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    id2label=id2label,
    label2id=label2id
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Treinamento


In [8]:
from transformers import EarlyStoppingCallback

In [9]:
data_collator = DataCollatorWithPadding(tokenizer)
metric_accuracy = evaluate.load("accuracy")
metric_f1 = evaluate.load("f1")
metric_precision = evaluate.load("precision")
metric_recall = evaluate.load("recall")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": metric_accuracy.compute(predictions=preds, references=labels)["accuracy"],
        "f1": metric_f1.compute(predictions=preds, references=labels, average="macro")["f1"],
        "precision": metric_precision.compute(predictions=preds, references=labels, average="macro")["precision"],
        "recall": metric_recall.compute(predictions=preds, references=labels, average="macro")["recall"],
    }

In [13]:
!pip install -U transformers

Collecting transformers
  Downloading transformers-4.55.1-py3-none-any.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.55.1-py3-none-any.whl (11.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.3/11.3 MB[0m [31m113.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.55.0
    Uninstalling transformers-4.55.0:
      Successfully uninstalled transformers-4.55.0
Successfully installed transformers-4.55.1


In [13]:
training_args = TrainingArguments(
    output_dir="/content/bert-suporte-venda-pt",
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,

    # logging
    logging_strategy="steps",
    logging_steps=100,
    report_to="none",

    # avaliação e salvamento por steps
    eval_strategy="steps",
    eval_steps=200,
    save_strategy="steps",
    save_steps=200,
    save_total_limit=2,

    # melhor modelo no fim
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,

    seed=42,
)

In [15]:
trainer = Trainer (
    model,
    training_args,
    train_dataset = tokenized['train'],
    eval_dataset = tokenized['test'],
    data_collator = data_collator,
    compute_metrics = compute_metrics,
    tokenizer = tokenizer
)

  trainer = Trainer (


In [16]:
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
200,0.0002,0.000116,1.0,1.0,1.0,1.0


TrainOutput(global_step=315, training_loss=0.022022093884972116, metrics={'train_runtime': 925.8261, 'train_samples_per_second': 2.7, 'train_steps_per_second': 0.34, 'total_flos': 21788884272000.0, 'train_loss': 0.022022093884972116, 'epoch': 5.0})

In [17]:
trainer.evaluate()

{'eval_loss': 0.00011568469199119136,
 'eval_accuracy': 1.0,
 'eval_f1': 1.0,
 'eval_precision': 1.0,
 'eval_recall': 1.0,
 'eval_runtime': 9.0664,
 'eval_samples_per_second': 11.03,
 'eval_steps_per_second': 1.434,
 'epoch': 5.0}

In [18]:
trainer.save_model()


In [21]:
trainer.push_to_hub("RobertaFortes/modeldptoclassification")

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...-suporte-venda-pt/model.safetensors:   3%|3         | 13.4MB /  436MB            

  ...-suporte-venda-pt/training_args.bin:  12%|#1        |   617B / 5.30kB            

CommitInfo(commit_url='https://huggingface.co/RobertaFortes/bert-suporte-venda-pt/commit/fa29f09c044399e7c77a2aa923ec0077ecdd0131', commit_message='RobertaFortes/modeldptoclassification', commit_description='', oid='fa29f09c044399e7c77a2aa923ec0077ecdd0131', pr_url=None, repo_url=RepoUrl('https://huggingface.co/RobertaFortes/bert-suporte-venda-pt', endpoint='https://huggingface.co', repo_type='model', repo_id='RobertaFortes/bert-suporte-venda-pt'), pr_revision=None, pr_num=None)

In [22]:
!git clone https://github.com/RobertaFortes/bert-dpto-classification.git

Cloning into 'bert-dpto-classification'...


In [23]:
%cd bert-dpto-classification

/content/bert-dpto-classification


In [24]:
!git config --global user.email "betanickf@gmail.com"

In [25]:
!git config --global user.name "Roberta Fortes"

In [26]:
!git add fine-tuning.ipynb README.md requirements.txt

fatal: pathspec 'fine-tuning.ipynb' did not match any files


In [27]:
!ls /content

bert-dpto-classification  README.md	    sample_data  treino.jsonl
bert-suporte-venda-pt	  requirements.txt  teste.jsonl


In [28]:
!cp /content/fine-tuning.ipynb /content/bert-dpto-classification/
!cp /content/README.md /content/bert-dpto-classification/
!cp /content/requirements.txt /content/bert-dpto-classification/

cp: cannot stat '/content/fine-tuning.ipynb': No such file or directory
