## Download de Bibliotecas Necessárias

In [1]:
!pip3 install datasets==2.21.0 transformers sacrebleu torch sentencepiece transformers[sentencepiece]

Collecting datasets==2.21.0
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.4.3-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Collecting portalocker (from sacrebleu)
  Downloading portalocker-2.10.1-py3-none-any.whl.metadata (8.5 kB)
Downloading datasets-2.21.0-py3-none-any.whl (527 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hDownloading sacrebleu-2.4.3-py3-none-any.whl (103 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.0/104.0 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading portalocker-2.10.1-py3-none-any.whl (18 kB)
Installing collected packages: portalocker, sacrebleu, datasets
  Attempting uninstall: datasets
    Found existing installation: datasets 3.0.1
    Uninstalling dataset

In [2]:
from datasets import load_dataset, load_metric, concatenate_datasets

In [None]:
from transformers import (AutoTokenizer, AutoModelForSeq2SeqLM,
                          DataCollatorForSeq2Seq, Seq2SeqTrainingArguments,
                          Seq2SeqTrainer)

## 1. Importação dos Datasets a Serem Utilizados

### 1.1 FAPESP

In [None]:
# Load every available split of the FAPESP dataset.
fapesp = load_dataset(path='caiogomide/dataset_fapesp')

In [None]:
# Rename the Portuguese column so it matches the 'texto_portugues' name read during
# tokenization. The guard makes the cell idempotent: once every split already exposes
# 'texto_portugues', re-running the cell is a no-op instead of raising because
# 'textos_pt' no longer exists. If neither name is present the rename still raises.
if not all('texto_portugues' in colunas for colunas in fapesp.column_names.values()):
    fapesp = fapesp.rename_column('textos_pt', 'texto_portugues')

### 1.2 Documentações Técnicas

In [None]:
# Load every available split of the technical-documentation dataset.
documentacoes_tecnicas = load_dataset(
    path='caiogomide/dataset_documentacoes_tecnicas'
)

In [None]:
# Stack the 'train' splits of both datasets, then shuffle with a fixed seed
# so the resulting order is reproducible.
dataset_treinamento = concatenate_datasets(
    [fapesp['train'], documentacoes_tecnicas['train']]
).shuffle(seed=42)

In [None]:
# Stack the 'validation' splits of both datasets, then shuffle with a fixed seed
# so the resulting order is reproducible.
dataset_eval = concatenate_datasets(
    [fapesp['validation'], documentacoes_tecnicas['validation']]
).shuffle(seed=42)

## 2. Fine-tuning do modelo escolhido

In [None]:
# Identifier of the pretrained checkpoint; both the tokenizer and the model
# are loaded from it further down.
model_checkpoint = 'Helsinki-NLP/opus-mt-tc-big-en-pt'

In [None]:
# Load the 'sacrebleu' metric.
# NOTE(review): `load_metric` is deprecated in `datasets`; its successor lives in the
# separate `evaluate` package -- confirm the migration path before unpinning `datasets`.
metric = load_metric(path='sacrebleu')

### 2.1 Pré-processamento de Dados

In [None]:
# Tokenizer matching the pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=model_checkpoint
)

In [None]:
# Text prepended to every source sentence before tokenization (empty here).
prefix = ""
# Truncation limits applied when tokenizing source and target texts.
max_input_length = 256
max_target_length = 256
# Column names of the English source texts and the Portuguese target texts.
source_lang = 'texto_ingles'
target_lang = 'texto_portugues'

In [None]:
def preprocessamento(exemplos):
  """Tokenize a batch of source/target sentence pairs for seq2seq training.

  Args:
    exemplos: Batch mapping column names to lists of strings, as handed over by
      ``Dataset.map(..., batched=True)``. Must contain the columns named by
      ``source_lang`` and ``target_lang``.

  Returns:
    The tokenizer output for the source texts, with an extra ``labels`` entry
    holding the token ids of the target texts.
  """
  # Source side: optional prefix followed by the English text.
  inputs = [prefix + ex for ex in exemplos[source_lang]]
  # Target side: the corresponding Portuguese texts.
  targets = list(exemplos[target_lang])
  # Tokenize the inputs, truncating to the source length limit.
  model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
  # Tokenize the targets in target mode. `text_target=` replaces the deprecated
  # `as_target_tokenizer()` context manager and keeps a separate length limit.
  labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)
  model_inputs['labels'] = labels['input_ids']
  return model_inputs

In [None]:
# Tokenize the training set in batches.
tokenized_dataset_treinamento = dataset_treinamento.map(
    function=preprocessamento,
    batched=True,
)

In [None]:
# Tokenize the evaluation set in batches.
tokenized_dataset_eval = dataset_eval.map(
    function=preprocessamento,
    batched=True,
)

In [None]:
# Pretrained seq2seq model that will be fine-tuned.
model = AutoModelForSeq2SeqLM.from_pretrained(
    pretrained_model_name_or_path=model_checkpoint
)

In [None]:
# Hyperparameters for the fine-tuning run.
batch_size = 12
# Last path component of the checkpoint id; used to name the output directory.
model_name = model_checkpoint.split('/')[-1]

args = Seq2SeqTrainingArguments(
    output_dir=f'{model_name}-finetuned-final',
    # `eval_strategy` is the current name of the deprecated `evaluation_strategy`.
    eval_strategy='epoch',
    # Kept equal to the evaluation strategy, which `load_best_model_at_end`
    # expects according to the TrainingArguments documentation.
    save_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=6,
    predict_with_generate=True,
    load_best_model_at_end=True,
)

In [None]:
# Collator that batches the tokenized examples for the seq2seq model.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [None]:
# Wire the model, training arguments, tokenized datasets, collator and tokenizer
# into a single trainer object.
# NOTE(review): no `compute_metrics` callback is passed here, so the `metric` object
# loaded earlier is not used by this trainer -- confirm whether it is applied elsewhere.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset_treinamento,
    eval_dataset=tokenized_dataset_eval,
    tokenizer=tokenizer,
)

In [None]:
# Run the fine-tuning loop; the returned training summary is shown as the cell output.
trainer.train()