## Passo 1 - Preparar Dataset do BERT

In [None]:
import os

# --- Model / data-loading settings ---
batch_size = 16
checkpoint = "bert-base-cased"

# --- Training schedule ---
num_training_steps = 25000
# Best-loss snapshots are only considered once this fraction of training is done.
param_save_percentage = 0.8
param_save_starting_step = num_training_steps * param_save_percentage
# A progress line is written to the training log every `training_substep_marker` steps.
training_substep_marker = 2000
training_log_file_path = "training_log.txt"

# --- Paths (relative to the notebook's working directory) ---
# Built with os.path.join so the notebook is not tied to Windows path separators;
# on Windows the resulting strings are identical to the previous hard-coded ones.
bert_model_path = os.path.join("..", "custom_models", "bert_model")
tokenized_bookcorpus_dataset_path = os.path.join("..", "custom_datasets", "tokenized_bookcorpus_lines_dataset")
tokenized_wikipedia_dataset_path = os.path.join("..", "custom_datasets", "tokenized_wikipedia_lines_dataset")

In [3]:
import os
import sys
sys.path.append(os.path.realpath("../"))

In [4]:
import copy
import gc
import torch
import ModelParamFunctionsModule as mpfm
from datasets import load_from_disk, DatasetDict
from transformers import BertForPreTraining, DefaultDataCollator, get_scheduler
from torch.optim import AdamW
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from TokenizedBERTDatasetModule import TokenizedBERTDataset

In [5]:
# Load from disk the two custom tokenized dataset dictionaries (BookCorpus and Wikipedia)
# that together make up the BERT dataset.

tokenized_bookcorpus_dataset_dict, tokenized_wikipedia_dataset_dict = (
    load_from_disk(dataset_path)
    for dataset_path in (tokenized_bookcorpus_dataset_path, tokenized_wikipedia_dataset_path)
)

Loading dataset from disk:   0%|          | 0/18 [00:00<?, ?it/s]

Loading dataset from disk:   0%|          | 0/42 [00:00<?, ?it/s]

In [11]:
# Build the BERT dataset from the 'train' splits of the Wikipedia and BookCorpus dictionaries.

tokenized_bert_dataset_dict = DatasetDict({
    'train': TokenizedBERTDataset(
        [source['train'] for source in (tokenized_wikipedia_dataset_dict, tokenized_bookcorpus_dataset_dict)],
        max_sequence_length=256,
        sequence_length_changes=[512],
        sequence_length_changes_step=[2500],
    ),
})

In [13]:
# Create the training DataLoader. The original BERT training used a batch size of 256;
# 16 is used here to make testing easier.
# Page-locked (pinned) host memory is only requested when CUDA is actually available, so
# the notebook keeps working on the CPU fallback chosen further down.

_pin_to_cuda = torch.cuda.is_available()
train_data_loader = DataLoader(
    tokenized_bert_dataset_dict['train'],
    batch_size=batch_size,
    shuffle=True,
    collate_fn=DefaultDataCollator(),
    pin_memory=_pin_to_cuda,
    pin_memory_device="cuda:0" if _pin_to_cuda else "",
)

<torch.utils.data.dataloader.DataLoader object at 0x000002188CE51710>


## Passo 2 - Preparar o Modelo BERT de Pré-Treinamento

In [14]:
# Load the BERT pre-training model for `checkpoint` through Hugging Face's `from_pretrained`.

bert_model = BertForPreTraining.from_pretrained(checkpoint)

In [16]:
# Run the model on the GPU when one is available, since that speeds up the weight updates;
# otherwise stay on the CPU.

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bert_model.to(device)

cuda
cuda:0


In [None]:
# Display the model's initial parameters.

mpfm.print_params_details(bert_model)

In [18]:
# Re-initialize the model weights so that pre-training is not influenced by the initial
# values that come from Hugging Face, then display the parameters again.

bert_model.apply(mpfm.reinitialize_weights)
mpfm.print_params_details(bert_model)

bert.embeddings.word_embeddings.weight: mean=-0.0000; std=0.0200
bert.embeddings.position_embeddings.weight: mean=0.0000; std=0.0200
bert.embeddings.token_type_embeddings.weight: mean=-0.0002; std=0.0197
bert.embeddings.LayerNorm.weight: mean=1.0000; std=0.0000
bert.embeddings.LayerNorm.bias: mean=0.0000; std=0.0000
bert.encoder.layer.0.attention.self.query.weight: mean=0.0000; std=0.0200
bert.encoder.layer.0.attention.self.query.bias: mean=0.0000; std=0.0000
bert.encoder.layer.0.attention.self.key.weight: mean=-0.0000; std=0.0200
bert.encoder.layer.0.attention.self.key.bias: mean=0.0000; std=0.0000
bert.encoder.layer.0.attention.self.value.weight: mean=0.0000; std=0.0200
bert.encoder.layer.0.attention.self.value.bias: mean=0.0000; std=0.0000
bert.encoder.layer.0.attention.output.dense.weight: mean=-0.0000; std=0.0200
bert.encoder.layer.0.attention.output.dense.bias: mean=0.0000; std=0.0000
bert.encoder.layer.0.attention.output.LayerNorm.weight: mean=1.0000; std=0.0000
bert.encoder.lay

## Passo 3 - Preparar Ferramentas e Parâmetros de Treinamento

In [19]:
# Set up the training tools: the AdamW optimizer and a linear learning-rate schedule
# with warm-up that spans `num_training_steps`.

adam_optimizer = AdamW(
    bert_model.parameters(),
    lr=0.0001,
    betas=(0.9, 0.999),
    weight_decay=0.01,
)
lr_scheduler = get_scheduler(
    "linear",
    adam_optimizer,
    num_warmup_steps=10000,
    num_training_steps=num_training_steps,
)

AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 0.0001
    lr: 0.0
    maximize: False
    weight_decay: 0.01
)
<torch.optim.lr_scheduler.LambdaLR object at 0x000002211D829790>


## Passo 4 - Realizar Treinamento

In [20]:
# Run Python's garbage collector and release PyTorch's cached CUDA memory before training.
gc.collect()
torch.cuda.empty_cache()

In [21]:
#Definir a função de treinamento.

def treinar(bert_model, training_progress_bar):
    """Pre-train ``bert_model`` for ``num_training_steps`` optimizer steps.

    Relies on notebook-level globals: ``train_data_loader``, ``device``,
    ``adam_optimizer``, ``lr_scheduler``, ``num_training_steps``,
    ``param_save_starting_step``, ``training_substep_marker`` and
    ``training_log_file_path``.

    After ``param_save_starting_step`` steps, every time a batch loss is lower than
    the best one recorded so far, the model's weights are snapshotted to CPU memory.
    When training ends with a batch loss higher than the snapshot's, the snapshot is
    loaded back into ``bert_model`` *in place*, so the caller's reference (and any
    later ``save_pretrained`` call on it) sees those weights.

    Args:
        bert_model: Model invoked as ``bert_model(**batch)``; the returned object
            must expose a ``.loss`` tensor.
        training_progress_bar: Object with an ``update(n)`` method; advanced by one
            after every optimizer step.

    Returns:
        The same ``bert_model`` object that was passed in.

    Raises:
        ValueError: If ``train_data_loader`` yields no batches (the outer loop could
            otherwise never terminate).
    """
    current_training_step = 0
    current_training_substep = 0
    loss_value = 0.0
    best_state_dict = None
    saved_model_loss = None

    # `with` guarantees the log is flushed and closed even if training raises
    # (for example on a CUDA out-of-memory error).
    with open(training_log_file_path, mode="a", encoding="utf-8") as training_log:
        bert_model.train()
        while current_training_step < num_training_steps:
            steps_before_pass = current_training_step
            for batch in train_data_loader:
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = bert_model(**batch)
                loss = outputs.loss
                loss.backward()

                adam_optimizer.step()
                lr_scheduler.step()
                adam_optimizer.zero_grad()
                training_progress_bar.update(1)

                # Keep only a Python float: holding the loss tensor would keep device
                # memory alive and would print a tensor repr in the log.
                loss_value = loss.item()
                del outputs, loss

                # Kept from the original loop.
                # NOTE(review): collecting and emptying the CUDA cache on every step
                # slows training; confirm it is still needed.
                gc.collect()
                torch.cuda.empty_cache()

                current_training_step += 1
                current_training_substep += 1

                if current_training_step > param_save_starting_step:
                    if saved_model_loss is None or saved_model_loss > loss_value:
                        training_log.write(
                            f"Updating Saved Loss: Old Value: {saved_model_loss}; New Value: {loss_value}.\n"
                        )
                        # Snapshot the weights on the CPU instead of deep-copying the
                        # whole model, which would duplicate it in device memory.
                        best_state_dict = {
                            name: tensor.detach().to("cpu", copy=True)
                            for name, tensor in bert_model.state_dict().items()
                        }
                        saved_model_loss = loss_value

                if current_training_substep >= training_substep_marker:
                    training_log.write(str(current_training_step) + ": " + str(loss_value) + "\n")
                    current_training_substep = 0
                if current_training_step >= num_training_steps:
                    break

            if current_training_step == steps_before_pass:
                raise ValueError("train_data_loader yielded no batches; cannot train.")

        # `saved_model_loss` stays None when no snapshot was ever taken.
        if saved_model_loss is not None and loss_value > saved_model_loss:
            training_log.write(
                "The last Saved Model has a smaller Loss (" + str(saved_model_loss) + " < " + str(loss_value) +
                "). Setting the Final Model as the Saved Model...\n"
            )
            # Restore in place: rebinding the local name would not affect the caller.
            bert_model.load_state_dict(best_state_dict)

    return bert_model

In [22]:
# Run the training, with a tqdm progress bar sized to `num_training_steps`.

treinar(bert_model, tqdm(range(num_training_steps)))

  0%|          | 0/25000 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 908.00 MiB. GPU 0 has a total capacity of 4.00 GiB of which 0 bytes is free. Of the allocated memory 10.24 GiB is allocated by PyTorch, and 37.90 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

## Passo 5 - Salvar o Modelo Localmente

In [19]:
# Save the model to `bert_model_path`.

bert_model.save_pretrained(bert_model_path)