## Passo 1 - Preparar Dataset do BERT e aplicar a Marca D'água nele.

In [None]:
# Notebook configuration. Paths use forward slashes so the notebook also runs
# outside Windows (Windows accepts "/" as a separator as well).

# --- Options forwarded to WatermarkedTokenizedBERTDataset ---
register_watermarked_steps_flag = True
marks_per_watermarked_entry = 3
max_sequence_length = 256
watermark = "###"
watermark_influence_range = 2
watermark_probability = 0.15
# NOTE(review): presumably the sequence length switches to 512 at step 90000 —
# confirm against WatermarkedTokenizedBERTDataset.
sequence_length_changes = [512]
sequence_length_changes_step = [90000]

# --- Data loading / training loop ---
batch_size = 16
num_training_steps = 150000
# The training loop starts tracking the lowest-loss model once this step is reached.
param_save_percentage = 0.8
param_save_starting_step = num_training_steps * param_save_percentage
# The training loop writes one loss entry to the log every `training_substep_marker` steps.
training_substep_marker = 2000

# --- Model checkpoint and on-disk locations ---
checkpoint = "bert-base-cased"
watermarked_bert_model_path = "../custom_models/watermarked_bert_model"
tokenized_bookcorpus_dataset_path = "../custom_datasets/tokenized_bookcorpus_lines_dataset"
tokenized_wikipedia_dataset_path = "../custom_datasets/tokenized_wikipedia_lines_dataset"

In [1]:
import os
import sys
# Make the parent directory importable so the project-local modules can be found.
sys.path.append(os.path.realpath(os.pardir))

In [2]:
import copy
import gc
import torch
import ModelParamFunctionsModule as mpfm
from datasets import load_from_disk
from torch.optim import AdamW
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from transformers import BertForPreTraining, DefaultDataCollator, get_scheduler
from WatermarkedTokenizedBERTDatasetModule import WatermarkedTokenizedBERTDataset

In [3]:
# Load the dataset dictionaries whose contents will make up the BERT training dataset.

tokenized_bookcorpus_dataset_dict = load_from_disk(tokenized_bookcorpus_dataset_path)
tokenized_wikipedia_dataset_dict = load_from_disk(tokenized_wikipedia_dataset_path)

Loading dataset from disk:   0%|          | 0/18 [00:00<?, ?it/s]

Loading dataset from disk:   0%|          | 0/42 [00:00<?, ?it/s]

In [4]:
# Build the BERT training dataset from the 'train' splits of the two dataset dictionaries loaded
# above, passing the watermark pattern and the watermarking options from the configuration cell.

watermarked_bert_dataset = WatermarkedTokenizedBERTDataset(
    [tokenized_bookcorpus_dataset_dict['train'], tokenized_wikipedia_dataset_dict['train']], watermark_pattern=watermark,
    max_sequence_length=max_sequence_length, sequence_length_changes=sequence_length_changes, sequence_length_changes_step=sequence_length_changes_step,
    watermark_probability=watermark_probability, watermark_influence_range=watermark_influence_range,
    marks_per_watermarked_entry=marks_per_watermarked_entry, register_watermarked_steps_flag=register_watermarked_steps_flag
)



In [6]:
# Initialize the training DataLoader.
# Host-memory pinning is only requested when CUDA is available, so this cell stays
# consistent with the CPU fallback used when the training device is selected.

train_data_loader = DataLoader(
    watermarked_bert_dataset, batch_size=batch_size, shuffle=True, collate_fn=DefaultDataCollator(),
    pin_memory=torch.cuda.is_available(), pin_memory_device="cuda:0" if torch.cuda.is_available() else ""
)

<torch.utils.data.dataloader.DataLoader object at 0x000001C41EA0EA50>


## Passo 2 - Preparar o Modelo BERT para o Pré-Treinamento com Marca D'água

In [7]:
# Load the BERT pre-training model for the configured checkpoint via `from_pretrained`.

bert_model = BertForPreTraining.from_pretrained(checkpoint)

In [8]:
# Display the parameters of the imported model.

mpfm.print_params_details(bert_model)

bert.embeddings.word_embeddings.weight: mean=-0.0138; std=0.0448
bert.embeddings.position_embeddings.weight: mean=0.0000; std=0.0146
bert.embeddings.token_type_embeddings.weight: mean=-0.0005; std=0.0257
bert.embeddings.LayerNorm.weight: mean=0.8867; std=0.0925
bert.embeddings.LayerNorm.bias: mean=-0.0199; std=0.0600
bert.encoder.layer.0.attention.self.query.weight: mean=-0.0000; std=0.0345
bert.encoder.layer.0.attention.self.query.bias: mean=-0.0103; std=0.2145
bert.encoder.layer.0.attention.self.key.weight: mean=0.0000; std=0.0340
bert.encoder.layer.0.attention.self.key.bias: mean=-0.0000; std=0.0016
bert.encoder.layer.0.attention.self.value.weight: mean=0.0000; std=0.0253
bert.encoder.layer.0.attention.self.value.bias: mean=0.0008; std=0.0372
bert.encoder.layer.0.attention.output.dense.weight: mean=-0.0000; std=0.0248
bert.encoder.layer.0.attention.output.dense.bias: mean=-0.0018; std=0.0330
bert.encoder.layer.0.attention.output.LayerNorm.weight: mean=0.9871; std=0.0899
bert.encoder

In [9]:
# Reset the model parameters so it can be trained from scratch, without any influence from the
# original (pre-trained) values, then display the parameters again to confirm the reset.

bert_model.apply(mpfm.reinitialize_weights)
mpfm.print_params_details(bert_model)

bert.embeddings.word_embeddings.weight: mean=0.0000; std=0.0200
bert.embeddings.position_embeddings.weight: mean=0.0001; std=0.0200
bert.embeddings.token_type_embeddings.weight: mean=0.0009; std=0.0201
bert.embeddings.LayerNorm.weight: mean=1.0000; std=0.0000
bert.embeddings.LayerNorm.bias: mean=0.0000; std=0.0000
bert.encoder.layer.0.attention.self.query.weight: mean=-0.0000; std=0.0200
bert.encoder.layer.0.attention.self.query.bias: mean=0.0000; std=0.0000
bert.encoder.layer.0.attention.self.key.weight: mean=-0.0000; std=0.0200
bert.encoder.layer.0.attention.self.key.bias: mean=0.0000; std=0.0000
bert.encoder.layer.0.attention.self.value.weight: mean=-0.0001; std=0.0200
bert.encoder.layer.0.attention.self.value.bias: mean=0.0000; std=0.0000
bert.encoder.layer.0.attention.output.dense.weight: mean=0.0000; std=0.0200
bert.encoder.layer.0.attention.output.dense.bias: mean=0.0000; std=0.0000
bert.encoder.layer.0.attention.output.LayerNorm.weight: mean=1.0000; std=0.0000
bert.encoder.laye

In [10]:
# Train on the GPU when one is available (it speeds up weight updates); otherwise stay on the CPU.

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bert_model.to(device)

cuda
cuda:0


## Passo 3 - Preparar as Ferramentas e Parâmetros de Treinamento

In [11]:
# Create the training tools: an AdamW optimizer and a linear learning-rate scheduler with 1000
# warm-up steps spanning `num_training_steps` (set in the configuration cell). Per the original
# note, these match the ones used in the training run without a watermark.

adam_optimizer = AdamW(bert_model.parameters(), lr=0.0001, betas=(0.9, 0.999), weight_decay=0.01)
lr_scheduler = get_scheduler("linear", adam_optimizer, num_warmup_steps=1000, num_training_steps=num_training_steps)

AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    initial_lr: 0.0001
    lr: 0.0
    maximize: False
    weight_decay: 0.01
)
<torch.optim.lr_scheduler.LambdaLR object at 0x000001C440FB1190>


## Passo 4 - Realizar o Treinamento

In [12]:
# Run the garbage collector and release cached CUDA memory before training starts.
gc.collect()
torch.cuda.empty_cache()

In [13]:
#Definir a função de treinamento

def treinar(bert_model, training_progress_bar):
    """Pre-train ``bert_model`` in place and return it.

    Runs optimizer steps over ``train_data_loader`` (restarting the loader each
    time it is exhausted) until ``num_training_steps`` steps have been taken.
    From ``param_save_starting_step`` onwards, the weights are snapshotted
    whenever a step's batch loss is the lowest seen so far. If the final step's
    loss is higher than the snapshot's, the snapshot is loaded back into
    ``bert_model`` itself, so the caller's reference sees the restored weights.

    Progress messages are appended to ``training_log.txt``.

    Relies on these notebook globals: ``train_data_loader``, ``device``,
    ``adam_optimizer``, ``lr_scheduler``, ``num_training_steps``,
    ``param_save_starting_step`` and ``training_substep_marker``.

    Args:
        bert_model: Model to train. It is called as ``bert_model(**batch)`` and
            the result must expose a ``.loss`` tensor.
        training_progress_bar: Object whose ``update(1)`` is called once per
            optimizer step (e.g. a ``tqdm`` bar).

    Returns:
        The same ``bert_model`` object that was passed in.

    Raises:
        RuntimeError: If ``train_data_loader`` yields no batches, which would
            otherwise make the outer loop spin forever.
    """
    current_training_step = 0
    current_training_substep = 0
    last_loss = 0.0
    best_state = None
    best_loss = None
    bert_model.train()
    # The context manager closes (and flushes) the log even if training is
    # interrupted, e.g. by a KeyboardInterrupt.
    with open("training_log.txt", mode="a", encoding="utf-8") as training_log:
        while current_training_step < num_training_steps:
            step_before_pass = current_training_step
            for batch in train_data_loader:
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = bert_model(**batch)
                loss = outputs.loss
                loss.backward()

                adam_optimizer.step()
                lr_scheduler.step()
                adam_optimizer.zero_grad()
                training_progress_bar.update(1)

                # Keep a plain float: comparisons and log lines then do not
                # depend on (or print) a tensor object.
                last_loss = loss.item()

                # NOTE(review): running the garbage collector and emptying the
                # CUDA cache on every step is kept from the original, but it is
                # costly — confirm whether it is still needed.
                gc.collect()
                torch.cuda.empty_cache()

                current_training_step += 1
                current_training_substep += 1

                if current_training_step >= param_save_starting_step:
                    if best_loss is None or best_loss > last_loss:
                        training_log.write("The current model has lower loss than the Saved one. Saving current model...\n")
                        # Snapshot the weights on the CPU instead of deep-copying
                        # the whole model, so no second model lives on the device.
                        # The loss was measured before this step's update, while
                        # the snapshot holds the weights after it.
                        best_state = {
                            name: tensor.detach().to("cpu", copy=True)
                            for name, tensor in bert_model.state_dict().items()
                        }
                        best_loss = last_loss

                if current_training_substep >= training_substep_marker:
                    training_log.write(str(current_training_step) + ": " + str(last_loss) + "\n")
                    current_training_substep = 0
                if current_training_step >= num_training_steps:
                    break
            if current_training_step == step_before_pass:
                raise RuntimeError("train_data_loader yielded no batches; cannot train.")
        # best_state stays None when the save window was never reached.
        if best_state is not None and last_loss > best_loss:
            training_log.write("The last Saved Model has a smaller Loss (" + str(best_loss) + " < " + str(last_loss) +
                               "). Setting the Final Model as the Saved Model...\n")
            # Load into the existing object: rebinding the local name would
            # leave the caller's model untouched.
            bert_model.load_state_dict(best_state)
    return bert_model

In [14]:
# Run the training, with a tqdm progress bar sized to `num_training_steps`.

treinar(bert_model, tqdm(range(num_training_steps)))

  0%|          | 0/100000 [00:00<?, ?it/s]

KeyboardInterrupt: 

## Passo 5 - Salvar o Modelo Localmente

In [None]:
# Display the model parameters again, now after training.
mpfm.print_params_details(bert_model)

In [None]:
# Save the model to the location set in the configuration cell, instead of a separate
# hard-coded path that pointed to a different directory.

bert_model.save_pretrained(watermarked_bert_model_path)