In [None]:
import os

# Locations of the artifacts used by this notebook, relative to the working
# directory. They are built with os.path.join so the separator matches the host
# OS; on Windows the resulting strings are identical to the previous hard-coded
# backslash paths, and on other systems they now resolve correctly.
bert_model_path = os.path.join("..", "custom_models", "bert_model")  # checkpoint loaded with from_pretrained
extraction_dataset_path = os.path.join("..", "custom_datasets", "extraction_dataset")  # dataset read with load_from_disk
extraction_model_path = os.path.join("..", "custom_models", "extracted_bert_model")  # where the trained model is saved
training_log_path = "extraction_training_log.txt"  # loss log, opened in append mode by the training loop

In [1]:
import os
import sys
# Make the parent directory importable (presumably where the project-local
# ModelParamFunctionsModule lives -- confirm). The membership check keeps the
# cell idempotent: re-running it no longer appends duplicate sys.path entries.
parent_dir = os.path.realpath("../")
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

In [2]:
import gc
import torch
import ModelParamFunctionsModule as mpfm
from datasets import load_from_disk
from torch.optim import AdamW
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from transformers import BertForPreTraining, DefaultDataCollator, get_scheduler

In [3]:
# Load the dataset stored at extraction_dataset_path; its 'train' split is what
# the DataLoader cell iterates over.
extraction_dataset = load_from_disk(extraction_dataset_path)

In [5]:
# Batches of 16 shuffled examples from the 'train' split, collated into tensors
# by DefaultDataCollator.
# Host memory is pinned (targeting "cuda:0", as before) only when CUDA is really
# available. With pin_memory_device set explicitly, DataLoader does not perform
# its own availability check, so unconditional pinning breaks CPU-only runs.
pin_to_cuda = torch.cuda.is_available()
extraction_data_loader = DataLoader(
    extraction_dataset['train'],
    batch_size=16,
    shuffle=True,
    collate_fn=DefaultDataCollator(),
    pin_memory=pin_to_cuda,
    pin_memory_device="cuda:0" if pin_to_cuda else "",
)

In [6]:
# Select the GPU when one is usable, otherwise fall back to the CPU.
# is_available must be *called*: the bare function object is always truthy,
# which previously made the CPU branch unreachable.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [7]:
# Instantiate the pre-training BERT model from the checkpoint directory, then
# move its parameters to the selected device. The .to() call stays the last
# expression of the cell, so the module structure is still displayed.
bert_model = BertForPreTraining.from_pretrained(
    pretrained_model_name_or_path=bert_model_path,
)
bert_model.to(device=device)

BertForPreTraining(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [8]:
# Module.apply calls mpfm.reinitialize_weights on every submodule of the model.
# The helper is project-local; judging by the statistics printed below it resets
# the loaded weights to a fresh initialisation -- confirm against its source.
bert_model.apply(mpfm.reinitialize_weights)
# Print per-parameter mean/std so the re-initialisation can be checked by eye.
mpfm.print_params_details(bert_model)

bert.embeddings.word_embeddings.weight: mean=0.0000; std=0.0200
bert.embeddings.position_embeddings.weight: mean=-0.0001; std=0.0200
bert.embeddings.token_type_embeddings.weight: mean=0.0003; std=0.0196
bert.embeddings.LayerNorm.weight: mean=1.0000; std=0.0000
bert.embeddings.LayerNorm.bias: mean=0.0000; std=0.0000
bert.encoder.layer.0.attention.self.query.weight: mean=0.0000; std=0.0200
bert.encoder.layer.0.attention.self.query.bias: mean=0.0000; std=0.0000
bert.encoder.layer.0.attention.self.key.weight: mean=0.0000; std=0.0200
bert.encoder.layer.0.attention.self.key.bias: mean=0.0000; std=0.0000
bert.encoder.layer.0.attention.self.value.weight: mean=-0.0000; std=0.0200
bert.encoder.layer.0.attention.self.value.bias: mean=0.0000; std=0.0000
bert.encoder.layer.0.attention.output.dense.weight: mean=-0.0000; std=0.0200
bert.encoder.layer.0.attention.output.dense.bias: mean=0.0000; std=0.0000
bert.encoder.layer.0.attention.output.LayerNorm.weight: mean=1.0000; std=0.0000
bert.encoder.laye

In [9]:
# One optimizer step per batch, so a single pass over the loader defines the
# length of the learning-rate schedule.
num_training_steps = len(extraction_data_loader)

# AdamW over every model parameter.
adam_optimizer = AdamW(
    params=bert_model.parameters(),
    lr=1e-4,
    betas=(0.9, 0.999),
    weight_decay=0.01,
)

# "linear" schedule with 1000 warm-up steps, spanning num_training_steps.
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=adam_optimizer,
    num_warmup_steps=1000,
    num_training_steps=num_training_steps,
)

In [10]:
def treinar(training_progress_bar, stopping_point = 0, log_interval = 2000):
    """Train ``bert_model`` on ``extraction_data_loader`` ("treinar" = "train").

    Relies on the notebook-level globals ``bert_model``, ``extraction_data_loader``,
    ``adam_optimizer``, ``lr_scheduler``, ``device`` and ``training_log_path``.

    Args:
        training_progress_bar: object with an ``update(n)`` method (e.g. a tqdm
            bar); it is advanced by one after every optimizer step.
        stopping_point: number of optimizer steps to run. ``0`` (default) means
            exactly one pass over the data loader; a positive value keeps
            cycling over the loader until that many steps have been taken.
        log_interval: append a ``"<step>: <loss>"`` line to the training log
            every this many steps (default 2000, the previous fixed value).

    Side effects:
        Updates the model/optimizer/scheduler in place, appends to the file at
        ``training_log_path`` and prints the step count and last loss after
        every pass over the loader. The loss is written as a plain number
        rather than as a tensor repr.
    """
    def _loss_as_number(value):
        # Before the first batch the loss is still the placeholder int 0.
        return value.item() if torch.is_tensor(value) else value

    current_training_step = 0
    steps_since_last_log = 0
    bert_model.train()
    # The context manager closes (and flushes) the log even when a step raises,
    # e.g. on a CUDA out-of-memory error, so already written lines are not lost.
    with open(training_log_path, mode="a", encoding="utf-8") as training_log:
        while stopping_point == 0 or current_training_step < stopping_point:
            loss = 0
            steps_at_pass_start = current_training_step
            for batch in extraction_data_loader:
                # 'idx' is not a model input; tolerate datasets that lack it.
                batch.pop('idx', None)
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = bert_model(**batch)
                outputs.loss.backward()

                # Keep only the detached scalar loss and drop the reference to
                # the full model output (which also carries the logits), so it
                # is not kept alive during the next forward pass.
                loss = outputs.loss.detach()
                del outputs

                adam_optimizer.step()
                lr_scheduler.step()
                adam_optimizer.zero_grad()
                training_progress_bar.update(1)

                gc.collect()
                torch.cuda.empty_cache()

                current_training_step += 1
                steps_since_last_log += 1
                if steps_since_last_log >= log_interval:
                    training_log.write(str(current_training_step) + ": " + str(_loss_as_number(loss)) + "\n")
                    steps_since_last_log = 0
                if stopping_point > 0 and current_training_step >= stopping_point:
                    break
            print(current_training_step, ": ", _loss_as_number(loss))
            # Stop after one pass in single-pass mode, and also when a pass made
            # no progress (empty loader), which would otherwise loop forever.
            if stopping_point == 0 or current_training_step == steps_at_pass_start:
                break

In [11]:
# Run one pass of training. The bar is advanced manually inside treinar, so it
# is given a total instead of a dummy iterable, and it is used as a context
# manager so it gets closed even if training raises.
with tqdm(total=num_training_steps) as training_progress_bar:
    treinar(training_progress_bar)

  0%|          | 0/5000 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 908.00 MiB. GPU 0 has a total capacity of 4.00 GiB of which 0 bytes is free. Of the allocated memory 10.24 GiB is allocated by PyTorch, and 37.90 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [13]:
# Print per-parameter mean/std again so they can be compared with the values
# shown right after re-initialisation.
mpfm.print_params_details(bert_model)

bert.embeddings.word_embeddings.weight: mean=-0.0000; std=0.0331
bert.embeddings.position_embeddings.weight: mean=0.0000; std=0.0195
bert.embeddings.token_type_embeddings.weight: mean=-0.0002; std=0.0155
bert.embeddings.LayerNorm.weight: mean=0.8229; std=0.0844
bert.embeddings.LayerNorm.bias: mean=0.0001; std=0.0272
bert.encoder.layer.0.attention.self.query.weight: mean=-0.0000; std=0.0299
bert.encoder.layer.0.attention.self.query.bias: mean=-0.0020; std=0.0303
bert.encoder.layer.0.attention.self.key.weight: mean=-0.0000; std=0.0297
bert.encoder.layer.0.attention.self.key.bias: mean=-0.0003; std=0.0081
bert.encoder.layer.0.attention.self.value.weight: mean=-0.0000; std=0.0226
bert.encoder.layer.0.attention.self.value.bias: mean=-0.0017; std=0.0290
bert.encoder.layer.0.attention.output.dense.weight: mean=-0.0000; std=0.0224
bert.encoder.layer.0.attention.output.dense.bias: mean=-0.0019; std=0.0438
bert.encoder.layer.0.attention.output.LayerNorm.weight: mean=0.8721; std=0.0613
bert.encod

In [14]:
# Write the model in its current state to extraction_model_path.
bert_model.save_pretrained(extraction_model_path)