# **BERT PAIR Relation Extraction Notebook**


## Imports and environment configuration

In [37]:
!pip install transformers==3.0.0
!pip install ipython-autotime

%load_ext autotime

Collecting transformers==3.0.0

  error: subprocess-exited-with-error
  
  × Building wheel for tokenizers (pyproject.toml) did not run successfully.
  │ exit code: 1
  ╰─> [48 lines of output]
      C:\Users\elsab\AppData\Local\Temp\pip-build-env-l96wj4x7\overlay\Lib\site-packages\setuptools\dist.py:314: InformationOnly: Normalizing '0.8.0.rc4' to '0.8.0rc4'
        self.metadata.version = self._normalize_version(self.metadata.version)
      running bdist_wheel
      running build
      running build_py
      creating build
      creating build\lib.win-amd64-cpython-311
      creating build\lib.win-amd64-cpython-311\tokenizers
      copying tokenizers\__init__.py -> build\lib.win-amd64-cpython-311\tokenizers
      creating build\lib.win-amd64-cpython-311\tokenizers\models
      copying tokenizers\models\__init__.py -> build\lib.win-amd64-cpython-311\tokenizers\models
      creating build\lib.win-amd64-cpython-311\tokenizers\decoders
      copying tokenizers\decoders\__init__.py -> build\lib.win-amd64-cpython-311\tok


  Using cached transformers-3.0.0-py3-none-any.whl.metadata (44 kB)
Collecting tokenizers==0.8.0-rc4 (from transformers==3.0.0)
  Using cached tokenizers-0.8.0rc4.tar.gz (96 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting sentencepiece (from transformers==3.0.0)
  Using cached sentencepiece-0.2.0-cp311-cp311-win_amd64.whl.metadata (8.3 kB)
Collecting sacremoses (from transformers==3.0.0)
  Using cached sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Collecting click (from sacremoses->transformers==3.0.0)
  Using cached click-8.1.7-py3-none-any.whl.metadata (3.0 kB)
Using cached transformers-3.0.0-py3-none-any.whl (754 kB)
Using cached sacremoses-0.1.1-py3-none-any.whl (897 kB)
Using cached

In [38]:
import os
import sys
from pathlib import Path
import torch
import torch.nn as nn
import torch.optim as optim

# Project root is the directory the notebook is started from; the local model
# packages below it are made importable by extending sys.path.
basepath = Path.cwd()
sys.path.append(str(basepath / 'models'))
sys.path.append(str(basepath / 'models' / "imported_configs"))

from model_files.modeling_bert import BertModel as Model
from tokens_files.tokenization_bert import BertTokenizer as Tokenizer

time: 16 ms (started: 2024-03-06 16:45:09 +00:00)


In [39]:
import warnings
# Suppress every warning category for the rest of the kernel session.
# NOTE(review): this also hides deprecation warnings from the pinned libraries;
# consider narrowing the filter once the noisy sources are known.
warnings.filterwarnings('ignore')

time: 0 ns (started: 2024-03-06 16:45:09 +00:00)


## Matching the Blanks Pre-Training

The pre-training process of Matching the Blanks can run for multiple days, even with GPU support. Therefore, an already pre-trained model is provided in the GitLab repository. For additional information, see the README.

In [40]:
!pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.3.1/en_core_web_lg-2.3.1.tar.gz

import en_core_web_lg

In [None]:
import os
import math
import time

time: 16 ms (started: 2024-03-06 16:12:07 +00:00)


### Pre-Training Helper functions

In [None]:
from imported_configs.helper_functions.pretrain_helper_functions import Two_Headed_Loss, pretrain_dataset, load_state, create_pretraining_corpus, process_textlines, mtb_evaluate_

time: 0 ns (started: 2024-03-06 16:12:07 +00:00)


In [None]:
def mtb_load_dataloaders(pretrain_data, batch_size, max_length=50000, window_size=40):
    """Build the Matching-the-Blanks pre-training dataset from a raw text file.

    The file is read line by line, passed through ``process_textlines``, cut
    into consecutive slices of at most ``max_length`` items, and every slice is
    handed to ``create_pretraining_corpus`` together with the spaCy pipeline.
    The collected relation statements are wrapped by ``pretrain_dataset``.

    Note:
        The function reads the module-level ``tokenizer``; that name must be
        defined before this function is *called* (not before it is defined).

    Args:
        pretrain_data: Path of the UTF-8 encoded text file to read.
        batch_size: Forwarded unchanged to ``pretrain_dataset``.
        max_length: Maximum size of each slice given to
            ``create_pretraining_corpus``. Must be at least 1.
            (Presumably measured in characters, as the progress message
            says - confirm against ``process_textlines``.)
        window_size: Forwarded to ``create_pretraining_corpus``. Defaults to
            40, the value that used to be hard-coded here.

    Returns:
        Whatever ``pretrain_dataset(D, tokenizer, batch_size=batch_size)``
        returns for the collected relation statements ``D``.

    Raises:
        ValueError: If ``max_length`` is smaller than 1.
    """
    # Validate before doing any I/O: 0 would raise ZeroDivisionError below and
    # a negative value would silently produce an empty corpus.
    if max_length < 1:
        raise ValueError("max_length must be >= 1, got %r" % (max_length,))

    print("Loading pre-training data...")
    with open(pretrain_data, "r", encoding="utf8") as f:
        text = f.readlines()

    text = process_textlines(text)

    print("Length of text (characters): %d" % len(text))
    num_chunks = math.ceil(len(text)/max_length)
    print("Splitting into %d max length chunks of size %d" % (num_chunks, max_length))
    # Lazy generator: only one slice is materialised at a time.
    text_chunks = (text[i*max_length:(i*max_length + max_length)] for i in range(num_chunks))

    D = []
    print("Loading Spacy NLP...")
    nlp = en_core_web_lg.load()

    for text_chunk in text_chunks:
        D.extend(create_pretraining_corpus(text_chunk, nlp, window_size=window_size))

    print("Total number of relation statements in pre-training corpus: %d" % len(D))

    # `tokenizer` is resolved from the notebook's global namespace at call time.
    train_set = pretrain_dataset(D, tokenizer, batch_size=batch_size)
    return train_set

time: 0 ns (started: 2024-03-06 16:12:07 +00:00)


### Pre-Training with Matching the Blanks

Definition of parameters for pre-training with Matching the Blanks

In [None]:
# Upper bound of the epoch range used by the training loop
# (`range(start_epoch, num_epochs)`).
num_epochs=2
# 1 freezes every parameter whose name does not match the `unfrozen_layers`
# list in the model-setup cell; any other value leaves all parameters trainable.
freeze=0
# Learning rate passed to the Adam optimizer.
lr=0.0001
# Maximum gradient norm passed to `clip_grad_norm_` in the training loop.
max_norm=1.0
# The loss of each batch is divided by this value before `backward()`.
gradient_acc_steps=2
# Forwarded to `mtb_load_dataloaders` / `pretrain_dataset`.
batch_size=4
# Raw text file the pre-training corpus is built from.
pretrain_data=os.path.join(basepath, "fewrel-training-data","cnn.txt")
# Checkpoint that `load_state` tries to resume from.
# NOTE(review): the training loop saves to
# checkpoint_files/pretrain/pretrain_checkpoint_BERT_1.pth.tar, i.e. a different
# directory than this path - confirm whether resuming is meant to pick up the
# file written by this notebook.
checkpoint_path = os.path.join(basepath, "checkpoint_files","pretrain_checkpoint_BERT_1.pth.tar")

time: 0 ns (started: 2024-03-06 16:12:07 +00:00)


Load the model and tokenizer, then initialize the optimizer and scheduler for the training routine.

In [None]:
# Checkpoint name and casing shared by the tokenizer and the encoder.
model_name = 'bert-base-uncased'
lower_case = True

# Tokenizer extended with the five extra tokens used by the pre-training data.
tokenizer = Tokenizer.from_pretrained(model_name, do_lower_case=lower_case)
tokenizer.add_tokens(['[E1]', '[/E1]', '[E2]', '[/E2]', '[BLANK]'])

# Encoder; its embedding matrix is resized to the enlarged vocabulary.
mtb_model = Model.from_pretrained(model_name, force_download=False)
mtb_model.resize_token_embeddings(len(tokenizer))

# Move the model to the GPU when one is available.
cuda = torch.cuda.is_available()
if cuda:
    print("Cuda is on")
    mtb_model.cuda()

# Optional partial freeze: only parameters whose name contains one of the
# listed fragments stay trainable.
if freeze == 1:
    print("FREEZING MOST HIDDEN LAYERS...")
    unfrozen_layers = [
        "classifier", "pooler", "encoder.layer.11", "encoder.layer.10",
        "encoder.layer.9", "blanks_linear", "lm_linear", "cls",
    ]

    for name, param in mtb_model.named_parameters():
        if any(layer in name for layer in unfrozen_layers):
            print("[FREE]: %s" % name)
            param.requires_grad = True
            continue
        print("[FROZE]: %s" % name)
        param.requires_grad = False

# Loss, optimizer and step-wise learning-rate decay (x0.8 at each milestone).
criterion = Two_Headed_Loss(lm_ignore_idx=tokenizer.pad_token_id, use_logits=True, normalize=False)
optimizer = optim.Adam([dict(params=mtb_model.parameters(), lr=lr)])

scheduler = optim.lr_scheduler.MultiStepLR(
    optimizer,
    milestones=[2, 4, 6, 8, 12, 15, 18, 20, 22, 24, 26, 30],
    gamma=0.8,
)

time: 3.38 s (started: 2024-03-06 16:12:07 +00:00)


Load the pre-training data from the input file.

In [None]:
# Build the pre-training dataset from `pretrain_data` via mtb_load_dataloaders.
train_loader = mtb_load_dataloaders(pretrain_data, batch_size)
# Size reported by the loader; shown in the training progress message.
train_len = len(train_loader)

Loading pre-training data...
Length of text (characters): 1041308
Splitting into 21 max length chunks of size 50000
Loading Spacy NLP...
Total number of relation statements in pre-training corpus: 14646
time: 36.6 s (started: 2024-03-06 16:12:11 +00:00)


Load checkpoint if available to continue training from this point.

In [None]:
# Try to restore model / optimizer / scheduler from `checkpoint_path`.
# `start_epoch` is the lower bound of the training loop's epoch range
# (`range(start_epoch, num_epochs)`): if it is >= `num_epochs`, the loop body
# never runs and training finishes immediately.
# NOTE(review): what `load_state` returns when no checkpoint exists is not
# visible here - presumably (0, 0); confirm in pretrain_helper_functions.
start_epoch, best_pred = load_state(mtb_model, optimizer, scheduler, checkpoint_path)

Loaded checkpoint model.
Loaded model and optimizer.
time: 1.38 s (started: 2024-03-06 16:12:47 +00:00)


Pre-training with the Matching the Blanks method. Caution: this training runs for at least 10 to 12 hours on limited hardware.

In [None]:
# Matching-the-Blanks pre-training loop.
#
# For every epoch in range(start_epoch, num_epochs) the loop iterates over
# `train_loader`, computes the two-headed loss, accumulates gradients over
# `gradient_acc_steps` processed batches, and writes a checkpoint at the end of
# the epoch. Running averages of loss and LM accuracy are printed roughly ten
# times per epoch and collected per epoch in the two lists below.
losses_per_epoch = []
accuracy_per_epoch = []

print("Starting training process...")
pad_id = tokenizer.pad_token_id
mask_id = tokenizer.mask_token_id
# Reporting window. Clamped to 1 because `len(train_loader) // 10` is 0 for
# loaders with fewer than 10 batches, which made `i % update_size` raise
# ZeroDivisionError.
update_size = max(1, len(train_loader) // 10)

for epoch in range(start_epoch, num_epochs):
    start_time = time.time()
    mtb_model.train()
    # Start every epoch from clean gradients so a partially filled accumulation
    # window from the previous epoch cannot leak into this one.
    optimizer.zero_grad()
    total_loss = 0.0
    losses_per_batch = []
    total_acc = 0.0
    lm_accuracy_per_batch = []
    # Batches back-propagated since the last optimizer step.
    pending_batches = 0

    for i, data in enumerate(train_loader, 0):
        # NOTE(review): the meaning of the unpacked fields is defined by
        # `pretrain_dataset`; only the four used below are relevant here.
        x, masked_for_pred, e1_e2_start, _, blank_labels, _, _, _, _, _ = data
        # Keep only the non-padding target ids.
        masked_for_pred = masked_for_pred[(masked_for_pred != pad_id)]
        if masked_for_pred.shape[0] == 0:
            print('Empty dataset, skipping...')
            continue
        attention_mask = (x != pad_id).float()
        token_type_ids = torch.zeros((x.shape[0], x.shape[1])).long()

        if cuda:
            x = x.cuda()
            masked_for_pred = masked_for_pred.cuda()
            attention_mask = attention_mask.cuda()
            token_type_ids = token_type_ids.cuda()

        blanks_logits, lm_logits, _ = mtb_model(x, token_type_ids=token_type_ids, attention_mask=attention_mask, Q=None,
                                                e1_e2_start=e1_e2_start)
        # Only the positions holding the mask token contribute to the LM head.
        lm_logits = lm_logits[(x == mask_id)]

        # True on the last batch index of each reporting window.
        report_now = (i % update_size) == (update_size - 1)

        loss = criterion(lm_logits, blanks_logits, masked_for_pred, blank_labels, verbose=report_now)
        # Scale so the accumulated gradient matches one large-batch gradient.
        loss = loss / gradient_acc_steps
        loss.backward()
        pending_batches += 1

        # Step only once a full accumulation window has been back-propagated.
        # The previous `i % gradient_acc_steps == 0` test stepped at i == 0,
        # i.e. after a single down-scaled micro-batch, and clipping ran on
        # partially accumulated gradients every micro-batch; clipping now
        # happens once, on the complete gradient, right before the step.
        if pending_batches >= gradient_acc_steps:
            nn.utils.clip_grad_norm_(mtb_model.parameters(), max_norm)
            optimizer.step()
            optimizer.zero_grad()
            pending_batches = 0

        total_loss += loss.item()
        total_acc += mtb_evaluate_(lm_logits, blanks_logits, masked_for_pred, blank_labels,
                                   tokenizer, print_=False)[0]

        if report_now:
            # Multiply by gradient_acc_steps to undo the scaling applied above.
            losses_per_batch.append(gradient_acc_steps*total_loss/update_size)
            lm_accuracy_per_batch.append(total_acc/update_size)
            print('[Epoch: %d, %5d/ %d points] total loss, lm accuracy per batch: %.3f, %.3f' %
                  (epoch + 1, (i + 1), train_len, losses_per_batch[-1], lm_accuracy_per_batch[-1]))
            total_loss = 0.0
            total_acc = 0.0
            print("Last batch samples (pos, neg): %d, %d" % ((blank_labels.squeeze() == 1).sum().item(),
                                                                (blank_labels.squeeze() == 0).sum().item()))

    scheduler.step()
    # No reporting window may have completed (empty loader, or every reporting
    # batch skipped); record NaN instead of dividing by zero.
    if losses_per_batch:
        losses_per_epoch.append(sum(losses_per_batch)/len(losses_per_batch))
        accuracy_per_epoch.append(sum(lm_accuracy_per_batch)/len(lm_accuracy_per_batch))
    else:
        losses_per_epoch.append(float("nan"))
        accuracy_per_epoch.append(float("nan"))
    print("Losses at Epoch %d: %.7f" % (epoch + 1, losses_per_epoch[-1]))
    print("Accuracy at Epoch %d: %.7f" % (epoch + 1, accuracy_per_epoch[-1]))

    # NOTE(review): this path contains an extra "pretrain" directory compared
    # with `checkpoint_path`, which is what `load_state` reads - confirm that
    # saving and resuming are meant to use different files.
    save_path = os.path.join(basepath, "checkpoint_files", "pretrain", "pretrain_checkpoint_BERT_1.pth.tar")
    # torch.save does not create missing directories.
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    torch.save({
            'epoch': epoch + 1,
            'state_dict': mtb_model.state_dict(),
            'best_acc': accuracy_per_epoch[-1],
            'optimizer': optimizer.state_dict(),
            'scheduler': scheduler.state_dict(),
            'amp': None
        }, save_path)

print("Finished Training!")

Starting training process...
Finished Training!
time: 0 ns (started: 2024-03-06 16:12:49 +00:00)
