# **BERT PAIR Relation Extraction Notebook**


## Imports and environment configuration

In [1]:
# Install the pinned transformers release and the cell-timing extension.
# NOTE(review): the recorded output of this cell shows the tokenizers
# 0.8.0rc4 wheel failing to build - confirm the install actually succeeded
# in your environment before running the remaining cells.
!pip install transformers==3.0.0
!pip install ipython-autotime

# autotime prints a "time: ..." line after every executed cell.
%load_ext autotime

Collecting transformers==3.0.0
  Using cached transformers-3.0.0-py3-none-any.whl.metadata (44 kB)
Collecting tokenizers==0.8.0-rc4 (from transformers==3.0.0)
  Using cached tokenizers-0.8.0rc4.tar.gz (96 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting sentencepiece (from transformers==3.0.0)
  Using cached sentencepiece-0.2.0-cp39-cp39-win_amd64.whl.metadata (8.3 kB)
Collecting sacremoses (from transformers==3.0.0)
  Using cached sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Collecting click (from sacremoses->transformers==3.0.0)
  Using cached click-8.1.7-py3-none-any.whl.metadata (3.0 kB)
Using cached transformers-3.0.0-py3-none-any.whl (754 kB)
Using cached sacremoses-0.1.1-py3-none-a

  error: subprocess-exited-with-error
  
  × Building wheel for tokenizers (pyproject.toml) did not run successfully.
  │ exit code: 1
  ╰─> [48 lines of output]
      C:\Users\elsab\AppData\Local\Temp\pip-build-env-u7530r7c\overlay\Lib\site-packages\setuptools\dist.py:314: InformationOnly: Normalizing '0.8.0.rc4' to '0.8.0rc4'
        self.metadata.version = self._normalize_version(self.metadata.version)
      running bdist_wheel
      running build
      running build_py
      creating build
      creating build\lib.win-amd64-cpython-39
      creating build\lib.win-amd64-cpython-39\tokenizers
      copying tokenizers\__init__.py -> build\lib.win-amd64-cpython-39\tokenizers
      creating build\lib.win-amd64-cpython-39\tokenizers\models
      copying tokenizers\models\__init__.py -> build\lib.win-amd64-cpython-39\tokenizers\models
      creating build\lib.win-amd64-cpython-39\tokenizers\decoders
      copying tokenizers\decoders\__init__.py -> build\lib.win-amd64-cpython-39\tokenizers

time: 0 ns (started: 2024-03-06 22:00:04 +00:00)


In [2]:
import os
import sys
import json
import random
from pathlib import Path
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

from transformers import BertTokenizer, BertForSequenceClassification



# The notebook is expected to be started from the project directory, so the
# current working directory doubles as project root and base path.
root = Path(os.getcwd())
basepath = root

print('Running locally')

# Expose models/imported_configs on sys.path before the local imports below.
sys.path.append(os.path.join(basepath, 'models', "imported_configs"))

from model_files.modeling_bert import BertModel as Model
from tokens_files.tokenization_bert import BertTokenizer as Tokenizer


  from .autonotebook import tqdm as notebook_tqdm


Running locally
time: 10.2 s (started: 2024-03-06 22:00:04 +00:00)


In [3]:
# Put the local 'models' directory on sys.path (presumably where pair.py and
# framework.py live - verify against the repository layout).
sys.path.append(os.path.join(basepath, 'models'))

from pair import Pair
from framework import FewShotREFramework
import warnings
# Suppresses every warning for the rest of the session, including
# deprecation warnings that may point at real problems.
warnings.filterwarnings('ignore')


time: 15 ms (started: 2024-03-06 22:00:14 +00:00)


## Matching the Blanks Pre-Training

The pre-training process of Matching the Blanks can run for multiple days, even with GPU support. Therefore, an already pre-trained model is provided in the GitLab repository. For additional information, see the README.

In [4]:
import en_core_web_lg

time: 406 ms (started: 2024-03-06 22:00:14 +00:00)


In [5]:
import os
import math
import time
import json
from pathlib import Path

time: 0 ns (started: 2024-03-06 22:00:15 +00:00)


Definition of parameters for pre-training with Matching the Blanks

In [7]:
# Parameters defined for pre-training with Matching the Blanks.
num_epochs = 1
freeze = 0
lr = 0.0001
max_norm = 1.0
gradient_acc_steps = 2
batch_size = 4

# Checkpoint of the pre-trained model inside <basepath>/checkpoint_files.
checkpoint_path = os.path.join(
    basepath,
    "checkpoint_files",
    "pretrain_checkpoint_BERT_1.pth.tar",
)


time: 0 ns (started: 2024-03-06 22:00:15 +00:00)


In [8]:

# FewRel data lives in ./fewrel-training-data relative to the working directory.
data_dir = os.path.join(os.getcwd(), 'fewrel-training-data/')

# File names are given without extension; FewRelDatasetPair appends ".json".
train_file, val_file, test_file = (
    os.path.join(data_dir, split_name)
    for split_name in ('train_wiki', 'val_wiki', 'val_pubmed')
)


time: 0 ns (started: 2024-03-06 22:00:15 +00:00)


# BERT PAIR

Sentence encoder class for the BERT Pair approach which manages the model and the tokenizer

In [9]:
class BERTPAIRSentenceEncoder(nn.Module):
    """Sentence encoder for the BERT-PAIR approach.

    Bundles a ``BertForSequenceClassification`` model with the tokenizer used
    to turn raw sentences into entity-marked token ids.
    """

    def __init__(self, pretrain_path, max_length):
        """Load the classification model and the tokenizer.

        Args:
            pretrain_path: name or path handed to
                ``BertForSequenceClassification.from_pretrained``.
            max_length: stored on the instance; read by the dataset to size
                its tensors.
        """
        super().__init__()
        self.bert = BertForSequenceClassification.from_pretrained(pretrain_path)
        self.max_length = max_length
        # NOTE(review): the vocabulary is always 'bert-base-uncased', whatever
        # pretrain_path is - confirm this is intended.
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    def forward(self, inputs):
        """Run the model on a batch and return the first element of its output.

        Args:
            inputs: mapping with the entries 'word', 'seg' and 'mask', passed
                to the model as input ids, ``token_type_ids`` and
                ``attention_mask`` respectively.
        """
        outputs = self.bert(
            inputs['word'],
            token_type_ids=inputs['seg'],
            attention_mask=inputs['mask'],
        )
        return outputs[0]

    def tokenize(self, raw_tokens, pos_head, pos_tail):
        """Convert a word sequence into ids with entity marker tokens inserted.

        '[unused0]' / '[unused2]' are placed around the head entity and
        '[unused1]' / '[unused3]' around the tail entity. No [CLS]/[SEP] is
        added here.

        Args:
            raw_tokens: iterable of word strings; each word is lower-cased
                before word-piece tokenization.
            pos_head: word positions of the head entity; only the first and
                last entries are used.
            pos_tail: word positions of the tail entity; only the first and
                last entries are used.

        Returns:
            List of vocabulary ids.
        """
        tokens = []
        for position, raw_token in enumerate(raw_tokens):
            # Opening markers go in front of the first word of an entity.
            if position == pos_head[0]:
                tokens.append('[unused0]')
            if position == pos_tail[0]:
                tokens.append('[unused1]')
            tokens += self.tokenizer.tokenize(raw_token.lower())
            # Closing markers follow the last word of an entity.
            if position == pos_head[-1]:
                tokens.append('[unused2]')
            if position == pos_tail[-1]:
                tokens.append('[unused3]')
        return self.tokenizer.convert_tokens_to_ids(tokens)

time: 0 ns (started: 2024-03-06 22:00:15 +00:00)


## Fine-Tuning

Defining some parameters for training of the model

In [10]:
# --- Episode layout -------------------------------------------------------
trainN = 5       # classes sampled per training episode
N = 5            # classes sampled per validation / test episode
K = 1            # support instances per class
Q = 1            # query instances per class
na_rate = 5      # na_rate * Q extra queries are drawn from non-target classes
batch_size = 4

# --- Model sizes ------------------------------------------------------------
# NOTE(review): with max_length = 3 the dataset keeps only the first three ids
# of every "[CLS] support [SEP] query [SEP]" sequence - confirm this very
# small value is intended and not a debugging leftover.
max_length = 3
hidden_size = 3

# --- Iteration counts -------------------------------------------------------
val_step = 1000
train_iter = 1000
val_iter = 1000
test_iter = 1000

# --- Checkpoint handling ----------------------------------------------------
ckpt = os.path.join(basepath, 'checkpoint_files', 'bert-pair-fewrel.pth.tar')
prefix = 'bert-pair-fewrel.pth.tar'


time: 0 ns (started: 2024-03-06 22:00:15 +00:00)


Initializing sentence encoder and model for BERT Pair

In [11]:
# Encoder built from the public bert-base-uncased weights, then wrapped in the
# Pair model.
sentence_encoder = BERTPAIRSentenceEncoder(
    pretrain_path='bert-base-uncased',
    max_length=max_length,
)

model = Pair(sentence_encoder, hidden_size=hidden_size)



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


time: 2.56 s (started: 2024-03-06 22:00:15 +00:00)


In [12]:
# Path components are passed separately so os.path.join inserts the platform's
# separator; the previous 'checkpoint_files\pretrain_...' literal relied on a
# Windows-only backslash and contained the invalid escape sequence '\p'.
checkpoint_path = os.path.join(basepath, 'checkpoint_files', 'pretrain_checkpoint_BERT_1.pth.tar')

print("Loading model pre-trained on blanks ...")
# NOTE: torch.load unpickles the file - only load checkpoints from a trusted source.
checkpoint = torch.load(checkpoint_path, map_location="cpu")
model_dict = model.state_dict()
# Keep only the checkpoint entries whose names also exist in the current model.
pretrained_dict = {k: v for k, v in checkpoint['state_dict'].items() if k in model_dict}
# NOTE(review): updating this dict does not by itself copy any weights into
# `model` (there is no load_state_dict call here). pretrained_dict is later
# handed to framework.train as load_ckpt - confirm that is where the weights
# are actually applied.
model_dict.update(pretrained_dict)


Loading model pre-trained on blanks ...
time: 1.75 s (started: 2024-03-06 22:00:18 +00:00)


Loading the training, validation and test data and initializing the FewShotREFramework with the corresponding data loaders

In [13]:
import torch.utils.data as data
class FewRelDatasetPair(data.Dataset):
    """
    FewRel Pair Dataset

    Every item is one randomly sampled episode: each query sentence is paired
    with each support sentence, and every pair is encoded as a fixed-length
    "[CLS] support [SEP] query [SEP]" id sequence.
    """
    def __init__(self, name, encoder, N, K, Q, na_rate, root, encoder_name):
        """Load ``<root>/<name>.json`` and store the episode configuration.

        Args:
            name: data file name without the ".json" suffix.
            encoder: object providing ``tokenize(tokens, pos_head, pos_tail)``,
                a ``tokenizer`` and ``max_length``.
            N: number of target classes sampled per episode.
            K: number of support instances per target class.
            Q: number of query instances per target class.
            na_rate: ``int(na_rate * Q)`` extra queries are drawn from the
                classes that are not targets of the episode.
            root: directory that contains the data file.
            encoder_name: 'bert' selects [CLS]/[SEP] and padding id 0; any
                other value selects <s>/</s> and padding id 1.
        """
        self.root = root
        path = os.path.join(root, name + ".json")
        if not os.path.exists(path):
            print("[ERROR] Data file does not exist!")
            assert(0)

        # Context manager so the file handle is closed right after parsing.
        with open(path) as json_file:
            self.json_data = json.load(json_file)

        self.classes = list(self.json_data.keys())
        self.N = N
        self.K = K
        self.Q = Q
        self.na_rate = na_rate
        self.encoder = encoder
        self.encoder_name = encoder_name
        self.max_length = encoder.max_length

    def __getraw__(self, item):
        """Tokenize one instance; entity positions come from item['h'][2][0] and item['t'][2][0]."""
        word = self.encoder.tokenize(item['tokens'],
            item['h'][2][0],
            item['t'][2][0])
        return word

    def __additem__(self, d, word, pos1, pos2, mask):
        """Append one encoded instance to the 'word', 'pos1', 'pos2' and 'mask' lists of ``d``.

        Not called by the other methods of this class.
        """
        d['word'].append(word)
        d['pos1'].append(pos1)
        d['pos2'].append(pos2)
        d['mask'].append(mask)

    def __getitem__(self, index):
        """Sample one episode; ``index`` is ignored because sampling is random.

        Returns:
            (fusion_set, query_label): ``fusion_set`` maps 'word', 'mask' and
            'seg' to lists of 1-D long tensors of length ``max_length``, one
            per (query, support) pair in query-major order. ``query_label``
            holds one label per query; the extra non-target queries get
            label ``N``.
        """
        target_classes = random.sample(self.classes, self.N)
        support = []
        query = []
        fusion_set = {'word': [], 'mask': [], 'seg': []}
        query_label = []
        Q_na = int(self.na_rate * self.Q)
        na_classes = [c for c in self.classes if c not in target_classes]

        for label, class_name in enumerate(target_classes):
            instances = self.json_data[class_name]
            indices = np.random.choice(
                    list(range(len(instances))),
                    self.K + self.Q, False)
            # The first K draws form the support set, the remaining Q are queries.
            for count, j in enumerate(indices):
                word = self.__getraw__(instances[j])
                if count < self.K:
                    support.append(word)
                else:
                    query.append(word)

            query_label += [label] * self.Q

        # NA: additional queries drawn from classes outside the episode.
        for _ in range(Q_na):
            cur_class = np.random.choice(na_classes, 1, False)[0]
            # Separate name so the `index` parameter is not shadowed.
            sample_idx = np.random.choice(
                    list(range(len(self.json_data[cur_class]))),
                    1, False)[0]
            query.append(self.__getraw__(self.json_data[cur_class][sample_idx]))
        query_label += [self.N] * Q_na

        # Special-token ids and padding id do not depend on the pair, so they
        # are looked up once instead of once per (query, support) pair.
        tokenizer = self.encoder.tokenizer
        if self.encoder_name == 'bert':
            SEP = tokenizer.convert_tokens_to_ids(['[SEP]'])
            CLS = tokenizer.convert_tokens_to_ids(['[CLS]'])
            pad_id = 0
        else:
            SEP = tokenizer.convert_tokens_to_ids(['</s>'])
            CLS = tokenizer.convert_tokens_to_ids(['<s>'])
            pad_id = 1

        for word_query in query:
            for word_support in support:
                new_word = CLS + word_support + SEP + word_query + SEP
                # Sequences longer than max_length are truncated on the right.
                used = min(self.max_length, len(new_word))
                word_tensor = torch.full((self.max_length,), pad_id, dtype=torch.long)
                word_tensor[:used] = torch.tensor(new_word[:used], dtype=torch.long)
                mask_tensor = torch.zeros((self.max_length)).long()
                mask_tensor[:used] = 1
                # Segment 0 covers [CLS] plus the support ids; everything after is segment 1.
                seg_tensor = torch.ones((self.max_length)).long()
                seg_tensor[:min(self.max_length, len(word_support) + 1)] = 0
                fusion_set['word'].append(word_tensor)
                fusion_set['mask'].append(mask_tensor)
                fusion_set['seg'].append(seg_tensor)

        return fusion_set, query_label

    def __len__(self):
        """Nominal dataset size; episodes are sampled randomly, not indexed."""
        return 10000000

time: 16 ms (started: 2024-03-06 22:00:20 +00:00)


In [14]:
def collate_fn_pair(data):
    """Merge a list of (fusion_set, query_label) episodes into one batch.

    Args:
        data: sequence of ``(fusion_set, query_label)`` tuples, where
            ``fusion_set`` maps 'word', 'seg' and 'mask' to lists of
            equally shaped tensors and ``query_label`` is a list of ints.

    Returns:
        (batch_set, batch_label): ``batch_set`` maps each field to the
        tensors of all episodes stacked along dimension 0;
        ``batch_label`` is a tensor of all labels concatenated in order.
    """
    fusion_sets, query_labels = zip(*data)

    gathered = {'word': [], 'seg': [], 'mask': []}
    flat_labels = []
    for fusion, labels in zip(fusion_sets, query_labels):
        for field, tensors in fusion.items():
            gathered[field].extend(tensors)
        flat_labels.extend(labels)

    stacked = {field: torch.stack(tensors, 0) for field, tensors in gathered.items()}
    return stacked, torch.tensor(flat_labels)

def get_loader_pair(name, encoder, N, K, Q, batch_size, 
        num_workers=0, collate_fn=collate_fn_pair, na_rate=0, root='./data', encoder_name='bert'):
    """Build a FewRelDatasetPair and return an iterator over its DataLoader.

    Args:
        name, encoder, N, K, Q, na_rate, root, encoder_name: forwarded
            unchanged to ``FewRelDatasetPair``.
        batch_size: number of episodes per batch.
        num_workers: worker processes used by the DataLoader.
        collate_fn: function that merges episodes into a batch.

    Returns:
        An iterator over the DataLoader, not the DataLoader itself.
    """
    loader = torch.utils.data.DataLoader(
        dataset=FewRelDatasetPair(name, encoder, N, K, Q, na_rate, root, encoder_name),
        batch_size=batch_size,
        shuffle=False,
        pin_memory=True,
        num_workers=num_workers,
        collate_fn=collate_fn,
    )
    return iter(loader)

time: 0 ns (started: 2024-03-06 22:00:20 +00:00)


In [15]:
# One episode iterator per split; only the file and the number of classes
# (trainN for training, N otherwise) differ between the three calls.
train_data_loader = get_loader_pair(
    train_file, sentence_encoder,
    N=trainN, K=K, Q=Q,
    na_rate=na_rate, batch_size=batch_size, encoder_name='bert',
)
val_data_loader = get_loader_pair(
    val_file, sentence_encoder,
    N=N, K=K, Q=Q,
    na_rate=na_rate, batch_size=batch_size, encoder_name='bert',
)
test_data_loader = get_loader_pair(
    test_file, sentence_encoder,
    N=N, K=K, Q=Q,
    na_rate=na_rate, batch_size=batch_size, encoder_name='bert',
)

framework = FewShotREFramework(
    train_data_loader,
    val_data_loader,
    test_data_loader,
)


time: 953 ms (started: 2024-03-06 22:00:20 +00:00)


Training the model using the provided FewShotREFramework from the authors of the FewRel dataset

In [16]:
# NOTE(review): load_ckpt receives the filtered state dict built earlier in
# this notebook rather than a file path - confirm FewShotREFramework.train
# accepts a dict here.
framework.train(
    model, prefix, batch_size, trainN, N, K, Q,
    pytorch_optim=optim.SGD,
    na_rate=na_rate,
    val_step=val_step,
    pair=True,
    train_iter=train_iter,
    val_iter=val_iter,
    bert_optim=True,
    save_ckpt=ckpt,
    load_ckpt=pretrained_dict,
)

Start training...
Use bert optim!
step:    1 | loss: 1.919570, accuracy: 10.00%
step:    2 | loss: 1.922436, accuracy: 7.50%
step:    3 | loss: 1.911219, accuracy: 10.83%
step:    4 | loss: 1.918554, accuracy: 10.63%
step:    5 | loss: 1.916086, accuracy: 10.50%
step:    6 | loss: 1.924606, accuracy: 9.17%
step:    7 | loss: 1.923395, accuracy: 8.57%
step:    8 | loss: 1.914458, accuracy: 10.00%
step:    9 | loss: 1.908876, accuracy: 11.11%
step:   10 | loss: 1.907902, accuracy: 11.25%
step:   11 | loss: 1.905148, accuracy: 10.91%
step:   12 | loss: 1.899234, accuracy: 11.25%
step:   13 | loss: 1.894172, accuracy: 11.35%
step:   14 | loss: 1.884616, accuracy: 12.32%
step:   15 | loss: 1.873880, accuracy: 12.67%
step:   16 | loss: 1.867641, accuracy: 13.59%
step:   17 | loss: 1.858376, accuracy: 14.85%
step:   18 | loss: 1.850629, accuracy: 16.67%
step:   19 | loss: 1.841219, accuracy: 18.29%
step:   20 | loss: 1.830702, accuracy: 19.62%
step:   21 | loss: 1.821647, accuracy: 20.83%
ste