# Entity Relation Extraction using R-BERT




In [2]:
# Connect to google drive (where the data is, to access it):
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


# 1. Install dependencies, import modules and load helper functions


In [3]:
! pip install transformers #

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.3-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 4.5 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.9.1-py3-none-any.whl (120 kB)
[K     |████████████████████████████████| 120 kB 10.1 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 897 kB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.9.1 tokenizers-0.12.1 transformers-4.21.3


In [4]:
# Classes for storing individual sentences:

class InputExample(object):
    """A single training/test example for simple sequence classification."""

    def __init__(self, guid, text_a, text_b=None, label=None):
        """Constructs a InputExample.

        Args:
            guid: Unique id for the example.
            text_a: string. The untokenized text of the first sequence. For single
            sequence tasks, only this sequence must be specified.
            text_b: (Optional) string. The untokenized text of the second sequence.
            Only must be specified for sequence pair tasks.
            label: (Optional) string. The label of the example. This should be
            specified for train and dev examples, but not for test examples.
        """
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        self.label = label

class InputFeatures(object):
    """A single set of features of data."""

    def __init__(self,
                 input_ids,
                 input_mask,
                 e11_p, e12_p, e21_p, e22_p,
                 e1_mask, e2_mask,
                 segment_ids,
                 label_id):
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.label_id = label_id

        #add enitity position and entity mask for BERT
        self.e11_p = e11_p
        self.e12_p = e12_p
        self.e21_p = e21_p
        self.e22_p = e22_p
        self.e1_mask = e1_mask
        self.e2_mask = e2_mask
        
    def print_contents(self):
        print(self.input_ids,self.input_mask,self.segment_ids, self.label_id,
        self.e11_p,self.e12_p,self.e21_p,
        self.e22_p,self.e1_mask, self.e2_mask)

In [5]:
# Functions for reading in the data:

import csv
import sys 
import logging

logger = logging.getLogger(__name__)

def read_tsv(input_file, quotechar=None):
    """Reads a tab separated value file."""
    with open(input_file, "r", encoding="utf-8-sig") as f:
        reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
        lines = []
        for line in reader:
            if sys.version_info[0] == 2:
                line = list(cell for cell in line)
            lines.append(line)
        return lines
      
def create_examples(lines, set_type):
    """Creates examples for the training and test sets.
  
    $AZATHIOPRINE$ is an immunosuppressive drug that is used to treat #RHEUMATOID ARTHRITIS#	8	treats2	treats1	2
    
    $ denotes first entity, # denotes second entitiy, 8 denotes type of relation and 2 denotes direction
    """
    examples = []
    for (i, line) in enumerate(lines):

        guid = "%s-%s" % (set_type, i)
        logger.info(line)
        text_a = line[1]
        text_b = None
        label = line[2]
        # print(text_a)
        examples.append(
            InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
    return examples

def get_train_examples(data_dir):
    logger.info("LOOKING AT {}".format(
        os.path.join(data_dir, "train.tsv")))
    return create_examples(
        read_tsv(os.path.join(data_dir, "train.tsv")), "train")
    

def get_test_examples(data_dir):
    return create_examples(
        read_tsv(os.path.join(data_dir, "test.tsv")), "test")

# 2. Read in the data and convert to features

In [6]:
from transformers import WEIGHTS_NAME, BertConfig, BertTokenizer

# Configuration parameters:
use_entity_indicator=True
max_seq_len=176

tokenizer = BertTokenizer.from_pretrained(
        'bert-base-uncased', do_lower_case=True)
# bert-base-uncased
n_labels = 18
labels = [str(i) for i in range(n_labels)]


Downloading vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [7]:
# BERT Class for converting the input to features according to the required input form
def convert_examples_to_features(examples, label_list, max_seq_len,
                                 tokenizer,
                                 cls_token='[CLS]',
                                 cls_token_segment_id=1,
                                 sep_token='[SEP]',
                                 pad_token=0,
                                 pad_token_segment_id=0,
                                 sequence_a_segment_id=0,
                                 sequence_b_segment_id=1,
                                 mask_padding_with_zero=True):
 
    ''' In: sentences with entities marked by $$ and ## around them
      Out: sentence represented as object of the InputFeature class '''

    label_map = {label: i for i, label in enumerate(label_list)}

    features = []
    for (ex_index, example) in enumerate(examples):
        if ex_index % 10000 == 0:
            logger.info("Writing example %d of %d" % (ex_index, len(examples)))

        tokens_a = tokenizer.tokenize(example.text_a)
        
        #convert the entity information to features as well
        l = len(tokens_a)
        # print(example.text_a)
        # the start position of entity1:
        e11_p = tokens_a.index("#") + 1  
        # the end position of entity1
        e12_p = l - tokens_a[::-1].index("#") + 1  
        # the start position of entity2
        e21_p = tokens_a.index("$") + 1  
        # the end position of entity2
        e22_p = l - tokens_a[::-1].index("$") + 1 

        tokens_b = None

        if example.text_b:
            tokens_b = tokenizer.tokenize(example.text_b)
            # Modifies `tokens_a` and `tokens_b` in place so that the total
            # length is less than the specified length.
            # Account for [CLS], [SEP], [SEP] with "- 3".
            special_tokens_count = 3
            _truncate_seq_pair(tokens_a, tokens_b,
                               max_seq_len - special_tokens_count)
        else:
            # Account for [CLS] and [SEP] with "- 2" and with "
            special_tokens_count = 2
            if len(tokens_a) > max_seq_len - special_tokens_count:
                tokens_a = tokens_a[:(max_seq_len - special_tokens_count)]

        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids:   0   0   0   0  0     0   0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0` and
        # `type=1` were learned during pre-training and are added to the wordpiece
        # embedding vector (and position vector). This is not *strictly* necessary
        # since the [SEP] token unambiguously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.
        tokens = tokens_a + [sep_token]
        segment_ids = [sequence_a_segment_id] * len(tokens)

        if tokens_b:
            tokens += tokens_b + [sep_token]
            segment_ids += [sequence_b_segment_id] * (len(tokens_b) + 1)

        tokens = [cls_token] + tokens
        segment_ids = [cls_token_segment_id] + segment_ids

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding_length = max_seq_len - len(input_ids)
        input_ids = input_ids + ([pad_token] * padding_length)
        input_mask = input_mask + \
                     ([0 if mask_padding_with_zero else 1] * padding_length)
        segment_ids = segment_ids + \
                      ([pad_token_segment_id] * padding_length)

        #add attention mask for entities as well
        e1_mask = [0 for i in range(len(input_mask))]

        e2_mask = [0 for i in range(len(input_mask))]

        for i in range(e11_p, e12_p):
            e1_mask[i] = 1
        for i in range(e21_p, e22_p):
            e2_mask[i] = 1

        assert len(input_ids) == max_seq_len
        assert len(input_mask) == max_seq_len
        assert len(segment_ids) == max_seq_len
  
        label_id = int(example.label)

        if ex_index < 5:
            logger.info("*** Example ***")
            logger.info("guid: %s" % (example.guid))
            logger.info("tokens: %s" % " ".join(
                [str(x) for x in tokens]))
            logger.info("input_ids: %s" %
                        " ".join([str(x) for x in input_ids]))
            logger.info("input_mask: %s" %
                        " ".join([str(x) for x in input_mask]))
            if use_entity_indicator:
                logger.info("e11_p: %s" % e11_p)
                logger.info("e12_p: %s" % e12_p)
                logger.info("e21_p: %s" % e21_p)
                logger.info("e22_p: %s" % e22_p)
                logger.info("e1_mask: %s" %
                            " ".join([str(x) for x in e1_mask]))
                logger.info("e2_mask: %s" %
                            " ".join([str(x) for x in e2_mask]))
            logger.info("segment_ids: %s" %
                        " ".join([str(x) for x in segment_ids]))
            logger.info("label: %s (id = %d)" % (example.label, label_id))

        features.append( InputFeatures(input_ids=input_ids,input_mask=input_mask,e11_p=e11_p,e12_p=e12_p, e21_p=e21_p, e22_p=e22_p,
                          e1_mask=e1_mask,e2_mask=e2_mask, segment_ids=segment_ids,label_id=label_id))
    return features

In [8]:
import os

# Get the training data from the data folder, hosted on google drive:
data_folder = '/content/gdrive/MyDrive/Colab Notebooks/R-BERT/data'
examples = get_train_examples(data_folder)
features = convert_examples_to_features(
    examples, labels, max_seq_len, tokenizer)

*Convert* the features to tensors and make a tensor data set

In [9]:
import torch 
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler,TensorDataset

all_input_ids = torch.tensor(
        [f.input_ids for f in features], dtype=torch.long)
all_input_mask = torch.tensor(
    [f.input_mask for f in features], dtype=torch.long)
all_segment_ids = torch.tensor(
    [f.segment_ids for f in features], dtype=torch.long)

#also for entities
all_e1_mask = torch.tensor(
    [f.e1_mask for f in features], dtype=torch.long)
all_e2_mask = torch.tensor(
    [f.e2_mask for f in features], dtype=torch.long) 

all_label_ids = torch.tensor(
        [f.label_id for f in features], dtype=torch.long)

dataset = TensorDataset(all_input_ids, all_input_mask,
                            all_segment_ids, all_label_ids, all_e1_mask, all_e2_mask)

# 3. Preparing the model

In [10]:
# Configuration parameters:

# batch size (low to save memory):
per_gpu_train_batch_size = 4
n_gpu = torch.cuda.device_count()

# the base BERT model (smaller, to save memory)
pretrained_model_name='bert-base-uncased'

# parameters for gradient descent:
max_steps=-1
gradient_accumulation_steps=1 

# Number of training epochs:
num_train_epochs=5.0

# Name of task for Bert:
task_name = 'semeval'

# hyperparameter for regularization
l2_reg_lambda=5e-3
local_rank=-1
no_cuda=False

train_batch_size = per_gpu_train_batch_size * \
        max(1, n_gpu)

# For sampling during the training:
train_sampler = RandomSampler(dataset)
train_dataloader = DataLoader(
        dataset, sampler=train_sampler, batch_size=train_batch_size)

# total number of steps for training:
t_total = len(train_dataloader) // gradient_accumulation_steps * num_train_epochs

# 4. Load the Bert customized for relation extraction (R-Bert)

In [11]:
import torch.nn as nn
import torch.nn.functional as F
from transformers import (BertModel, BertPreTrainedModel, BertTokenizer)
from torch.nn import MSELoss, CrossEntropyLoss

def l2_loss(parameters):
  '''Calculates L2 loss (euclidian length) of 'parameters' vector.'''
  return torch.sum(   torch.tensor([torch.sum(p ** 2) / 2 for p in parameters if p.requires_grad ]))


# Huggingface Transformers Class for BERT Sequence Classification
class BertForSequenceClassification(BertPreTrainedModel):
    """
        **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
            Labels for computing the sequence classification/regression loss.
            Indices should be in ``[0, ..., config.num_labels - 1]``.
            If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
            If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).

    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
            Classification (or regression if config.num_labels==1) loss.
        **logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)``
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
            of shape ``(batch_size, sequence_length, hidden_size)``:
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.

    Examples::

        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = BertForSequenceClassification.from_pretrained(
            'bert-base-uncased')
        input_ids = torch.tensor(tokenizer.encode(
            "Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids, labels=labels)
        loss, logits = outputs[:2]

    """

    def __init__(self, config):
        super(BertForSequenceClassification, self).__init__(config)
        self.num_labels = config.num_labels
        self.l2_reg_lambda = config.l2_reg_lambda
        self.bert = BertModel(config)
        self.latent_entity_typing = config.latent_entity_typing
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        classifier_size = config.hidden_size*3
        self.classifier = nn.Linear(
            classifier_size, self.config.num_labels)
        self.latent_size = config.hidden_size
        self.latent_type = nn.Parameter(torch.FloatTensor(
            3, config.hidden_size), requires_grad=True)

        self.init_weights()

    # Customized forward step, for relation extraction
    # Does the extra steps required, as described in the paper.
    # Enriching Pre-trained Language Model with Entity Information for Relation Classification https://arxiv.org/abs/1905.08284.

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, e1_mask=None, e2_mask=None, labels=None,
                position_ids=None, head_mask=None):

        outputs = self.bert(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
                            attention_mask=attention_mask, head_mask=head_mask)
        pooled_output = outputs[1]
        sequence_output = outputs[0]

        def extract_entity(sequence_output, e_mask):
            extended_e_mask = e_mask.unsqueeze(1)
            extended_e_mask = torch.bmm(
                extended_e_mask.float(), sequence_output).squeeze(1)
            return extended_e_mask.float()

        e1_h = extract_entity(sequence_output, e1_mask)
        e2_h = extract_entity(sequence_output, e2_mask)
        context = self.dropout(pooled_output)
        pooled_output = torch.cat([context, e1_h, e2_h], dim=-1)

        # Extra logit layer on top of BERT,  in order to do relation extraction:
        logits = self.classifier(pooled_output)

        # add hidden states and attention
        outputs = (logits,) + outputs[2:]

        device = logits.get_device()
        l2 = l2_loss(self.parameters())

        if device >= 0:
            l2 = l2.to(device)
        loss = l2 * self.l2_reg_lambda
        if labels is not None:

            # transform to plausible probabilities,  between 0 and 1:            
            probabilities = F.softmax(logits, dim=-1)
            log_probs = F.log_softmax(logits, dim=-1)

            # Do one hot encoding:
            one_hot_labels = F.one_hot(labels, num_classes=self.num_labels)
            if device >= 0:
                one_hot_labels = one_hot_labels.to(device)

            # Calculate loss:
            dist = one_hot_labels[:, 1:].float() * log_probs[:, 1:]
            example_loss_except_other, _ = dist.min(dim=-1)
            per_example_loss = - example_loss_except_other.mean()

            rc_probabilities = probabilities - probabilities * one_hot_labels.float()
            second_pre,  _ = rc_probabilities[:, 1:].max(dim=-1)
            rc_loss = - (1 - second_pre).log().mean()

            loss += per_example_loss + 5 * rc_loss

            outputs = (loss,) + outputs

        return outputs  # (loss), logits, (hidden_states), (attentions)

In [12]:
# Make config variable for the model:
bertconfig = BertConfig.from_pretrained(
        pretrained_model_name, num_labels=n_labels, finetuning_task=task_name)

bertconfig.l2_reg_lambda = l2_reg_lambda
bertconfig.latent_entity_typing = False
bertconfig.num_classes = n_labels

# Load the model:
model = BertForSequenceClassification.from_pretrained(
        pretrained_model_name, config=bertconfig)

Downloading pytorch_model.bin:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

# 5. Get ready for training

In [13]:
# Prepare optimizer and schedule (linear warmup and decay)

from transformers import AdamW, get_linear_schedule_with_warmup
# Hyperparameters for the optimizer:
max_grad_norm = 1.0
learning_rate=2e-5
adam_epsilon=1e-8
warmup_steps=0
weight_decay=0.9


no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)], 'weight_decay': weight_decay},
    {'params': [p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

# Load optimizer and scheduler:
optimizer = AdamW(optimizer_grouped_parameters,
                  lr=learning_rate, eps=adam_epsilon)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total)

# Parallelize in case we have multiple GPUs:
if n_gpu > 1:
    model = torch.nn.DataParallel(model)



In [14]:
# Prepare for trainig:
from tqdm import tqdm, trange
import random
import numpy as np

#  Random seed for reproducability
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

global_step = 0
tr_loss, logging_loss = 0.0, 0.0
model.zero_grad()
train_iterator = trange(int(num_train_epochs),
                        desc="Epoch", disable=local_rank not in [-1, 0])



Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

In [15]:
# put the model to the device
device = torch.device("cuda" if torch.cuda.is_available() and not no_cuda else "cpu")
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

# 6. Train!

In [16]:
# Loops through the training set for a few epochs and backpropagate

# Collect the loss values:
loss_values = []

seed = 123456
set_seed(seed)

for _ in train_iterator:
    epoch_iterator = tqdm(train_dataloader, desc="Iteration",
                          disable=local_rank not in [-1, 0])
    
    # For each epoch,  split into batches and train!

    for step, batch in enumerate(epoch_iterator):
        model.train()
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'token_type_ids': batch[2],
                  'labels':      batch[3],
                  'e1_mask': batch[4],
                  'e2_mask': batch[5],
                  }

        outputs = model(**inputs)
        # model outputs are always tuple in transformers
        
        loss = outputs[0]

        # Collect the loss:
        loss_values.append(loss)
        
        if n_gpu > 1:
            loss = loss.mean()  
            # mean() to average on multi-gpu parallel training
        if gradient_accumulation_steps > 1:
            loss = loss / gradient_accumulation_steps
        
        # Back propagate
        loss.backward()
        torch.nn.utils.clip_grad_norm_(
            model.parameters(), max_grad_norm)

        tr_loss += loss.item()
        if (step + 1) % gradient_accumulation_steps == 0:

            # Take a step! 
            optimizer.step()
            scheduler.step()              
            # Update learning rate schedule
            model.zero_grad()
            global_step += 1

        if max_steps > 0 and global_step > max_steps:
            # We're done!
            epoch_iterator.close()
            break
    if max_steps > 0 and global_step > max_steps:
        # We're done!
        train_iterator.close()
        break


Iteration:   0%|          | 0/138 [00:00<?, ?it/s][A
Iteration:   1%|          | 1/138 [00:02<06:10,  2.70s/it][A
Iteration:   1%|▏         | 2/138 [00:02<02:46,  1.22s/it][A
Iteration:   2%|▏         | 3/138 [00:03<01:39,  1.36it/s][A
Iteration:   3%|▎         | 4/138 [00:03<01:08,  1.96it/s][A
Iteration:   4%|▎         | 5/138 [00:03<00:51,  2.61it/s][A
Iteration:   4%|▍         | 6/138 [00:03<00:40,  3.24it/s][A
Iteration:   5%|▌         | 7/138 [00:03<00:34,  3.84it/s][A
Iteration:   6%|▌         | 8/138 [00:03<00:29,  4.36it/s][A
Iteration:   7%|▋         | 9/138 [00:04<00:26,  4.78it/s][A
Iteration:   7%|▋         | 10/138 [00:04<00:24,  5.14it/s][A
Iteration:   8%|▊         | 11/138 [00:04<00:23,  5.42it/s][A
Iteration:   9%|▊         | 12/138 [00:04<00:22,  5.63it/s][A
Iteration:   9%|▉         | 13/138 [00:04<00:21,  5.75it/s][A
Iteration:  10%|█         | 14/138 [00:04<00:21,  5.85it/s][A
Iteration:  11%|█         | 15/138 [00:04<00:20,  5.97it/s][A
Iteration

# 7. Save / Load model

In [17]:
# # Save the trained model:
# torch.save(model.state_dict(), '/content/gdrive/MyDrive/Colab Notebooks/R-BERT/data/das_model_train2')

In [18]:
# # Load the model, which was made on 8 GPUs (so the state_dict has a different format)
# state_dict = torch.load('/content/gdrive/My Drive/Colab Notebooks/R-BERT/data/das_model_train2')

# # Fix the format on the state_dict:

# # create new OrderedDict that does not contain `module.`
# from collections import OrderedDict
# new_state_dict = OrderedDict()
# for k, v in state_dict.items():
#     name = k[7:] # remove `module.`
#     new_state_dict[name] = v

In [19]:
# device = torch.device("cuda" if torch.cuda.is_available() and not no_cuda else "cpu")


# # Load the saved model from the state dict: 
# model = BertForSequenceClassification.from_pretrained(pretrained_model_name, config=bertconfig)
# model.load_state_dict(new_state_dict)
# model.to(device)

# 8. Evaluate!

In [20]:
# Metrics for evaluation (accuracy, f1 score),  from the official script for SemEval task-8
def acc_and_f1(preds, labels, average='macro'):
    acc = simple_accuracy(preds, labels)
    f1 = f1_score(y_true=labels, y_pred=preds, average=average)
    return {"acc": acc,
        "f1": f1,
        "acc_and_f1": (acc + f1) / 2}
    
def compute_metrics(task_name, preds, labels):
    assert len(preds) == len(labels)
    return acc_and_f1(preds, labels)

def simple_accuracy(preds, labels):
    return (preds == labels).mean()

In [21]:
# Evaluation

def evaluate(model, tokenizer, prefix=""):
    '''
    Reads the test set, makes predictions on it, saves the predictions
    returns the predictions / truth and accuracy+f1 score.
    '''
    # Loop to handle MNLI double evaluation (matched, mis-matched)

    # What kind of task it was, for BERT:
    eval_task = task_name

    # Save the evaluation metrics into results:
    results = {}

    # Load the test set and convert to features and to tensors:
    examples = get_test_examples('/content/gdrive/My Drive/Colab Notebooks/R-BERT/data')
    features = convert_examples_to_features(
        examples, labels, max_seq_len, tokenizer, "classification", use_entity_indicator)

    all_input_ids = torch.tensor(
            [f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor(
        [f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor(
        [f.segment_ids for f in features], dtype=torch.long)
    all_e1_mask = torch.tensor(
        [f.e1_mask for f in features], dtype=torch.long)  # add e1 mask
    all_e2_mask = torch.tensor(
        [f.e2_mask for f in features], dtype=torch.long)  # add e2 mask

    all_label_ids = torch.tensor(
        [f.label_id for f in features], dtype=torch.long)

    eval_dataset = TensorDataset(all_input_ids, all_input_mask,all_segment_ids, all_label_ids, all_e1_mask, all_e2_mask)

    # Size of batch per GPU:
    eval_batch_size = per_gpu_eval_batch_size * \
        max(1, n_gpu)

    # Sample and load data:
    eval_sampler = SequentialSampler(
        eval_dataset) 
    eval_dataloader = DataLoader(
        eval_dataset, sampler=eval_sampler, batch_size=eval_batch_size)


  # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    preds = None
    out_label_ids = None

    # Loop through the test set, batch by batch:

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = {'input_ids':      batch[0],
                      'attention_mask': batch[1],
                      'token_type_ids': batch[2],
                      'labels':      batch[3],
                      'e1_mask': batch[4],
                      'e2_mask': batch[5],
                      }
            outputs = model(**inputs)
            tmp_eval_loss, logits = outputs[:2]

            eval_loss += tmp_eval_loss.mean().item()
        nb_eval_steps += 1
      
        #Calcualte the probalities for ROC/AUC
        probabilities = F.softmax(logits, dim=-1)

        # Extract the predictions from the model's output:
        if preds is None:
            preds = logits.detach().cpu().numpy()
            out_label_ids = inputs['labels'].detach().cpu().numpy()
        else:
            preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
            out_label_ids = np.append(
                out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0)
            
    # Get the loss, prediction and results:
    eval_loss = eval_loss / nb_eval_steps
    preds = np.argmax(preds, axis=1)


    result = compute_metrics(eval_task, preds, out_label_ids)
    results.update(result)

    logger.info("***** Eval results {} *****".format(prefix))
    for key in sorted(result.keys()):
        logger.info("  %s = %s", key, str(result[key]))
    
    # Write results to file:
    output_eval_file = "/content/gdrive/My Drive/Colab Notebooks/R-BERT/eval/results2.txt"
    with open(output_eval_file, "w") as writer:
        print(len(preds))
        for key in range(len(preds)):
            writer.write("%d\t%s\n" %  (key, str(RELATION_LABELS[preds[key]-1])))
                
    return result, preds, out_label_ids, probabilities

In [22]:
# # Your call to model.predict() is returning the logits for softmax. This is useful for training purposes.

# # To get probabilties, you need to apply softmax on the logits.

# import torch.nn.functional as F
# logits = model.predict()
# probabilities = F.softmax(logits, dim=-1)

In [23]:
import numpy as np 
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import matthews_corrcoef, f1_score

RELATION_LABELS = ['left1-left2(e1,e2)',
'right1-right2(e1,e2)',
'l_pin1-l_pin2(e1,e2)',
'r_pin1-r_pin2(e1,e2)']

per_gpu_eval_batch_size=4

result = evaluate(model, tokenizer)
result

Evaluating: 100%|██████████| 7/7 [00:00<00:00, 23.25it/s]


26


({'acc': 0.0, 'f1': 0.0, 'acc_and_f1': 0.0},
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0]),
 array([2, 1, 2, 4, 1, 3, 2, 1, 2, 1, 2, 4, 1, 3, 2, 1, 2, 1, 2, 1, 2, 1,
        3, 2, 4, 1]),
 tensor([[nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
         [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan]],
        device='cuda:0'))

In [None]:
y_pred = result[1]
y2 = result[2]

In [None]:
sum(result[3]])

In [None]:
y_pred = [2, 2, 2, 4, 1, 3, 2, 1, 2, 2, 2, 4, 4, 3, 2, 1, 2, 1, 2, 1, 2, 1, 3, 2, 4, 1]
y2 = [2, 1, 2, 4, 1, 3, 2, 1, 2, 1, 2, 4, 1, 3, 2, 1, 2, 1, 2, 1, 2, 1, 3, 2, 4, 1]
'''({'acc': 0.8846153846153846,
  'f1': 0.924812030075188,
  'acc_and_f1': 0.9047137073452862},
 array([2, 2, 2, 4, 1, 3, 2, 1, 1, 1, 2, 4, 1, 3, 2, 1, 2, 1, 1, 1, 2, 1,
        3, 2, 4, 1]),
 array([2, 1, 2, 4, 1, 3, 2, 1, 2, 1, 2, 4, 1, 3, 2, 1, 2, 1, 2, 1, 2, 1,
        3, 2, 4, 1]),
 tensor([[2.2362e-04, 3.4219e-06, 2.1530e-02, 1.7892e-05, 9.7792e-01, 1.0060e-04,
          3.2767e-05, 5.1960e-06, 1.5300e-05, 7.5195e-06, 1.4496e-05, 3.2002e-05,
          2.7597e-06, 1.8904e-05, 2.2435e-05, 2.6208e-05, 1.7489e-05, 4.8732e-06],
         [1.9342e-10, 1.0000e+00, 8.6637e-12, 1.9156e-09, 5.2488e-10, 1.8394e-10,
          1.4103e-07, 3.7453e-10, 6.8952e-12, 9.2901e-12, 9.7561e-11, 1.8463e-10,
          3.9704e-11, 2.1947e-11, 9.5502e-10, 1.9716e-10, 2.4306e-10, 2.9897e-10]],
        device='cuda:0')) '''

In [None]:
from sklearn.metrics import precision_recall_fscore_support
precision_recall_fscore_support(y2, y_pred, average='macro')

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import numpy as np
from sklearn.metrics import roc_auc_score

In [None]:
cf_matrix = confusion_matrix(y2, y_pred)
sns.heatmap(cf_matrix/np.sum(cf_matrix), annot=True, 
            fmt='.2%', cmap='Blues')

In [None]:
def get_all_roc_coordinates(y_real, y_proba):
    '''
    Calculates all the ROC Curve coordinates (tpr and fpr) by considering each point as a treshold for the predicion of the class.
    
    Args:
        y_real: The list or series with the real classes.
        y_proba: The array with the probabilities for each class, obtained by using the `.predict_proba()` method.
        
    Returns:
        tpr_list: The list of TPRs representing each threshold.
        fpr_list: The list of FPRs representing each threshold.
    '''
    tpr_list = [0]
    fpr_list = [0]
    for i in range(len(y_proba)):
        threshold = y_proba[i]
        y_pred = y_proba >= threshold
        tpr, fpr = calculate_tpr_fpr(y_real, y_pred)
        tpr_list.append(tpr)
        fpr_list.append(fpr)
    return tpr_list, fpr_list

In [None]:
def calculate_tpr_fpr(y_real, y_pred):
    '''
    Calculates the True Positive Rate (tpr) and the True Negative Rate (fpr) based on real and predicted observations
    
    Args:
        y_real: The list or series with the real classes
        y_pred: The list or series with the predicted classes
        
    Returns:
        tpr: The True Positive Rate of the classifier
        fpr: The False Positive Rate of the classifier
    '''
    
    # Calculates the confusion matrix and recover each element
    cm = confusion_matrix(y_real, y_pred)
    TN = cm[0, 0]
    FP = cm[0, 1]
    FN = cm[1, 0]
    TP = cm[1, 1]
    
    # Calculates tpr and fpr
    tpr =  TP/(TP + FN) # sensitivity - true positive rate
    fpr = 1 - TN/(TN+FP) # 1-specificity - false positive rate
    
    return tpr, fpr

In [None]:
def plot_roc_curve(tpr, fpr, scatter = True, ax = None):
    '''
    Plots the ROC Curve by using the list of coordinates (tpr and fpr).
    
    Args:
        tpr: The list of TPRs representing each coordinate.
        fpr: The list of FPRs representing each coordinate.
        scatter: When True, the points used on the calculation will be plotted with the line (default = True).
    '''
    if ax == None:
        plt.figure(figsize = (5, 5))
        ax = plt.axes()
    
    if scatter:
        sns.scatterplot(x = fpr, y = tpr, ax = ax)
    sns.lineplot(x = fpr, y = tpr, ax = ax)
    sns.lineplot(x = [0, 1], y = [0, 1], color = 'green', ax = ax)
    plt.xlim(-0.05, 1.05)
    plt.ylim(-0.05, 1.05)
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")

In [None]:
# Plots the Probability Distributions and the ROC Curves One vs Rest
import matplotlib.pyplot as plt
import pandas as pd
plt.figure(figsize = (12, 8))
bins = [i/20 for i in range(20)] + [1]
classes = [1, 2, 3, 4]
roc_auc_ovr = {}
for i in range(len(classes)):
    # Gets the class
    c = classes[i]
    
    # Prepares an auxiliar dataframe to help with the plots
    df_aux = pd.DataFrame()
    df_aux['class'] = [1 if y == c else 0 for y in y_pred]
    df_aux['prob'] = 
    # df_aux = df_aux.reset_index(drop = True)
    
    # Plots the probability distribution for the class and the rest
    ax = plt.subplot(2, 3, i+1)
    sns.histplot(x = "prob", data = df_aux, hue = 'class', color = 'b', ax = ax, bins = bins)
    ax.set_title(c)
    ax.legend([f"Class: {c}", "Rest"])
    ax.set_xlabel(f"P(x = {c})")
    
    # Calculates the ROC Coordinates and plots the ROC Curves
    ax_bottom = plt.subplot(2, 3, i+4)
    tpr, fpr = get_all_roc_coordinates(df_aux['class'], df_aux['prob'])
    plot_roc_curve(tpr, fpr, scatter = False, ax = ax_bottom)
    ax_bottom.set_title("ROC Curve OvR")
    
    # Calculates the ROC AUC OvR
    roc_auc_ovr[c] = roc_auc_score(df_aux['class'], df_aux['prob'])
plt.tight_layout()

In [None]:
df_aux

In [None]:
# roc curve and auc
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from matplotlib import pyplot

In [None]:
auc = roc_auc_score(y2, y_pred,  multi_class='ovr')

In [None]:
import numpy as np 
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import matthews_corrcoef, f1_score

RELATION_LABELS = ['left1-left2(e1,e2)',
'right1-right2(e1,e2)',
'l_pin1-l_pin2(e1,e2)',
'r_pin1-r_pin2(e1,e2)',]

per_gpu_eval_batch_size=4

result = evaluate(model, tokenizer)
result

# Turns out that the model predicts over half of the classes correctly!


Results of the evaluation:


accuracy: 

f1-score (macro average): 

Check what the predictions were, by running through the test file sentence by sentence:

In [None]:
# dict that relates the relation name and how it appears in the text:

RELATIONZ = {
'left1-left2(e1,e2)': '1 left1 left2 2',
'right1-right2(e1,e2)': '1 right1 right2 2',
'l_pin1-l_pin2(e1,e2)': '1 l_pin1 l_pin2 2',
'r_pin1-r_pin2(e1,e2)': '1 r_pin1 rpin2 2'}

In [None]:
predictions = []
with open('/content/gdrive/My Drive/Colab Notebooks/R-BERT/eval/results2.txt') as f:
  for l in f.readlines():
    predictions.append(l.split('	')[1].strip())

In [None]:
# Check which predictions were correctly done by the model:

with open('/content/gdrive/My Drive/Colab Notebooks/R-BERT/data/test.tsv') as f:
  correct = set() 
  i = 0 
  for l in f.readlines():
    if RELATIONZ[predictions[i]] in l[-30:]:
        print(predictions[i])
        print(l[6:])
        correct.add((l,predictions[i]))
    i+=1

It seems that the model only catches the causal relationships!

In [None]:
from collections import Counter

Counter([x[1] for x in correct])

What was the distribution of relationships in the training data?



In [None]:
train_data = read_tsv('/content/gdrive/My Drive/Notebooks/R-BERT/data/train.tsv')

Counter([(x[3],x[4]) for x in train_data])

What about the distribution of relationships in the test data?

In [None]:
test_data = read_tsv('/content/gdrive/My Drive/Notebooks/R-BERT/data/test.tsv')

Counter([(x[3],x[4]) for x in test_data])

The model only identifies 'treats' twice correctly, even though it is almost as abundant as 'causes'...

What are 'treats' cases classified as?

In [None]:
treats = []
with open('/content/gdrive/My Drive/Notebooks/R-BERT/data/test.tsv') as f:
  
  for i,l in enumerate(f.readlines()):
    if 'treats' in l[-30:]:
      # it is a "treats" relation 
        treats.append(predictions[i])
        if predictions[i][:7] == 'treats2' or predictions[i][:7] == 'treats1':
          # it is predicted to be a treats relation
          print(l)
          
Counter(treats)

# Confusion matrices, precision and recall for the 10 relations

In [None]:
from sklearn.metrics import multilabel_confusion_matrix

# Grab the true predictions:

truths = []
with open('/content/gdrive/My Drive/Notebooks/R-BERT/data/test.tsv') as f:
    for l in f.readlines():
      found = False 
      for k,v in RELATIONZ.items():
        if v in l:
          truths.append(k) 

In [None]:
confusion = multilabel_confusion_matrix(truths,predictions)

for i,c in enumerate(confusion):
  print(sorted(list(RELATIONZ.keys()))[i])
  print(c)

The confusion matrices show that the accuracy of classification for each individual relation is pretty bad. 

What are the precision and recall for the 10 classes/relations?

In [None]:
from sklearn.metrics import precision_recall_fscore_support

precision,recall,_,_ = precision_recall_fscore_support(truths,predictions)

In [None]:

print('Precision    ', 'Recall')
print()
for i,p in enumerate(precision):
  print(sorted(list(RELATIONZ.keys()))[i])
  print('%.4f' % p, '      ', '%.4f' % recall[i])
  print()

# References

J.  Devlin,  M.-W.  Chang,  K.  Lee,  and  K.  Toutanova,  “Bert.”https://github.com/google-research/bert, 2018.  


T. Wolf, L. Debut, V. Sanh, J. Chaumond, C. Delangue, A. Moi, P. Cistac,T. Rault, R. Louf, M. Funtowicz, and J. Brew, “Huggingface's transformers.”https://github.com/huggingface/transformers, 2019.


H.   Wang,    “bert-relation-classification.”https://github.com/wang-h/bert-relation-classification, 2019.

In [None]:
s = "This 72 year old man attended the clinic for a routine follow up visit.   He underwent cataract surgery in his left eye and his vision has improved  Diagnosis  RE Cataract  LE Pseudophakia  RVA 6/36 ph 6/24 LVA 6/6  He has been listed for right phakoemulsification and iol and will be admitted on 8 Apr 2013."
list_of_words = s.split()
next_word = list_of_words[list_of_words.index('6/36') - 1]

In [None]:
next_word