In [1]:
!pip install -q transformers
!pip install -q sentencepiece
!pip install -q seqeval
!pip install -q comet_ml
!pip install -q truecase
!pip install -q pytorch-lightning

[K     |████████████████████████████████| 2.6 MB 8.4 MB/s 
[K     |████████████████████████████████| 636 kB 34.6 MB/s 
[K     |████████████████████████████████| 3.3 MB 47.9 MB/s 
[K     |████████████████████████████████| 895 kB 40.5 MB/s 
[K     |████████████████████████████████| 1.2 MB 8.1 MB/s 
[K     |████████████████████████████████| 43 kB 1.7 MB/s 
[?25h  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 282 kB 7.0 MB/s 
[K     |████████████████████████████████| 529 kB 55.3 MB/s 
[K     |████████████████████████████████| 54 kB 3.7 MB/s 
[K     |████████████████████████████████| 52 kB 1.5 MB/s 
[?25h  Building wheel for configobj (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 28.4 MB 51 kB/s 
[K     |████████████████████████████████| 915 kB 8.4 MB/s 
[K     |████████████████████████████████| 5.6 MB 46.0 MB/s 
[K     |████████████████████████████████| 829 kB 72.3 MB/s 
[K     |████████████

In [2]:
!git clone https://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@github.com/Ryzhtus/master-thesis

Cloning into 'master-thesis'...
remote: Enumerating objects: 2958, done.[K
remote: Counting objects: 100% (2958/2958), done.[K
remote: Compressing objects: 100% (2632/2632), done.[K
remote: Total 2958 (delta 693), reused 2587 (delta 322), pack-reused 0[K
Receiving objects: 100% (2958/2958), 3.79 MiB | 8.76 MiB/s, done.
Resolving deltas: 100% (693/693), done.


In [3]:
cd master-thesis/

/content/master-thesis


In [4]:
import os
import nltk
nltk.download('punkt')

from transformers import BertTokenizer, T5Tokenizer, get_linear_schedule_with_warmup
import torch
import numpy as np
import random

import torch.optim as optim
import torch.nn as nn

import torch.utils.data
import torch.optim.optimizer

from pytorch_lightning import LightningModule
from transformers import BertModel
from typing import List, Dict
from ner.utils import clear_for_metrics
from ner.document import Document
from seqeval.metrics import f1_score

from tqdm import tqdm

from ner.utils import create_dataset_and_document_dataloader, create_dataset_and_standard_dataloader

from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

# Seed applied once, right after the imports.
SEED = 693


def seed_everything(seed=42):
    """Seed every random number generator this notebook touches.

    Sets the PYTHONHASHSEED environment variable, seeds Python's `random`,
    NumPy and PyTorch (CPU generator plus the CUDA seeding calls), and sets
    the cuDNN `deterministic` / `benchmark` flags.

    Args:
        seed: integer seed handed to every generator (default 42).
    """
    # Interpreter-level and NumPy state
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch generators: CPU, current CUDA device, all CUDA devices
    for seed_fn in (torch.manual_seed, torch.cuda.manual_seed, torch.cuda.manual_seed_all):
        seed_fn(seed)

    # cuDNN: request deterministic kernels and disable the autotuner
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


seed_everything(SEED)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [5]:
# Notebook-wide configuration: tokenizer, compute device and batch size.
TOKENIZER = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)
# TOKENIZER = T5Tokenizer.from_pretrained('t5-base')
# `is_available` has to be *called*: the bare function object is always truthy,
# which made DEVICE evaluate to 'cuda' even on machines without a GPU.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
BATCH_SIZE = 16

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=29.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435797.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=570.0, style=ProgressStyle(description_…




### CoNLL

In [6]:
# CoNLL-2003 splits loaded through the document-aware helper. Each call is
# unpacked into (dataset, documents, dataloader); none of the splits is shuffled.
train_dataset, train_documents, train_dataloader = create_dataset_and_document_dataloader(
    'conll',
    'data/conll2003/train.txt',
    batch_size=BATCH_SIZE,
    shuffle=False,
    tokenizer=TOKENIZER,
)
eval_dataset, eval_documents, eval_dataloader = create_dataset_and_document_dataloader(
    'conll',
    'data/conll2003/valid.txt',
    batch_size=BATCH_SIZE,
    shuffle=False,
    tokenizer=TOKENIZER,
)
test_dataset, test_documents, test_dataloader = create_dataset_and_document_dataloader(
    'conll',
    'data/conll2003/test.txt',
    batch_size=BATCH_SIZE,
    shuffle=False,
    tokenizer=TOKENIZER,
)

In [None]:
# Alternative CoNLL-2003 loaders built with the standard (non-document) helper.
# NOTE(review): running this cell rebinds train_/eval_/test_dataset and the
# matching *_dataloader names created by the document-dataloader cell, and it
# passes 'T5' while TOKENIZER is a BertTokenizer - confirm which variant the
# training cell below is meant to consume before executing both.
train_dataset, train_dataloader = create_dataset_and_standard_dataloader(
    'T5',
    'conll',
    'data/conll2003/train.txt',
    batch_size=BATCH_SIZE,
    shuffle=True,
    tokenizer=TOKENIZER,
)
eval_dataset, eval_dataloader = create_dataset_and_standard_dataloader(
    'T5',
    'conll',
    'data/conll2003/valid.txt',
    batch_size=BATCH_SIZE,
    shuffle=True,
    tokenizer=TOKENIZER,
)
test_dataset, test_dataloader = create_dataset_and_standard_dataloader(
    'T5',
    'conll',
    'data/conll2003/test.txt',
    batch_size=BATCH_SIZE,
    shuffle=False,
    tokenizer=TOKENIZER,
)

## Model

In [13]:
class LightningBERT(LightningModule):
    """`bert-base-cased` encoder followed by dropout and a linear layer that scores every token.

    Training and validation keep their labels/predictions in separate buffers.
    Validation can run in the middle of a training epoch (e.g. with
    ``val_check_interval < 1``); sharing one buffer would mix training labels
    into the validation F1 and wipe the training buffer when validation ends.
    """

    def __init__(self, classes, idx2tag, train_documents=None, eval_documents=None, test_documents=None):
        """Build the encoder, the classification head and the metric buffers.

        Args:
            classes: number of output units of the linear head.
            idx2tag: mapping handed to ``clear_for_metrics`` (presumably
                label index -> tag string; confirm against ``ner.utils``).
            train_documents: stored on the instance; not read by this class.
            eval_documents: stored on the instance; not read by this class.
            test_documents: stored on the instance; not read by this class.
        """
        super().__init__()
        # model
        self.classes = classes
        self.idx2tag = idx2tag
        self.model = BertModel.from_pretrained('bert-base-cased')
        # Take the head's input width from the encoder config instead of hard-coding 768.
        self.linear = nn.Linear(self.model.config.hidden_size, self.classes)
        self.dropout = nn.Dropout(0.1)

        # other args
        self.train_documents = train_documents
        self.eval_documents = eval_documents
        self.test_documents = test_documents

        # true / predicted labels accumulated over the current *training* epoch
        self.epoch_true_labels = []
        self.epoch_pred_labels = []

        # true / predicted labels accumulated over the current *validation* run
        self.val_true_labels = []
        self.val_pred_labels = []

        # label id -100 is excluded from the loss
        self.criterion = nn.CrossEntropyLoss(ignore_index=-100)

    def forward(self, input_ids, attention_mask=None):
        """Return per-token class scores, shape (batch, sequence, classes)."""
        last_hidden_state = self.model(input_ids=input_ids, attention_mask=attention_mask)[0]

        predictions = self.dropout(last_hidden_state)
        predictions = self.linear(predictions)

        return predictions

    def configure_optimizers(self):
        """Optimise every parameter of the module with AdamW (lr=2e-5)."""
        optimizer = torch.optim.AdamW(self.parameters(), lr=2e-5)
        return optimizer

    def __step(self, input_ids: torch.Tensor, labels: torch.Tensor, attention_mask: torch.Tensor, words_ids: List[List[int]],
            document_ids: List[int] = None, sentences_ids: List[int] = None, document_word_embeddings: Dict = None, word_positions: Dict = None,
            freeze_bert: bool = False):
        """Run one forward pass and compute the loss and cleaned label sequences.

        Args:
            input_ids: token ids fed to the encoder.
            labels: gold label ids; positions equal to -100 are ignored by the loss.
            attention_mask: attention mask fed to the encoder.
            words_ids: forwarded unchanged to ``clear_for_metrics``.
            document_ids, sentences_ids, document_word_embeddings, word_positions:
                optional document-context inputs (see the note in the body).
            freeze_bert: when True, disable gradients for the encoder parameters.

        Returns:
            ``(loss, clear_labels, clear_predictions)`` where the last two are
            whatever ``clear_for_metrics`` returns for this batch.
        """
        if freeze_bert:
            # `self.model` is the BertModel itself; it has no `.bert` attribute,
            # so the encoder parameters are reached directly.
            for param in self.model.parameters():
                param.requires_grad = False

        if document_ids and sentences_ids and document_word_embeddings and word_positions:
            # NOTE(review): forward() in this class accepts only
            # (input_ids, attention_mask); this call would raise TypeError if the
            # branch were reached. None of the step methods below pass these
            # arguments, so it is currently unreachable from training/validation.
            predictions = self.forward(input_ids, attention_mask, document_ids, sentences_ids, document_word_embeddings,
                                    word_positions)
        else:
            predictions = self.forward(input_ids, attention_mask)

        # flatten to (tokens, classes) vs (tokens,) for the cross-entropy loss
        loss = self.criterion(predictions.view(-1, predictions.shape[-1]), labels.view(-1))

        predictions = predictions.argmax(dim=2).cpu().numpy()
        labels = labels.cpu().numpy()

        # clear <PAD>, CLS and SEP tags from both labels and predictions
        clear_labels, clear_predictions = clear_for_metrics(labels, predictions, self.idx2tag, words_ids)

        return loss, clear_labels, clear_predictions

    def __get_document_word_vectors(self, document_ids: List[int], documents: Document):
        """Collect per-document word embeddings and word positions without gradients.

        NOTE(review): relies on ``self.model.get_document_context``, which the
        plain ``BertModel`` used here does not appear to provide, and no method
        of this class calls this helper - confirm before using it.
        """
        with torch.no_grad():
            document_word_embeddings = {}
            # variable for each word's positions in each document in sentence order
            word_positions = {}

            for document_id in set(document_ids):
                document_word_embeddings[document_id] = self.model.get_document_context(
                    documents[document_id].to(self.device), documents.collect_all_positions_for_each_word(document_id))
                word_positions[document_id] = documents.get_document_words_by_sentences(document_id)

        return document_word_embeddings, word_positions

    def training_step(self, batch, _):
        """Compute the training loss and log the running span F1 of the epoch."""
        input_ids, labels, attention_mask, words_ids, _, _ = batch

        loss, true_labels, pred_labels = self.__step(input_ids, labels, attention_mask, words_ids)

        self.epoch_true_labels += true_labels
        self.epoch_pred_labels += pred_labels

        # F1 over everything accumulated so far in this training epoch
        metric = f1_score(self.epoch_true_labels, self.epoch_pred_labels)

        self.log('Training Batch Step Span F1', metric, prog_bar=True)

        return loss

    def training_epoch_end(self, _):
        """Reset the training buffers; no epoch-level training metric is logged."""
        self.epoch_true_labels = []
        self.epoch_pred_labels = []

    def validation_step(self, batch, _):
        """Compute the validation loss and log it with the running validation span F1."""
        input_ids, labels, attention_mask, words_ids, _, _ = batch

        loss, true_labels, pred_labels = self.__step(input_ids, labels, attention_mask, words_ids)

        # validation-only buffers: training labels must not leak into this metric
        self.val_true_labels += true_labels
        self.val_pred_labels += pred_labels

        metric = f1_score(self.val_true_labels, self.val_pred_labels)

        self.log('Validation Batch Step Span F1', metric, prog_bar=True)
        self.log("Validation Loss", loss, prog_bar=True)

        return loss

    def validation_epoch_end(self, _):
        """Log the span F1 of the finished validation run and reset its buffers."""
        epoch_metric = f1_score(self.val_true_labels, self.val_pred_labels)

        self.log('Validation Epoch Span F1', epoch_metric, prog_bar=True)

        # only the validation buffers are cleared; training accumulation continues
        self.val_true_labels = []
        self.val_pred_labels = []

In [14]:
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import EarlyStopping

# Classifier whose head size equals the number of entity tags in the training set.
model = LightningBERT(len(train_dataset.entity_tags), train_dataset.idx2tag)

# Early stopping watches the "Validation Loss" value logged in validation_step
# (lower is better) with a patience of 2.
early_stop_callback = EarlyStopping(
    monitor="Validation Loss",
    mode="min",
    patience=2,
    min_delta=0.0,
    verbose=True,
)

# Single-GPU run, at most 5 epochs, gradients accumulated over 10 batches,
# validation triggered every half training epoch, checkpointing disabled.
trainer = Trainer(
    max_epochs=5,
    gpus=1,
    accumulate_grad_batches=10,
    val_check_interval=0.5,
    callbacks=[early_stop_callback],
    checkpoint_callback=False,
)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [None]:
# Start training `model`: batches come from train_dataloader, and
# eval_dataloader is passed as the validation loader.
trainer.fit(model, train_dataloader, eval_dataloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type             | Params
-----------------------------------------------
0 | model     | BertModel        | 108 M 
1 | linear    | Linear           | 6.9 K 
2 | dropout   | Dropout          | 0     
3 | criterion | CrossEntropyLoss | 0     
-----------------------------------------------
108 M     Trainable params
0         Non-trainable params
108 M     Total params
433.269   Total estimated model params size (MB)


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Metric Validation Loss improved. New best score: 0.174


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…