In [1]:
!pip install -q transformers
!pip install -q seqeval
!pip install -q madgrad

[K     |████████████████████████████████| 2.3MB 28.1MB/s 
[K     |████████████████████████████████| 3.3MB 40.1MB/s 
[K     |████████████████████████████████| 901kB 42.7MB/s 
[K     |████████████████████████████████| 51kB 6.7MB/s 
[?25h  Building wheel for seqeval (setup.py) ... [?25l[?25hdone


In [2]:
!git clone https://github.com/Ryzhtus/master-thesis

Cloning into 'master-thesis'...
remote: Enumerating objects: 2362, done.[K
remote: Counting objects: 100% (2362/2362), done.[K
remote: Compressing objects: 100% (2232/2232), done.[K
remote: Total 2362 (delta 234), reused 2254 (delta 126), pack-reused 0[K
Receiving objects: 100% (2362/2362), 3.57 MiB | 20.21 MiB/s, done.
Resolving deltas: 100% (234/234), done.


In [3]:
cd master-thesis

/content/master-thesis


In [4]:
import os
import collections

from transformers import BertTokenizer, BertModel
import torch
import numpy as np
import random

import torch.optim as optim
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from named_entity_recognition.utils import create_dataset_and_document_dataloader
from named_entity_recognition.train_context import train_model, test_model

from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

SEED = 42

"""torch.manual_seed(SEED)
random.seed(SEED)
numpy.random.seed(SEED)"""

def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_everything(SEED)

In [5]:
TOKENIZER = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)
DEVICE = 'cuda' if torch.cuda.is_available else 'cpu'
EPOCHS = 4
BATCH_SIZE = 32

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=29.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435797.0, style=ProgressStyle(descripti…




In [6]:
train_dataset, train_documents, train_dataloader = create_dataset_and_document_dataloader('conll', "data/conll2003/train.txt", batch_size=BATCH_SIZE, shuffle=True, tokenizer=TOKENIZER)
eval_dataset, eval_documents, eval_dataloader = create_dataset_and_document_dataloader('conll', "data/conll2003/valid.txt", batch_size=BATCH_SIZE, shuffle=True, tokenizer=TOKENIZER)
test_dataset, test_documents, test_dataloader = create_dataset_and_document_dataloader('conll', "data/conll2003/test.txt", batch_size=BATCH_SIZE, shuffle=False, tokenizer=TOKENIZER)

In [7]:
eval_dataset.idx2tag = train_dataset.idx2tag
eval_dataset.tag2idx = train_dataset.tag2idx
test_dataset.idx2tag = train_dataset.idx2tag
test_dataset.tag2idx = train_dataset.tag2idx

In [None]:
class BertNER(nn.Module):
    def __init__(self, num_classes):
        super(BertNER, self).__init__()
        self.embedding_dim = 768
        self.num_classes = num_classes

        self.bert = BertModel.from_pretrained("bert-base-cased")
        self.linear = nn.Linear(self.embedding_dim, self.num_classes)

    def forward(self, tokens):
        embeddings = self.bert(tokens)[0]
        predictions = self.linear(embeddings)

        return predictions

In [None]:
class ContextBertNER(nn.Module):
    def __init__(self, num_classes, device):
        super(ContextBertNER, self).__init__()
        self.embedding_dim = 768
        self.num_classes = num_classes
        self.device = device

        self.bert = BertModel.from_pretrained("bert-base-cased", output_hidden_states=True)
        self.lstm = nn.LSTM(self.embedding_dim * 2, self.embedding_dim, bidirectional=True)
        self.linear = nn.Linear(self.embedding_dim * 2, self.num_classes)
        self.dropout = nn.Dropout(0.1)

    def forward(self, batch):
        last_hidden_state = self.bert(batch)[0]

        result_hidden_state = last_hidden_state.clone()
        result_hidden_state = result_hidden_state.detach().cpu()

        additional_context = torch.zeros_like(result_hidden_state, requires_grad=False)

        token2embedding = {}

        for batch_element_id, tokens in enumerate(batch):
            for token_id, token in enumerate(tokens):
                token = token.item()
                if token in token2embedding.keys():
                    token2embedding[token].append(result_hidden_state[batch_element_id][token_id])
                else:
                    token2embedding[token] = [result_hidden_state[batch_element_id][token_id]]

        for token in token2embedding.keys():
            token2embedding[token] = torch.mean(torch.stack(token2embedding[token]))

        for batch_element_id, tokens in enumerate(batch):
            for token_id, token in enumerate(tokens):
                token = token.item()
                additional_context[batch_element_id][token_id] = token2embedding[token]

        additional_context = additional_context.to(self.device)
        hidden_state_with_context = torch.cat((last_hidden_state, additional_context), 2)

        predictions = self.lstm(hidden_state_with_context)[0]
        predictions = self.dropout(predictions)
        predictions = self.linear(predictions)

        return predictions

In [8]:
class DocumentContextBertNER(nn.Module):
    def __init__(self, num_classes, device):
        super(DocumentContextBertNER, self).__init__()
        self.embedding_dim = 768
        self.num_classes = num_classes
        self.device = device

        self.bert = BertModel.from_pretrained("bert-base-cased", output_hidden_states=True)
        self.lstm = nn.LSTM(self.embedding_dim * 2, self.embedding_dim, bidirectional=True)
        self.linear = nn.Linear(self.embedding_dim * 2, self.num_classes)
        self.dropout = nn.Dropout(0.1)

    def get_document_context(self, document):
        last_hidden_state = self.bert(document)[0]

        result_hidden_state = last_hidden_state.clone()
        result_hidden_state = result_hidden_state.detach().cpu()

        token2embedding = {}

        for document_element_id, tokens in enumerate(document):
            for token_id, token in enumerate(tokens):
                token = token.item()
                if token in token2embedding.keys():
                    token2embedding[token].append(result_hidden_state[document_element_id][token_id])
                else:
                    token2embedding[token] = [result_hidden_state[document_element_id][token_id]]

        for token in token2embedding.keys():
            token2embedding[token] = torch.mean(torch.stack(token2embedding[token]), dim=0)
          
        return token2embedding

    def forward(self, batch, documents_ids, mean_embeddings):
        last_hidden_state = self.bert(batch)[0]

        result_hidden_state = last_hidden_state.clone()
        result_hidden_state = result_hidden_state.detach().cpu()

        additional_context = torch.zeros_like(result_hidden_state, requires_grad=False)

        for batch_element_id, tokens in enumerate(batch):
            document_id = documents_ids[batch_element_id]
            for token_id, token in enumerate(tokens):
                token = token.item()
                additional_context[batch_element_id][token_id] = mean_embeddings[document_id][token]

        additional_context = additional_context.to(self.device)
        hidden_state_with_context = torch.cat((last_hidden_state, additional_context), 2)

        predictions = self.lstm(hidden_state_with_context)[0]
        predictions = self.dropout(predictions)
        predictions = self.linear(predictions)

        return predictions

## Non-Shuffled Sentences Batches

In [None]:
classes = len(train_dataset.ner_tags)

model = DocumentContextBertNER(classes, DEVICE).to(DEVICE)
optimizer = optim.AdamW(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss(ignore_index=0).to(DEVICE)

EPOCHS = 4

#train_epoch(model, criterion, optimizer, train_dataloader, train_dataset.tag2idx, train_dataset.idx2tag, DEVICE, documents=train_documents, scheduler=None, name='Train:')
train_model(model, criterion, optimizer, train_dataloader, eval_dataloader, train_documents, eval_documents, train_dataset.tag2idx, train_dataset.idx2tag, DEVICE, None, EPOCHS)
test_model(model, criterion, test_dataloader, train_dataset.tag2idx, train_dataset.idx2tag, DEVICE, test_documents)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=570.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435779157.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[1 / 4] Train: Loss = 0.20731, F1-score = 84.66%, Repeated Entities Accuracy = 78.59%: 100%|██████████| 439/439 [23:57<00:00,  3.28s/it]
[1 / 4] Eval : Loss 

In [None]:
classes = len(train_dataset.ner_tags)

model = DocumentContextBertNER(classes, DEVICE).to(DEVICE) 
optimizer = optim.AdamW(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss(ignore_index=0).to(DEVICE)

EPOCHS = 4

#train_epoch(model, criterion, optimizer, train_dataloader, train_dataset.tag2idx, train_dataset.idx2tag, DEVICE, documents=train_documents, scheduler=None, name='Train:')
train_model(model, criterion, optimizer, train_dataloader, eval_dataloader, train_documents, eval_documents, train_dataset.tag2idx, train_dataset.idx2tag, DEVICE, False, None, EPOCHS)
test_model(model, criterion, test_dataloader, train_dataset.tag2idx, train_dataset.idx2tag, DEVICE, test_documents)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435779157.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[1 / 4] Train: Loss = 0.96533, F1-score = 0.00%, Repeated Entities Accuracy = 0.00%:   1%|▏         | 6/439 [00:17<20:28,  2.84s/it]