In [1]:
!pip -qq install transformers

[K     |████████████████████████████████| 1.4MB 8.6MB/s 
[K     |████████████████████████████████| 2.9MB 25.4MB/s 
[K     |████████████████████████████████| 890kB 44.0MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


In [2]:
!git clone https://github.com/Ryzhtus/master-thesis

Cloning into 'master-thesis'...
remote: Enumerating objects: 17, done.[K
remote: Counting objects: 100% (17/17), done.[K
remote: Compressing objects: 100% (14/14), done.[K
remote: Total 17 (delta 1), reused 17 (delta 1), pack-reused 0[K
Unpacking objects: 100% (17/17), done.


In [3]:
cd master-thesis

/content/master-thesis


In [5]:
import warnings
warnings.filterwarnings('ignore')

In [40]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence


class ConLL2003Dataset(Dataset):
    def __init__(self, sentences, tags, tags_number, tokenizer):
        self.sentences = sentences
        self.sentences_tags = tags
        self.tags_number = tags_number

        self.tokenizer = tokenizer

        self.ner_tags = ['<PAD>'] + list(set(tag for tag_list in self.sentences_tags for tag in tag_list))
        self.tag2idx = {tag: idx for idx, tag in enumerate(self.ner_tags)}
        self.idx2tag = {idx: tag for idx, tag in enumerate(self.ner_tags)}


    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, item):
        words = self.sentences[item]
        tags = self.sentences_tags[item]

        tokens = []
        for word in words:
            if word not in ('[CLS]', '[SEP]'):
                tokens.extend(self.tokenizer.tokenize(word))
        
        tokens = ['[CLS]'] + tokens + ['[SEP]']
        tokens_ids = self.tokenizer.convert_tokens_to_ids(tokens)

        tags = tags + ['<PAD>'] * (len(tokens) - len(tags))
        tags_ids = [self.tag2idx[tag] for tag in tags]

        length = len(tags_ids)

        return torch.LongTensor(tokens_ids), torch.LongTensor(tags_ids)

    def paddings(self, batch):
        tokens, tags = list(zip(*batch))

        tokens = pad_sequence(tokens, batch_first=True)
        tags = pad_sequence(tags, batch_first=True)

        return tokens, tags

def read_data(filename):
    rows = open(filename, 'r').read().strip().split("\n\n")
    sentences, sentences_tags = [], []

    for sentence in rows:
        words = [line.split()[0] for line in sentence.splitlines()]
        tags = [line.split()[-1] for line in sentence.splitlines()]
        sentences.append(words)
        sentences_tags.append(tags)

    tags_number = sum([len(tag) for tag in sentences_tags])

    return sentences, sentences_tags, tags_number


def create_dataset_and_dataloader(filename, batch_size, tokenizer):
    sentences, tags, tags_number = read_data(filename)
    dataset = ConLL2003Dataset(sentences, tags, tags_number, tokenizer)

    return dataset, DataLoader(dataset, batch_size, num_workers=4, collate_fn=dataset.paddings)

In [31]:
from transformers import BertTokenizer

TOKENIZER = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)
DEVICE = 'cuda' if torch.cuda.is_available else 'cpu'
EPOCHS = 4
BATCH_SIZE = 16

In [28]:
sentences, tags, tags_number = read_data("/content/master-thesis/data/conll2003/train.txt")
dataset = ConLL2003Dataset(sentences, tags, tags_number, TOKENIZER)

In [42]:
train_dataset, train_dataloader = create_dataset_and_dataloader("/content/master-thesis/data/conll2003/train.txt", 
                                                                BATCH_SIZE, TOKENIZER)
train_tags_number = train_dataset.tags_number

eval_dataset, eval_dataloader = create_dataset_and_dataloader("/content/master-thesis/data/conll2003/valid.txt", 
                                                              BATCH_SIZE, TOKENIZER)
eval_tags_number = eval_dataset.tags_number

test_dataset, test_dataloader = create_dataset_and_dataloader("/content/master-thesis/data/conll2003/test.txt", 
                                                              BATCH_SIZE, TOKENIZER)
test_tags_number = test_dataset.tags_number

In [48]:
from sklearn.metrics import f1_score

def calculate_score(predict_tags, correct_tags):
    predicted_labels = list(predict_tags.cpu().numpy())
    correct_labels = list(correct_tags.cpu().numpy())

    predicted_labels_without_mask = []
    correct_labels_without_mask = []
    for p, c in zip(predicted_labels, correct_labels):
        if c > 1:
            predicted_labels_without_mask.append(p)
            correct_labels_without_mask.append(c)

    return f1_score(correct_labels_without_mask, predicted_labels_without_mask, average="micro")

In [55]:
import torch
import torch.nn as nn

def train_epoch(model, criterion, optimizer, data, indexer, device):
    epoch_loss = 0
    epoch_score = 0

    model.train()

    for batch in data:
        tokens = batch[0].to(device)
        tags = batch[1].to(device)

        predictions = model(tokens)
        predictions = predictions.view(-1, predictions.shape[-1])
        tags_mask = tags != indexer['<PAD>']
        tags_mask = tags_mask.view(-1)
        labels = torch.where(tags_mask, tags.view(-1), torch.tensor(criterion.ignore_index).type_as(tags))

        loss = criterion(predictions, labels)

        predictions = predictions.argmax(dim=1, keepdim=True)

        f_score = calculate_score(predictions, labels)

        epoch_loss += loss.item()
        epoch_score += f_score

        optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), 1)
        optimizer.step()
        torch.cuda.empty_cache()

    print('Train Loss = {:.5f}, F1-score = {:.3%}'.format(epoch_loss / len(data), epoch_score / len(data)))


def eval_epoch(model, criterion, data, indexer, device):
    epoch_loss = 0
    epoch_score = 0

    model.eval()

    with torch.no_grad():
        for batch in data:
            tokens = batch[0].to(device)
            tags = batch[1].to(device)

            predictions = model(tokens)
            predictions = predictions.view(-1, predictions.shape[-1])
            tags_mask = tags != indexer['<PAD>']
            tags_mask = tags_mask.view(-1)
            labels = torch.where(tags_mask, tags.view(-1), torch.tensor(criterion.ignore_index).type_as(tags))

            loss = criterion(predictions, labels)

            predictions = predictions.argmax(dim=1, keepdim=True)

            f_score = calculate_score(predictions, labels)

            epoch_loss += loss.item()
            epoch_score += f_score

    print('Test Loss = {:.5f}, F1-score = {:.3%}'.format(epoch_loss / len(data), epoch_score / len(data)))


def train_model(model, criterion, optimizer, train_data, eval_data, indexer, device, epochs=1):
    for epoch in range(epochs):
        print('Epoch {} / {}'.format(epoch + 1, epochs))
        train_epoch(model, criterion, optimizer, train_data, indexer, device)
        eval_epoch(model, criterion, eval_data, indexer, device)

In [56]:
from transformers import BertModel
import torch.nn as nn

class BertNER(nn.Module):
    def __init__(self, num_classes):
        super(BertNER, self).__init__()
        self.embedding_dim = 768
        self.num_classes = num_classes

        self.bert = BertModel.from_pretrained("bert-base-cased")
        self.linear = nn.Linear(self.embedding_dim, self.num_classes)

    def forward(self, tokens):
        embeddings = self.bert(tokens)[0]
        predictions = self.linear(embeddings)

        return predictions

In [57]:
import torch.optim as optim
from transformers import AdamW

classes = len(dataset.ner_tags)

device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = BertNER(classes).to(device)

optimizer = AdamW(model.parameters(), lr=2e-4)
criterion = nn.CrossEntropyLoss(ignore_index=0).to(device)

train_model(model, criterion, optimizer, train_dataloader, eval_dataloader, train_dataset.tag2idx, device, 4)

Epoch 1 / 4
Train Loss = 0.48028, F1-score = 86.706%
Test Loss = 0.38025, F1-score = 89.142%
Epoch 2 / 4
Train Loss = 0.34695, F1-score = 89.769%
Test Loss = 0.32511, F1-score = 90.414%
Epoch 3 / 4
Train Loss = 0.29477, F1-score = 90.893%
Test Loss = 0.29785, F1-score = 90.630%
Epoch 4 / 4
Train Loss = 0.27032, F1-score = 91.441%
Test Loss = 0.31808, F1-score = 90.294%
