# Named entity recognition

## Utilities for training

### Get device

In [1]:
from torch.cuda import is_available as is_cuda_available
from torch.backends.mps import is_available as is_mps_available

def get_device() -> str:
    '''Returns device string with priority: cuda, mps and cpu'''
    if is_cuda_available():
        return 'cuda'
    if is_mps_available():
        return 'mps'
    return 'cpu'


### Training and evaluating utilities

In [2]:
import torch
from time import time
from torch.nn import Module
from torch.optim import Optimizer
from torch.utils.data import IterableDataset

def _train_epoch(device: str, model: Module, data: IterableDataset, loss_fn: Module, optimizer: Optimizer):
    '''Trains model for one epoch'''
    batch_amount = len(data)
    begin_time = time()
    total_loss = 0

    model.train()

    correct = 0
    total_amount = 0

    for index, (x, y) in enumerate(data, 1):
        x = x.to(device)
        y = y.to(device)

        out = model(x)

        total_amount += torch.count_nonzero(y).item()
        for i, j in zip(torch.argmax(out, dim=1), y):
            if i == j and i != 0:
                correct += 1

        loss = loss_fn(out, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        elapsed = time() - begin_time
        total_loss += loss.item()

        avarage_time = elapsed / index
        average_loss = total_loss / index

        remaining = avarage_time * (batch_amount - index)

        print(f'Train: {index}/{batch_amount} | Eta: {remaining:>0.0f}s', end=' ')
        print(f'| loss: {average_loss:>0.4f} - acc: {(correct / total_amount):>0.4f}', end='\r')

        if index == batch_amount:
            print()


def evaluate_model(device: str, model: Module, data: IterableDataset, loss_fn: Module):
    '''Evaluates model with data'''
    batch_amount = len(data)
    begin_time = time()
    total_loss = 0

    model.eval()

    correct = 0
    total_amount = 0

    with torch.no_grad():
        for index, (x, y) in enumerate(data, 1):
            x = x.to(device)
            y = y.to(device)

            out = model(x)

            total_amount += torch.count_nonzero(y).item()
            for i, j in zip(torch.argmax(out, dim=1), y):
                if i == j and i != 0:
                    correct += 1

            elapsed = time() - begin_time
            total_loss += loss_fn(out, y).item()

            avarage_time = elapsed / index
            average_loss = total_loss / index

            remaining = avarage_time * (batch_amount - index)

            print(f'Valid {index}/{batch_amount} | Eta: {remaining:>0.0f}s', end=' ')
            print(f'| loss: {average_loss:>0.4f} - acc: {(correct / total_amount):>0.4f}', end='\r')



    loss = total_loss / batch_amount
    print(f'Valid {batch_amount}/{batch_amount} | Eta: {0}s', end=' ')
    print(f'| loss: {loss:>0.4f} - acc: {(correct / total_amount):>0.4f}')


def train(device: str, model: Module, data: IterableDataset, valid: IterableDataset, loss_fn: Module, optimizer: Optimizer, epochs: int):
    '''Trains model'''
    for epoch in range(epochs):
        print(f'Epoch {epoch + 1}/{epochs}')
        _train_epoch(device, model, data, loss_fn, optimizer)
        evaluate_model(device, model, valid, loss_fn)


## Loading data

### Constants

In [3]:
BASE_PATH = '/content/drive/MyDrive/Colab Notebooks/NamedEntityRecognition'
TRAIN_DIR = 'train'
VALID_DIR = 'dev-0'
TEST_DIR = 'test-A'


### Create vocabulary utility

In [4]:
import torch
from torch import Tensor
from collections import Counter
from torchtext.vocab import vocab, Vocab

TAGS = {
    'O': 0, 'B-ORG': 1, 'I-ORG': 2, 'B-PER': 3, 'I-PER': 4, 'B-LOC': 5,
    'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8
}

TAGS_REV = {
    0: 'O', 1: 'B-ORG', 2: 'I-ORG', 3: 'B-PER', 4: 'I-PER', 5: 'B-LOC',
    6: 'I-LOC', 7: 'B-MISC', 8: 'I-MISC'
}

def create_vocabulary(data) -> Vocab:
    '''Creates vocabulary from dataset'''
    counter = Counter()
    for text in data:
        counter.update(text.split())

    vocabulary = vocab(counter, 1, ['<unk>'])
    vocabulary.set_default_index(0)

    return vocabulary


def encode(vocab: Vocab, text: str) -> Tensor:
    '''Encode word using vocabulary'''
    return torch.tensor([vocab[token] for token in text.split()], dtype=torch.long)


def encode_tag(text: str) -> Tensor:
    '''Encode tag'''
    return torch.tensor([TAGS[token] for token in text.split()], dtype=torch.long)


### Load and basic prepare

In [5]:
from os import path


def load_data(dir: str, vocab: Vocab) -> tuple[list[Tensor], list[Tensor]]:
    '''Loads data from folder'''
    labels = []
    data = []
    with open(path.join(dir, 'in.tsv'), encoding='utf8') as data_file:
        for line in data_file:
            line = line.split('\t')
            if len(line) == 2:
                labels.append(encode_tag(line[0].strip()))
                data.append(encode(vocab, line[1]))
            else:
                data.append(encode(vocab, line[0]))
    if path.exists(path.join(dir, 'expected.tsv')):
        with open(path.join(dir, 'expected.tsv'), encoding='utf8') as label_file:
            labels = [encode_tag(line.strip()) for line in label_file]
    return data, labels


### Custom dataset

In [6]:
from typing import Any, Iterator
from torch.utils.data import IterableDataset


class VariableLengthDataset(IterableDataset):
    '''Dataset that works with variable length inputs'''
    def __init__(self, data: list[Tensor], labels: Tensor) -> None:
        self.data = list(filter(lambda x: x[0].size() != torch.Size([0]), zip(data, labels)))

    def __getitem__(self, index) -> tuple[Tensor, Any]:
        return self.data[index]

    def __len__(self) -> int:
        return len(self.data)

    def __iter__(self) -> Iterator[tuple[Tensor, Any]]:
        for item in self.data:
          yield item


### Loading and encoding data

In [7]:
vocabulary = create_vocabulary(map(lambda x: x.split('\t')[1], open(path.join(BASE_PATH, TRAIN_DIR, 'in.tsv'), encoding='utf8')))


In [8]:
train_data, train_labels = load_data(path.join(BASE_PATH, TRAIN_DIR), vocabulary)
valid_data, valid_labels = load_data(path.join(BASE_PATH, VALID_DIR), vocabulary)


In [9]:
train_data = VariableLengthDataset(train_data, train_labels)
valid_data = VariableLengthDataset(valid_data, valid_labels)


## Defining model

In [10]:
DEVICE = get_device()
print(DEVICE)


cuda


In [11]:
EMBEDDING_DIM = 256
LSTM_DIM = 256
LSTM_LAYERS = 5


In [12]:
from torch.nn import Linear, Module, ReLU, Embedding, LSTM, Dropout
import torch.nn.functional as functional


class NeuralNetwork(Module):
    def __init__(self, vocab: Vocab):
        super().__init__()
        self.embedding = Embedding(len(vocab), EMBEDDING_DIM, padding_idx=0)
        self.lstm = LSTM(EMBEDDING_DIM, LSTM_DIM, num_layers=LSTM_LAYERS, batch_first=True, bidirectional=True)
        self.dropout = Dropout(0.3)
        self.linear = Linear(LSTM_DIM * 2, len(TAGS))

    def forward(self, input):
        x = self.embedding(input)
        x, _ = self.lstm(x.view(len(input), 1, -1))
        x = self.dropout(x)
        x = self.linear(x.view(len(input), -1))
        logits = functional.log_softmax(x, dim=1)
        return logits


model = NeuralNetwork(vocabulary).to(DEVICE)

print(model)


NeuralNetwork(
  (embedding): Embedding(23625, 256, padding_idx=0)
  (lstm): LSTM(256, 256, num_layers=5, batch_first=True, bidirectional=True)
  (dropout): Dropout(p=0.3, inplace=False)
  (linear): Linear(in_features=512, out_features=9, bias=True)
)


## Model training

In [13]:
from torch.nn import NLLLoss
from torch.optim import Adam

loss_fn = NLLLoss()
optimizer = Adam(model.parameters(), lr=1e-3)

train(DEVICE, model, train_data, valid_data, loss_fn, optimizer, 50)


Epoch 1/50
Train: 945/945 | Eta: 0s | loss: 0.5678 - acc: 0.0948
Valid 215/215 | Eta: 0s | loss: 0.4944 - acc: 0.1907
Epoch 2/50
Train: 945/945 | Eta: 0s | loss: 0.3578 - acc: 0.3415
Valid 215/215 | Eta: 0s | loss: 0.3819 - acc: 0.3548
Epoch 3/50
Train: 945/945 | Eta: 0s | loss: 0.2719 - acc: 0.4926
Valid 215/215 | Eta: 0s | loss: 0.3294 - acc: 0.4521
Epoch 4/50
Train: 945/945 | Eta: 0s | loss: 0.2127 - acc: 0.5921
Valid 215/215 | Eta: 0s | loss: 0.2984 - acc: 0.5050
Epoch 5/50
Train: 945/945 | Eta: 0s | loss: 0.1703 - acc: 0.6663
Valid 215/215 | Eta: 0s | loss: 0.2790 - acc: 0.5396
Epoch 6/50
Train: 945/945 | Eta: 0s | loss: 0.1469 - acc: 0.7102
Valid 215/215 | Eta: 0s | loss: 0.2751 - acc: 0.5570
Epoch 7/50
Train: 945/945 | Eta: 0s | loss: 0.1266 - acc: 0.7487
Valid 215/215 | Eta: 0s | loss: 0.2658 - acc: 0.5784
Epoch 8/50
Train: 945/945 | Eta: 0s | loss: 0.1132 - acc: 0.7756
Valid 215/215 | Eta: 0s | loss: 0.2962 - acc: 0.5832
Epoch 9/50
Train: 945/945 | Eta: 0s | loss: 0.1079 - acc

In [14]:
evaluate_model(DEVICE, model, valid_data, loss_fn)


Valid 215/215 | Eta: 0s | loss: 0.4206 - acc: 0.6771


## Saving the valid output data

In [15]:
def evaluate_model_and_save(device: str, model: Module, data: list[Tensor], dir: str):
    '''Evaluates model with data and saves results in out.tsv file'''
    batch_amount = len(data)
    begin_time = time()

    model.eval()

    with open(path.join(dir, 'out.tsv'), 'w') as out_file:
        with torch.no_grad():
            for index, x in enumerate(data, 1):
                x = x.to(device)

                out = functional.softmax(model(x), dim=1)
                label = torch.argmax(out, dim=1)

                last_label = None
                for j, i in enumerate(label):
                    if j != 0:
                        print(end=' ', file=out_file)
                    value = TAGS_REV[i.item()]
                    if value != "O" and value[0:2] == "I-":
                        if last_label is None or last_label == "O":
                            value = value.replace('I-', 'B-')
                        else:
                            value = "I-" + last_label[2:]
                    last_label = value
                    print('{}'.format(value), file=out_file, end='')
                print(file=out_file)

                elapsed = time() - begin_time

                avarage_time = elapsed / index

                remaining = avarage_time * (batch_amount - index)

                print(f'Progress {index}/{batch_amount} | Eta: {remaining:>0.0f}s', end='\r')
    print()


In [16]:
evaluate_model_and_save(DEVICE, model, [x for (x, _) in valid_data], path.join(BASE_PATH, VALID_DIR))


Progress 215/215 | Eta: 0s


In [17]:
test_data, _ = load_data(path.join(BASE_PATH, TEST_DIR), vocabulary)


In [18]:
evaluate_model_and_save(DEVICE, model, [x for x in test_data], path.join(BASE_PATH, TEST_DIR))


Progress 230/230 | Eta: 0s
