# Распознавание именованных сущностей

# Начало работы

## Загрузка датасета

In [None]:
!pip install datasets
!pip install seqeval

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, dill, multiprocess, datasets
Successfully installed datasets-2

In [None]:
from datasets import load_dataset

# Load the "es" configuration of the CoNLL-2002 dataset from the Hugging Face Hub.
dataset = load_dataset("conll2002", "es")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading data:   0%|          | 0.00/1.21M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/251k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/237k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8324 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1916 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1518 [00:00<?, ? examples/s]

In [None]:
# Display the available splits with their features and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'ner_tags'],
        num_rows: 8324
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'ner_tags'],
        num_rows: 1916
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'ner_tags'],
        num_rows: 1518
    })
})

## NER- и POS-теги: подготовительные шаги

Составим словари, сопоставляющие числовую и текстовую метки для каждого из наборов тегов.
Ниже также приведена общая информация касательно ner-тегов со страницы датасета.

In [None]:
from collections import defaultdict

def my_dict_construction(list_of_tags):
    """Build an index-to-tag lookup table from an ordered list of tag names.

    Args:
        list_of_tags: Sequence of tag names; the position of each name in
            the sequence becomes its integer key.

    Returns:
        A plain ``dict`` mapping every position to the tag name at it.
    """
    # A plain dict (instead of ``defaultdict(dict)``) makes an unknown tag id
    # raise ``KeyError`` rather than silently yielding ``{}`` and growing the
    # mapping as a side effect of the lookup.
    return dict(enumerate(list_of_tags))

# Tag names ordered by the numeric ids stored in the dataset's `pos_tags` / `ner_tags` columns.
pos_tags_list = ['AO', 'AQ', 'CC', 'CS', 'DA', 'DE', 'DD', 'DI', 'DN', 'DP', 'DT', 'Faa', 'Fat', 'Fc', 'Fd', 'Fe', 'Fg', 'Fh', 'Fia', 'Fit', 'Fp', 'Fpa', 'Fpt', 'Fs', 'Ft', 'Fx', 'Fz', 'I', 'NC', 'NP', 'P0', 'PD', 'PI', 'PN', 'PP', 'PR', 'PT', 'PX', 'RG', 'RN', 'SP', 'VAI', 'VAM', 'VAN', 'VAP', 'VAS', 'VMG', 'VMI', 'VMM', 'VMN', 'VMP', 'VMS', 'VSG', 'VSI', 'VSM', 'VSN', 'VSP', 'VSS', 'Y', 'Z']
ner_tags_list = ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-MISC", "I-MISC"]

# Numeric id -> tag name lookup tables, one per tag set.
pos_tags_dict = my_dict_construction(pos_tags_list)
ner_tags_dict = my_dict_construction(ner_tags_list)

### NER-теги: общая информация

Формат стандартный:
*   **B** обозначает первый элемент фразы
*   **I** — любой неначальный.

Существует четыре типа фраз:

*   имена людей (**PER**),
*   организации (**ORG**),
*   локации (**LOC**),
*   другие имена (**MISC**).

Предполагается, что именованные сущности не рекурсивны и не перекрываются. Если именованная сущность встроена в другую именованную сущность, обычно отмечается только объект верхнего уровня.

## Чтение данных

In [None]:
import random

def read_NER_data(subset):
    """Turn a dataset split into a list of sentences of token records.

    Every kept sentence becomes a list of dicts with the keys ``"word"``,
    ``"tag"`` and ``"label"``; the numeric POS and NER ids are translated
    through the module-level ``pos_tags_dict`` and ``ner_tags_dict``.
    Sentences whose token list is empty or is exactly ``["-"]`` are dropped.
    """
    parsed = []
    for record in subset:
        tokens = record["tokens"]
        if tokens != ["-"] and tokens != []:
            parsed.append([
                {
                    "word": token,
                    "tag": pos_tags_dict[record["pos_tags"][position]],
                    "label": ner_tags_dict[record["ner_tags"][position]],
                }
                for position, token in enumerate(tokens)
            ])
    return parsed

In [None]:
# Inspect the raw form of the first test example (numeric tag ids).
dataset["test"][0]

{'id': '0',
 'tokens': ['La', 'Coruña', ',', '23', 'may', '(', 'EFECOM', ')', '.'],
 'pos_tags': [4, 28, 13, 59, 28, 21, 29, 22, 20],
 'ner_tags': [5, 6, 0, 0, 0, 0, 3, 0, 0]}

In [None]:
# Convert all three splits into lists of token records, then show the first test sentence.
train_sents, dev_sents, test_sents = (
    read_NER_data(dataset[split_name])
    for split_name in ("train", "validation", "test")
)
for i, elem in enumerate(test_sents[0]):
    print(i, elem)

0 {'word': 'La', 'tag': 'DA', 'label': 'B-LOC'}
1 {'word': 'Coruña', 'tag': 'NC', 'label': 'I-LOC'}
2 {'word': ',', 'tag': 'Fc', 'label': 'O'}
3 {'word': '23', 'tag': 'Z', 'label': 'O'}
4 {'word': 'may', 'tag': 'NC', 'label': 'O'}
5 {'word': '(', 'tag': 'Fpa', 'label': 'O'}
6 {'word': 'EFECOM', 'tag': 'NP', 'label': 'B-ORG'}
7 {'word': ')', 'tag': 'Fpt', 'label': 'O'}
8 {'word': '.', 'tag': 'Fp', 'label': 'O'}


0 {'word': 'En', 'tag': 'ADP', 'label': 'O'}

1 {'word': 'el', 'tag': 'DET', 'label': 'O'}

2 {'word': 'escrito', 'tag': 'ADJ', 'label': 'O'}

# Базовая версия

## Предобработка данных

In [None]:
class Vocabulary:
    """Map symbols to integer codes, with four reserved special symbols.

    Codes 0-3 are always ``<PAD>``, ``<UNK>``, ``<BEGIN>`` and ``<END>``;
    symbols learned by :meth:`fit` follow in first-seen order.
    """

    def __init__(self, add_begin=True, add_end=True, min_count=1):
        """Store the encoding options.

        Args:
            add_begin: Prepend the ``<BEGIN>`` code to every encoded sequence.
            add_end: Append the ``<END>`` code to every encoded sequence.
            min_count: Minimum number of sequences a symbol must occur in
                to receive its own code.
        """
        self.add_begin = add_begin
        self.add_end = add_end
        self.min_count = min_count

    def fit(self, data):
        """Learn the symbol inventory from an iterable of symbol sequences.

        A symbol is counted once per sequence it appears in. Symbols whose
        count reaches ``min_count`` are appended after the special symbols.

        Returns:
            ``self``, so the call can be chained.
        """
        self.symbols_ = ["<PAD>", "<UNK>", "<BEGIN>", "<END>"]
        symbol_counts = defaultdict(int)
        for text in data:
            # ``dict.fromkeys`` removes duplicates inside one sequence while
            # keeping first-seen order; iterating a ``set`` here would make
            # the resulting codes depend on string hash order and therefore
            # differ between interpreter runs.
            for letter in dict.fromkeys(text):
                symbol_counts[letter] += 1
        self.symbols_ += [letter for letter, count in symbol_counts.items() if count >= self.min_count]
        self.symbol_codes_ = {letter: index for index, letter in enumerate(self.symbols_)}
        return self

    @property
    def unk(self):
        """Code of the ``<UNK>`` symbol."""
        return self.symbol_codes_["<UNK>"]

    @property
    def begin(self):
        """Code of the ``<BEGIN>`` symbol."""
        return self.symbol_codes_["<BEGIN>"]

    @property
    def end(self):
        """Code of the ``<END>`` symbol."""
        return self.symbol_codes_["<END>"]

    def __call__(self, data):
        """Encode one sequence, or a list of sequences, into integer codes.

        A non-empty list whose first element is not a string is treated as a
        batch and encoded element-wise. Anything else (including an empty
        list) is treated as a single sequence; symbols without a code are
        mapped to ``<UNK>``.
        """
        # ``data and ...`` guards the ``data[0]`` probe against an empty list.
        if isinstance(data, list) and data and not isinstance(data[0], str):
            return [self.__call__(text) for text in data]
        indexes = [self.symbol_codes_.get(symbol, self.unk) for symbol in data]
        if self.add_begin:
            indexes = [self.begin] + indexes
        if self.add_end:
            indexes = indexes + [self.end]
        return indexes

In [None]:
import torch
from torch.utils.data import Dataset

class SequenceDataset(Dataset):
    """Dataset of sentences encoded field-by-field with per-field vocabularies.

    ``data`` is a list of sentences; each sentence is a list of dicts keyed
    by field name. Every item is returned as a dict of ``int64`` tensors
    (one per field, plus ``"mask"``) and the integer ``"index"`` of the item.
    """

    def __init__(self, data, vocabs=None, fields=None, vocab_params=None,
                 add_begin=False, add_end=False,
                 device="cuda"):
        """Store the data and, if needed, fit one vocabulary per field.

        Args:
            data: List of sentences (lists of per-token dicts).
            vocabs: Ready-made mapping ``field -> vocabulary``. When omitted,
                vocabularies are fitted on ``data`` for every key of ``fields``.
            fields: Mapping ``field -> output key`` used to rename fields in
                the returned items. Required when ``vocabs`` is omitted.
            vocab_params: Optional mapping ``field -> Vocabulary kwargs``.
            add_begin: Whether sequences get a leading ``<BEGIN>`` position.
            add_end: Whether sequences get a trailing ``<END>`` position.
            device: Device the item tensors are moved to.

        Raises:
            ValueError: If neither ``vocabs`` nor ``fields`` is given.
        """
        vocab_params = vocab_params or dict()
        self.add_begin = add_begin
        self.add_end = add_end

        if vocabs is None:
            if fields is None:
                raise ValueError("You should pass `fields` to train `vocabs` if `vocabs` are not available.")
            vocabs = dict()
            for field in fields:
                # Work on a copy so the caller's ``vocab_params`` entries are
                # not modified by the add_begin/add_end overrides below.
                curr_vocab_params = dict(vocab_params.get(field, {}))
                curr_vocab_params["add_begin"] = add_begin
                curr_vocab_params["add_end"] = add_end
                vocab = Vocabulary(**curr_vocab_params)
                data_for_vocab = [[elem[field] for elem in sent] for sent in data]
                vocabs[field] = vocab.fit(data_for_vocab)
        self.fields = fields
        self.vocabs = vocabs
        self.data = data
        self.device = device

    def _make_mask(self, item):
        """Return a per-position flag list: True for tokens, False for BEGIN/END.

        The mask follows this dataset's ``add_begin``/``add_end`` flags, so
        they have to agree with the settings of the vocabularies in use.
        """
        answer = [True for _ in item]
        if self.add_begin:
            answer = [False] + answer
        if self.add_end:
            answer.append(False)
        return answer

    def __getitem__(self, index):
        """Encode sentence ``index`` with every vocabulary and build its mask."""
        # ``fields`` may legitimately be None when ready vocabularies were
        # passed in; fields then keep their own names.
        field_names = self.fields or {}
        answer = dict()
        for field, vocab in self.vocabs.items():
            answer_field = field_names.get(field, field)
            answer[answer_field] = vocab([elem[field] for elem in self.data[index]])
        answer["mask"] = self._make_mask(self.data[index])
        answer = {key: torch.tensor(value, dtype=torch.int64).to(self.device) for key, value in answer.items()}
        answer["index"] = index
        return answer

    def __len__(self):
        """Number of sentences in the dataset."""
        return len(self.data)

In [None]:
from typing import Dict

# Fit the vocabularies on the training split only (rare words and tags are left to <UNK>).
X_train = SequenceDataset(
    train_sents,
    fields={"word": "input_ids", "tag": "tags", "label": "labels"},
    vocab_params={"word": {"min_count": 3}, "tag": {"min_count": 3}},
    add_begin=True,
)
vocabs: Dict[str, Vocabulary] = X_train.vocabs
for field, vocab in vocabs.items():
    print(field, len(vocab.symbols_))
print(vocabs["label"].symbols_)
print("")

# The dev and test splits reuse the training vocabularies.
X_dev, X_test = (
    SequenceDataset(
        sents,
        fields={"word": "input_ids", "tag": "tags", "label": "labels"},
        vocabs=vocabs,
        add_begin=True,
    )
    for sents in (dev_sents, test_sents)
)
for field, elem in X_test[0].items():
    print(field, elem)

word 8684
tag 59
label 13
['<PAD>', '<UNK>', '<BEGIN>', '<END>', 'B-ORG', 'B-LOC', 'O', 'B-PER', 'I-PER', 'B-MISC', 'I-ORG', 'I-LOC', 'I-MISC']

input_ids tensor([   2,   59,  821,    7, 2383,   10,   12,  254,    5,   11],
       device='cuda:0')
tags tensor([ 2, 14,  4,  9, 10,  4,  7,  6,  5,  8], device='cuda:0')
labels tensor([ 2,  5, 11,  6,  6,  6,  6,  4,  6,  6], device='cuda:0')
mask tensor([0, 1, 1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0')
index 0


In [None]:
import numpy as np
from torch.utils.data import DataLoader

def collate_fn(samples, dtype=torch.int64, keys=None):
    """Right-pad per-sample tensors with zeros and stack them into a batch.

    Args:
        samples: List of item dicts; each must hold a 1-D tensor under
            ``"input_ids"`` and under every name in ``keys``, plus ``"index"``.
        dtype: Dtype of the zero padding.
        keys: Names of the tensors to pad and stack; defaults to
            ``["input_ids", "labels", "mask"]``.

    Returns:
        Dict with one stacked tensor per key and a NumPy array ``"index"``
        of the sample indices. All tensors are padded to the longest
        ``"input_ids"`` in the batch.
    """
    if keys is None:
        keys = ["input_ids", "labels", "mask"]
    device = samples[0]["input_ids"].device
    max_len = max(elem["input_ids"].shape[0] for elem in samples)

    answer = dict()
    for key in keys:
        answer[key] = torch.stack([
            torch.cat([
                elem[key],
                # Allocate the padding directly on the target device.
                torch.zeros(size=(max_len - len(elem[key]),), dtype=dtype, device=device),
            ]) for elem in samples
        ])

    answer["index"] = np.array([elem["index"] for elem in samples])
    return answer

In [None]:
from functools import partial

# Fix the seed so that the shuffled batch order below is reproducible.
torch.manual_seed(77)

# Collate variant that also pads and stacks the POS-tag tensors.
collate_fn_with_tags = partial(collate_fn, keys=["input_ids", "tags", "labels", "mask"])

train_dataloader = DataLoader(X_train, batch_size=16, shuffle=True, collate_fn=collate_fn_with_tags)
# Print the tensor shapes of the first ten batches.
# NOTE: `batch` stays defined after the loop and is reused by the
# overfitting sanity checks in later cells.
for i, batch in enumerate(train_dataloader):
    if i >= 10:
        break
    for key, value in batch.items():
        print(key, tuple(value.shape), end=" ")
    print("")

input_ids (16, 64) tags (16, 64) labels (16, 64) mask (16, 64) index (16,) 
input_ids (16, 69) tags (16, 69) labels (16, 69) mask (16, 69) index (16,) 
input_ids (16, 64) tags (16, 64) labels (16, 64) mask (16, 64) index (16,) 
input_ids (16, 64) tags (16, 64) labels (16, 64) mask (16, 64) index (16,) 
input_ids (16, 63) tags (16, 63) labels (16, 63) mask (16, 63) index (16,) 
input_ids (16, 65) tags (16, 65) labels (16, 65) mask (16, 65) index (16,) 
input_ids (16, 58) tags (16, 58) labels (16, 58) mask (16, 58) index (16,) 
input_ids (16, 65) tags (16, 65) labels (16, 65) mask (16, 65) index (16,) 
input_ids (16, 65) tags (16, 65) labels (16, 65) mask (16, 65) index (16,) 
input_ids (16, 57) tags (16, 57) labels (16, 57) mask (16, 57) index (16,) 


## MultilayerConvTagger

In [None]:
def make_activation(s):
    """Create an activation module from its (case-insensitive) name.

    Args:
        s: ``"tanh"``, ``"sigmoid"``, ``"relu"`` or ``None``.

    Returns:
        A new ``torch.nn`` activation module, or ``None`` when ``s`` is None.

    Raises:
        ValueError: If the name is not one of the supported activations.
    """
    # Check for None first: calling ``.lower()`` on it would raise
    # AttributeError and make the "no activation" case unreachable.
    if s is None:
        return None
    s = s.lower()
    factories = {
        "tanh": torch.nn.Tanh,
        "sigmoid": torch.nn.Sigmoid,
        "relu": torch.nn.ReLU,
    }
    if s in factories:
        return factories[s]()
    raise ValueError(f"Неизвестная активация {s}")

In [None]:
import torch.nn as nn

class BasicNeuralTagger(nn.Module):
    """Base class for sequence taggers: owns the loss, optimizer and batch steps.

    Subclasses implement :meth:`build_network` and :meth:`forward`.
    ``forward`` must return a dict with a ``"log_probs"`` entry, which
    :meth:`_validate` permutes with ``(0, 2, 1)`` before passing it to
    ``NLLLoss`` together with the labels.
    """

    def __init__(self, vocab_size, labels_number, lr=0.001,
                 device="cpu", **kwargs):
        """Build the network, move it to ``device`` and create the optimizer.

        Args:
            vocab_size: Size of the input vocabulary.
            labels_number: Number of output labels.
            lr: Learning rate of the Adam optimizer.
            device: Target device; ``None`` leaves the module where it is.
            **kwargs: Forwarded to :meth:`build_network`.
        """
        super().__init__()
        self.vocab_size = vocab_size
        self.labels_number = labels_number
        self.build_network(vocab_size, labels_number, **kwargs)
        self.criterion = nn.NLLLoss(reduction="mean")
        self.device = device
        if self.device is not None:
            self.to(self.device)
        # Created after ``self.to`` so it tracks the parameters on their final device.
        self.optimizer = torch.optim.Adam(self.parameters(), lr=lr)

    def build_network(self, vocab_size, labels_number, **kwargs):
        """Create the layers; must be overridden.

        The signature mirrors the call made in ``__init__`` so that a missing
        override surfaces as ``NotImplementedError`` and not as a ``TypeError``
        about unexpected positional arguments.
        """
        raise NotImplementedError("You should implement network construction in your derived class.")

    def forward(self, inputs):
        """Run the network; must be overridden."""
        raise NotImplementedError("You should implement forward pass in your derived class.")

    def train_on_batch(self, input_ids, labels, mask=None, **kwargs):
        """Do one optimization step on a batch and return the model output with ``"loss"``."""
        self.train()
        self.optimizer.zero_grad()
        batch_output = self._validate(input_ids, labels, mask=mask, **kwargs)
        batch_output["loss"].backward()
        self.optimizer.step()
        return batch_output

    def validate_on_batch(self, input_ids, labels, mask=None, **kwargs):
        """Compute the output and loss for a batch in eval mode without gradients."""
        self.eval()
        with torch.no_grad():
            return self._validate(input_ids, labels, mask=mask, **kwargs)

    def _validate(self, input_ids, labels, mask=None, **kwargs):
        """Run the forward pass and attach the loss to the output dict.

        Positions where ``mask`` is false get the label ``-100``, the default
        ``ignore_index`` of ``NLLLoss``, so they do not contribute to the loss.
        """
        if self.device is not None:
            input_ids, labels = input_ids.to(self.device), labels.to(self.device)
            if mask is not None:
                mask = mask.to(self.device)
        batch_output = self(input_ids, **kwargs)
        if mask is not None:
            labels = torch.where(mask.bool(), labels, -100)
        loss = self.criterion(batch_output["log_probs"].permute(0, 2, 1), labels)
        batch_output["loss"] = loss
        return batch_output

In [None]:
class MultilayerConvTagger(BasicNeuralTagger):
    """Tagger made of stacked 1-D convolution layers over word embeddings.

    Each layer runs one or more parallel convolutions (one per window width),
    concatenates their outputs along the channel axis and applies optional
    batch normalization, an optional activation and dropout.
    """

    def build_network(self, vocab_size, labels_number, embeddings_dim=32,
                      n_layers=1, window=5, n_hidden=128, dropout=0.0,
                      use_batch_norm=False, activation="relu"):
        """Create the embedding, the convolution stack and the output layer.

        Args:
            vocab_size: Number of rows of the embedding table (index 0 is padding).
            labels_number: Size of the output layer.
            embeddings_dim: Embedding dimension.
            n_layers: Number of convolution layers; must be at least 1.
            window: Window width(s). An int or a tuple is shared by all
                layers; any other value is indexed per layer as given.
            n_hidden: Output channel count(s), structured like ``window``.
            dropout: Dropout probability applied after every layer.
            use_batch_norm: Whether to batch-normalize every layer's output.
            activation: Activation name understood by ``make_activation``,
                or ``None`` for no activation.

        Raises:
            ValueError: If ``n_layers`` is smaller than 1.
        """
        if n_layers < 1:
            # Without at least one layer there is no output dimension for the
            # dense layer and nothing for ``forward`` to return.
            raise ValueError(f"n_layers must be at least 1, got {n_layers}")
        self.n_layers = n_layers
        if isinstance(n_hidden, int):
            n_hidden = (n_hidden,)
        if isinstance(n_hidden, tuple):
            n_hidden = [n_hidden] * self.n_layers
        self.n_hidden = n_hidden
        if isinstance(window, int):
            window = (window,)
        if isinstance(window, tuple):
            window = [window] * self.n_layers
        self.window = window
        self.use_batch_norm = use_batch_norm
        self.embedding = nn.Embedding(vocab_size, embeddings_dim, padding_idx=0)
        self.convolutions = nn.ModuleList()
        for i in range(self.n_layers):
            input_dim = output_dim if i > 0 else embeddings_dim
            convolutions = nn.ModuleList()
            output_dim = 0
            for n_out, width in zip(self.n_hidden[i], self.window[i]):
                # Odd widths keep the sequence length with symmetric padding.
                # For even widths ``(width-1)//2`` would shorten the output by
                # one position, so let PyTorch pad to the input length instead.
                padding = (width - 1) // 2 if width % 2 == 1 else "same"
                convolution = nn.Conv1d(input_dim, n_out, width, padding=padding)
                convolutions.append(convolution)
                output_dim += n_out
            layer = {"convolutions": convolutions}
            if activation is not None:
                layer["activation"] = make_activation(activation)
            layer["dropout"] = nn.Dropout(p=dropout)
            if self.use_batch_norm:
                layer["batch_norm"] = nn.BatchNorm1d(output_dim)
            self.convolutions.append(nn.ModuleDict(layer))
        self.dense = nn.Linear(output_dim, labels_number)
        self.log_softmax = nn.LogSoftmax(dim=-1)

    def forward(self, input_ids, **kwargs):
        """Compute per-position label log-probabilities and the argmax labels.

        Returns:
            Dict with ``"log_probs"`` (log-softmax over the last axis of the
            dense layer output) and ``"labels"`` (argmax over that axis).
            Extra keyword arguments are accepted and ignored.
        """
        if self.device is not None:
            input_ids = input_ids.to(self.device)
        embeddings = self.embedding(input_ids)
        # Conv1d expects channels before the sequence axis.
        conv_inputs = embeddings.permute([0, 2, 1])
        for layer in self.convolutions:
            conv_outputs = torch.cat(
                [convolution(conv_inputs) for convolution in layer["convolutions"]],
                dim=1,
            )
            if self.use_batch_norm:
                conv_outputs = layer["batch_norm"](conv_outputs)
            # The activation entry is absent when the tagger was built with activation=None.
            if "activation" in layer:
                conv_outputs = layer["activation"](conv_outputs)
            conv_outputs = layer["dropout"](conv_outputs)
            conv_inputs = conv_outputs
        conv_outputs = conv_inputs.permute([0, 2, 1])
        logits = self.dense(conv_outputs)
        log_probs = self.log_softmax(logits)
        _, labels = torch.max(log_probs, dim=-1)
        return {"log_probs": log_probs, "labels": labels}

In [None]:
# Build a two-layer convolutional tagger and list its parameters with device and shape.
conv_tagger_options = dict(
    vocab_size=len(vocabs["word"].symbols_),
    labels_number=len(vocabs["label"].symbols_),
    embeddings_dim=200,
    n_hidden=200,
    n_layers=2,
    dropout=0.1,
    device="cuda",
    use_batch_norm=True,
)
model = MultilayerConvTagger(**conv_tagger_options)
for name, elem in model.named_parameters():
    print(name, elem.device, elem.shape)

embedding.weight cuda:0 torch.Size([8684, 200])
convolutions.0.convolutions.0.weight cuda:0 torch.Size([200, 200, 5])
convolutions.0.convolutions.0.bias cuda:0 torch.Size([200])
convolutions.0.batch_norm.weight cuda:0 torch.Size([200])
convolutions.0.batch_norm.bias cuda:0 torch.Size([200])
convolutions.1.convolutions.0.weight cuda:0 torch.Size([200, 200, 5])
convolutions.1.convolutions.0.bias cuda:0 torch.Size([200])
convolutions.1.batch_norm.weight cuda:0 torch.Size([200])
convolutions.1.batch_norm.bias cuda:0 torch.Size([200])
dense.weight cuda:0 torch.Size([13, 200])
dense.bias cuda:0 torch.Size([13])


In [None]:
# Sanity check: the model should be able to overfit a single batch.
# `batch` is the variable left over from the batch-shape preview loop above.

for i in range(200):
    loss = model.train_on_batch(**batch)["loss"].item()
    # Log the first five steps, then every tenth one.
    if i < 5 or (i+1) % 10 == 0:
        print(i, loss)
# Loss on the same batch in eval mode.
print(model.validate_on_batch(**batch)["loss"].item())

0 2.687035083770752
1 1.327937126159668
2 0.8649840950965881
3 0.6684786677360535
4 0.5662751197814941
9 0.30133700370788574
19 0.08800061792135239
29 0.030742526054382324
39 0.015040313825011253
49 0.00956856831908226
59 0.007172673474997282
69 0.006047703791409731
79 0.004836590960621834
89 0.004527945537120104
99 0.003724752925336361
109 0.0031466262880712748
119 0.0027819257229566574
129 0.0024550361558794975
139 0.002481841016560793
149 0.00207214942201972
159 0.0019446939695626497
169 0.0018469032365828753
179 0.0016367083881050348
189 0.0016070399433374405
199 0.0015129883540794253
0.0010726029286161065


### Функции для обучения

> update_metrics, do_epoch, predict_with_model


In [None]:
from tqdm.auto import tqdm

def update_metrics(metrics, batch_output, batch_labels, mask=None):
    """Fold one batch into the running loss and accuracy statistics, in place.

    Args:
        metrics: Dict with the counters ``"correct"``, ``"total"``,
            ``"sent_correct"``, ``"sent_total"``, ``"loss"`` and
            ``"n_batches"``; ``"accuracy"`` and ``"sent_accuracy"`` are
            (re)written on every call.
        batch_output: Model output holding a scalar ``"loss"`` tensor and the
            predicted ``"labels"``.
        batch_labels: Gold labels with the same shape as the predictions.
        mask: Optional tensor marking the positions to score; when omitted,
            every position whose gold label is non-zero is scored.
    """
    n_batches = metrics["n_batches"]
    # Running mean of the per-batch losses.
    metrics["loss"] = (metrics["loss"] * n_batches + batch_output["loss"].item()) / (n_batches + 1)
    metrics["n_batches"] += 1
    if mask is not None:
        mask = mask.cpu().numpy().astype("int")
    else:
        mask = (batch_labels != 0).cpu().numpy().astype("int")
    predicted = batch_output["labels"]
    # Compare on the predictions' device so CPU labels work with a GPU model.
    are_equal = (predicted == batch_labels.to(predicted.device)).cpu().numpy().astype("int")
    metrics["correct"] += (are_equal * mask).sum()
    metrics["total"] += mask.sum()
    # A sentence is correct when every scored position matches; positions
    # outside the mask are forced to 1 so they cannot fail the minimum.
    are_seq_correct = np.min(np.maximum(are_equal, 1-mask), axis=1)
    metrics["sent_correct"] += are_seq_correct.sum()
    metrics["sent_total"] += mask.shape[0]
    metrics["accuracy"] = metrics["correct"] / max(metrics["total"], 1)
    metrics["sent_accuracy"] = metrics["sent_correct"] / max(metrics["sent_total"], 1)

def do_epoch(model, dataloader, mode="validate", epoch=1):
    """Run the model over every batch of ``dataloader`` and return the metrics.

    With ``mode == "train"`` each batch goes through ``model.train_on_batch``;
    any other mode uses ``model.validate_on_batch``. The running loss and
    accuracies are shown on the progress bar after every batch.
    """
    metrics = dict(correct=0, total=0, sent_correct=0, sent_total=0, loss=0.0, n_batches=0)
    if mode == "train":
        step = model.train_on_batch
    else:
        step = model.validate_on_batch
    progress_bar = tqdm(dataloader, leave=True)
    progress_bar.set_description(f"{mode}, epoch={epoch}")
    for batch in progress_bar:
        batch_output = step(**batch)
        update_metrics(metrics, batch_output, batch["labels"], mask=batch["mask"])
        postfix = {
            "loss": round(metrics["loss"], 4),
            "acc": round(100 * metrics["accuracy"], 2),
            "sent_acc": round(100 * metrics["sent_accuracy"], 2),
        }
        progress_bar.set_postfix(postfix)
    return metrics

def predict_with_model(model, X: SequenceDataset, batch_size=32):
    """Predict label names for every sentence of ``X``.

    The model is switched to eval mode and run batch by batch without
    gradients. For each sentence the predicted label ids at masked-in
    positions are translated to label names through the ``"label"``
    vocabulary of ``X``.

    Returns:
        A list aligned with ``X`` holding one array of label names per sentence.
    """
    model.eval()
    loader = DataLoader(X, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
    answer = [None] * len(X)
    with torch.no_grad():
        for batch in loader:
            predicted = model(**batch)["labels"].cpu().numpy()
            keep = batch["mask"].bool().cpu().numpy()
            for index, curr_labels, curr_mask in zip(batch["index"], predicted, keep):
                answer[index] = np.take(X.vocabs["label"].symbols_, curr_labels[curr_mask])
    return answer

## Обучение модели

In [None]:
# Only the training batches are shuffled; dev and test keep their order.
train_dataloader = DataLoader(X_train, batch_size=16, shuffle=True, collate_fn=collate_fn_with_tags)
dev_dataloader = DataLoader(X_dev, batch_size=16, shuffle=False, collate_fn=collate_fn_with_tags)
test_dataloader = DataLoader(X_test, batch_size=16, shuffle=False, collate_fn=collate_fn_with_tags)
NEPOCHS = 10

# Start from a freshly initialized model (the one overfitted on a single batch is discarded).
model = MultilayerConvTagger(
            vocab_size=len(vocabs["word"].symbols_),
            labels_number=len(vocabs["label"].symbols_),
            embeddings_dim=200, n_hidden=200,
            n_layers=2, dropout=0.1, device="cuda",
            use_batch_norm=True
)
best_val_acc = 0.0
# NOTE(review): the RNN training cell writes to the same file name, so this
# checkpoint is overwritten there.
checkpoint = "checkpoint_best.pt"
for epoch in range(NEPOCHS):
    do_epoch(model, train_dataloader, mode="train", epoch=epoch+1)
    epoch_metrics = do_epoch(model, dev_dataloader, mode="validate", epoch=epoch+1)
    # Keep the weights of the epoch with the best dev token accuracy.
    if epoch_metrics["accuracy"] > best_val_acc:
        best_val_acc = epoch_metrics["accuracy"]
        torch.save(model.state_dict(), checkpoint)

# Restore the best weights and evaluate; only the test metrics (last expression) are displayed.
model.load_state_dict(torch.load(checkpoint))
do_epoch(model, dev_dataloader, mode="validate", epoch="evaluate")
do_epoch(model, test_dataloader, mode="validate", epoch="evaluate")

  0%|          | 0/453 [00:00<?, ?it/s]

  0%|          | 0/103 [00:00<?, ?it/s]

  0%|          | 0/453 [00:00<?, ?it/s]

  0%|          | 0/103 [00:00<?, ?it/s]

  0%|          | 0/453 [00:00<?, ?it/s]

  0%|          | 0/103 [00:00<?, ?it/s]

  0%|          | 0/453 [00:00<?, ?it/s]

  0%|          | 0/103 [00:00<?, ?it/s]

  0%|          | 0/453 [00:00<?, ?it/s]

  0%|          | 0/103 [00:00<?, ?it/s]

  0%|          | 0/453 [00:00<?, ?it/s]

  0%|          | 0/103 [00:00<?, ?it/s]

  0%|          | 0/453 [00:00<?, ?it/s]

  0%|          | 0/103 [00:00<?, ?it/s]

  0%|          | 0/453 [00:00<?, ?it/s]

  0%|          | 0/103 [00:00<?, ?it/s]

  0%|          | 0/453 [00:00<?, ?it/s]

  0%|          | 0/103 [00:00<?, ?it/s]

  0%|          | 0/453 [00:00<?, ?it/s]

  0%|          | 0/103 [00:00<?, ?it/s]

  0%|          | 0/103 [00:00<?, ?it/s]

  0%|          | 0/85 [00:00<?, ?it/s]

{'correct': 49151,
 'total': 51371,
 'sent_correct': 576,
 'sent_total': 1355,
 'loss': 0.17098603303379872,
 'n_batches': 85,
 'accuracy': 0.9567849564929629,
 'sent_accuracy': 0.42509225092250924}

In [None]:
# Compare gold and predicted labels token by token for one test sentence.
predictions = predict_with_model(model, X_test)
gold_sentence, predicted_labels = test_sents[50], predictions[50]
for elem, label in zip(gold_sentence, predicted_labels):
    print(elem["word"], elem["label"], label)
print(predicted_labels)

La O O
Organización B-ORG B-ORG
Mundial I-ORG I-ORG
del I-ORG I-ORG
Comercio I-ORG I-ORG
( O O
OMC B-ORG B-ORG
) O O
dio O O
hoy, O O
martes O O
, O O
a O O
Chile B-LOC B-LOC
de O O
plazo O O
hasta O O
el O O
21 O O
de O O
marzo O O
de O O
2001 O O
para O O
modificar O O
su O O
legislación O O
sobre O O
impuestos O O
a O O
las O O
bebidas O O
alcohólicas O O
que O O
fue O O
denunciada O O
por O O
la O O
Unión B-ORG B-ORG
Europea I-ORG I-ORG
y O O
declarada O O
ilegal O O
. O O
['O' 'B-ORG' 'I-ORG' 'I-ORG' 'I-ORG' 'O' 'B-ORG' 'O' 'O' 'O' 'O' 'O' 'O'
 'B-LOC' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O'
 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'B-ORG' 'I-ORG' 'O' 'O' 'O' 'O']


In [None]:
from datasets import load_metric

# The seqeval metric is a script that gets downloaded and executed; the
# library's own warning asks for `trust_remote_code=True` and announces that
# it will become mandatory.
metric = load_metric('seqeval', trust_remote_code=True)
# Gold label sequences, aligned with `predictions` from the previous cell.
corr_labels = [[elem['label'] for elem in sent] for sent in test_sents]
results = metric.compute(references=corr_labels, predictions=predictions)
for key, value in results.items():
    print(key, value)

  metric = load_metric('seqeval')
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


LOC {'precision': 0.7172827172827173, 'recall': 0.6623616236162362, 'f1': 0.6887290167865708, 'number': 1084}
MISC {'precision': 0.3567073170731707, 'recall': 0.34411764705882353, 'f1': 0.3502994011976048, 'number': 340}
ORG {'precision': 0.6953592814371258, 'recall': 0.6635714285714286, 'f1': 0.679093567251462, 'number': 1400}
PER {'precision': 0.6922060766182299, 'recall': 0.7129251700680272, 'f1': 0.7024128686327078, 'number': 735}
overall_precision 0.668614845119813
overall_recall 0.6428772127001967
overall_f1 0.6554934823091249
overall_accuracy 0.9567849564929629


In [None]:
# The corpus treats punctuation as separate tokens, so the final period has
# to be split off: a fused "Londres." is out-of-vocabulary for the model.
words = "Tony Blair es el Primer Ministro de Gran Bretaña y vive en Londres .".split()
# words = "En el Festival de Cine de Venecia de este año se presentará una película basada en el libro del famoso escritor americano Jack London \"Martin Eden\".".split()
# words = "Después de la llegada al poder de Napoleón I y la proclamación del Primer Imperio, la Marsellesa perdió su estatus como himno nacional de Francia .".split()
# words = "Elon Musk es el CEO de Tesla y SpaceX .".split()
# words = "Los bancos regionales de EE. UU. siguen bajo presión un año después del colapso de Silicon Valley Bank .".split()

# Tags and labels are unknown for free text; empty placeholders keep the record format.
sent = [{"word": word, "tag": "", "label": ""} for word in words]
X_sent = SequenceDataset([sent], fields={"word": "input_ids", "tag": "tags", "label": "labels"}, vocabs=X_train.vocabs, add_begin=X_train.add_begin)
y_sent_pred = predict_with_model(model, X_sent)
for word, label in zip(words, y_sent_pred[0]):
    print(word, label)

Tony O
Blair O
es O
el O
Primer O
Ministro I-ORG
de O
Gran B-LOC
Bretaña I-LOC
y O
vive O
en O
Londres. O


## Рекуррентная сеть

In [None]:
class MultilayerRNNTagger(BasicNeuralTagger):
    """Tagger built from a bidirectional multi-layer GRU over word embeddings."""

    def build_network(self, vocab_size, labels_number, embeddings_dim=32,
                      n_layers=1, n_hidden=128, dropout=0.0):
        """Create the embedding, the GRU stack and the output layer.

        Args:
            vocab_size: Number of rows of the embedding table (index 0 is padding).
            labels_number: Size of the output layer.
            embeddings_dim: Embedding dimension.
            n_layers: Number of stacked GRU layers.
            n_hidden: Hidden size of each GRU direction.
            dropout: Dropout probability, used between GRU layers and on the
                GRU output.
        """
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.embedding = nn.Embedding(vocab_size, embeddings_dim, padding_idx=0)
        # GRU dropout acts only between stacked layers; passing a non-zero
        # value for a single layer has no effect and makes PyTorch warn.
        rnn_dropout = dropout if self.n_layers > 1 else 0.0
        self.rnn = torch.nn.GRU(embeddings_dim, self.n_hidden, self.n_layers,
                                batch_first=True, bidirectional=True, dropout=rnn_dropout)
        self.dropout = torch.nn.Dropout(dropout)
        # Both directions are concatenated, hence twice the hidden size.
        self.dense = nn.Linear(2*self.n_hidden, labels_number)
        self.log_softmax = nn.LogSoftmax(dim=-1)

    def forward(self, input_ids, **kwargs):
        """Compute per-position label log-probabilities and the argmax labels.

        Returns:
            Dict with ``"log_probs"`` (log-softmax over the last axis of the
            dense layer output) and ``"labels"`` (argmax over that axis).
            Extra keyword arguments are accepted and ignored.
        """
        if self.device is not None:
            input_ids = input_ids.to(self.device)
        embeddings = self.embedding(input_ids)
        # Only the per-position outputs are needed; the final state is dropped.
        rnn_outputs, _ = self.rnn(embeddings)
        rnn_outputs = self.dropout(rnn_outputs)
        logits = self.dense(rnn_outputs)
        log_probs = self.log_softmax(logits)
        _, labels = torch.max(log_probs, dim=-1)
        return {"log_probs": log_probs, "labels": labels}

In [None]:
# Build a three-layer bidirectional GRU tagger and list its parameters with device and shape.
rnn_tagger_options = dict(
    vocab_size=len(vocabs["word"].symbols_),
    labels_number=len(vocabs["label"].symbols_),
    embeddings_dim=300,
    n_hidden=128,
    n_layers=3,
    dropout=0.1,
    device="cuda",
)
model = MultilayerRNNTagger(**rnn_tagger_options)
for name, elem in model.named_parameters():
    print(name, elem.device, elem.shape)

embedding.weight cuda:0 torch.Size([8684, 300])
rnn.weight_ih_l0 cuda:0 torch.Size([384, 300])
rnn.weight_hh_l0 cuda:0 torch.Size([384, 128])
rnn.bias_ih_l0 cuda:0 torch.Size([384])
rnn.bias_hh_l0 cuda:0 torch.Size([384])
rnn.weight_ih_l0_reverse cuda:0 torch.Size([384, 300])
rnn.weight_hh_l0_reverse cuda:0 torch.Size([384, 128])
rnn.bias_ih_l0_reverse cuda:0 torch.Size([384])
rnn.bias_hh_l0_reverse cuda:0 torch.Size([384])
rnn.weight_ih_l1 cuda:0 torch.Size([384, 256])
rnn.weight_hh_l1 cuda:0 torch.Size([384, 128])
rnn.bias_ih_l1 cuda:0 torch.Size([384])
rnn.bias_hh_l1 cuda:0 torch.Size([384])
rnn.weight_ih_l1_reverse cuda:0 torch.Size([384, 256])
rnn.weight_hh_l1_reverse cuda:0 torch.Size([384, 128])
rnn.bias_ih_l1_reverse cuda:0 torch.Size([384])
rnn.bias_hh_l1_reverse cuda:0 torch.Size([384])
rnn.weight_ih_l2 cuda:0 torch.Size([384, 256])
rnn.weight_hh_l2 cuda:0 torch.Size([384, 128])
rnn.bias_ih_l2 cuda:0 torch.Size([384])
rnn.bias_hh_l2 cuda:0 torch.Size([384])
rnn.weight_ih_l2_r

In [None]:
# Sanity check: train repeatedly on one batch (`batch` is left over from an
# earlier cell) and watch the loss.
for i in range(50):
    loss = model.train_on_batch(**batch)["loss"].item()
    # Log the first five steps, then every tenth one.
    if i < 5 or (i+1) % 10 == 0:
        print(i, loss)
# Loss on the same batch in eval mode.
print(model.validate_on_batch(**batch)["loss"].item())

0 2.5853281021118164
1 2.058493137359619
2 1.5331416130065918
3 1.0337775945663452
4 0.7967306971549988
9 0.8256314396858215
19 0.5500664710998535
29 0.4121440351009369
39 0.25058653950691223
49 0.12409147620201111
0.11037540435791016


In [None]:
# Only the training batches are shuffled; dev and test keep their order.
train_dataloader = DataLoader(X_train, batch_size=16, shuffle=True, collate_fn=collate_fn_with_tags)
dev_dataloader = DataLoader(X_dev, batch_size=16, shuffle=False, collate_fn=collate_fn_with_tags)
test_dataloader = DataLoader(X_test, batch_size=16, shuffle=False, collate_fn=collate_fn_with_tags)
NEPOCHS = 10

# Start from a freshly initialized model; note the larger hidden size (256)
# compared with the preview model above.
model = MultilayerRNNTagger(vocab_size=len(vocabs["word"].symbols_),
                            labels_number=len(vocabs["label"].symbols_),
                            embeddings_dim=300,
                            n_layers=3, dropout=0.1,
                            n_hidden=256,
                            device="cuda")
best_val_acc = 0.0
# NOTE(review): same file name as in the convolutional training cell, so the
# earlier checkpoint is overwritten here.
checkpoint = "checkpoint_best.pt"
for epoch in range(NEPOCHS):
    do_epoch(model, train_dataloader, mode="train", epoch=epoch+1)
    epoch_metrics = do_epoch(model, dev_dataloader, mode="validate", epoch=epoch+1)
    # Keep the weights of the epoch with the best dev token accuracy.
    if epoch_metrics["accuracy"] > best_val_acc:
        best_val_acc = epoch_metrics["accuracy"]
        torch.save(model.state_dict(), checkpoint)

# Restore the best weights and evaluate; only the test metrics (last expression) are displayed.
model.load_state_dict(torch.load(checkpoint))
do_epoch(model, dev_dataloader, mode="validate", epoch="evaluate")
do_epoch(model, test_dataloader, mode="validate", epoch="evaluate")

  0%|          | 0/453 [00:00<?, ?it/s]

  0%|          | 0/103 [00:00<?, ?it/s]

  0%|          | 0/453 [00:00<?, ?it/s]

  0%|          | 0/103 [00:00<?, ?it/s]

  0%|          | 0/453 [00:00<?, ?it/s]

  0%|          | 0/103 [00:00<?, ?it/s]

  0%|          | 0/453 [00:00<?, ?it/s]

  0%|          | 0/103 [00:00<?, ?it/s]

  0%|          | 0/453 [00:00<?, ?it/s]

  0%|          | 0/103 [00:00<?, ?it/s]

  0%|          | 0/453 [00:00<?, ?it/s]

  0%|          | 0/103 [00:00<?, ?it/s]

  0%|          | 0/453 [00:00<?, ?it/s]

  0%|          | 0/103 [00:00<?, ?it/s]

  0%|          | 0/453 [00:00<?, ?it/s]

  0%|          | 0/103 [00:00<?, ?it/s]

  0%|          | 0/453 [00:00<?, ?it/s]

  0%|          | 0/103 [00:00<?, ?it/s]

  0%|          | 0/453 [00:00<?, ?it/s]

  0%|          | 0/103 [00:00<?, ?it/s]

  0%|          | 0/103 [00:00<?, ?it/s]

  0%|          | 0/85 [00:00<?, ?it/s]

{'correct': 49422,
 'total': 51371,
 'sent_correct': 638,
 'sent_total': 1355,
 'loss': 0.16065171588869656,
 'n_batches': 85,
 'accuracy': 0.9620603063985517,
 'sent_accuracy': 0.4708487084870849}

In [None]:
from datasets import load_metric

# The seqeval metric reports precision/recall/F1 per entity type plus overall scores.
# `trust_remote_code=True` is passed explicitly: the loader warns that it will become
# mandatory, and passing it silences that warning.
metric = load_metric('seqeval', trust_remote_code=True)
# Gold label sequences, one list of label strings per test sentence.
corr_labels = [[elem['label'] for elem in sent] for sent in test_sents]
predictions = predict_with_model(model, X_test)
results = metric.compute(references=corr_labels, predictions=predictions)
for key, value in results.items():
    print(key, value)

  metric = load_metric('seqeval')
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

LOC {'precision': 0.7780104712041885, 'recall': 0.6854243542435424, 'f1': 0.7287886218734674, 'number': 1084}
MISC {'precision': 0.45302013422818793, 'recall': 0.39705882352941174, 'f1': 0.4231974921630094, 'number': 340}
ORG {'precision': 0.7655763239875389, 'recall': 0.7021428571428572, 'f1': 0.7324888226527572, 'number': 1400}
PER {'precision': 0.7589041095890411, 'recall': 0.7537414965986394, 'f1': 0.7563139931740614, 'number': 735}
overall_precision 0.7392102846648301
overall_recall 0.6785613936499016
overall_f1 0.7075886317023147
overall_accuracy 0.9619824414552958


In [None]:
# Manual check on one hand-written sentence; uncomment exactly one `words` line to switch.
# Tokenisation is plain whitespace splitting, so punctuation stays glued to a word unless
# the sentence text separates it with a space.
# words = "Tony Blair es el Primer Ministro de Gran Bretaña y vive en Londres.".split()
# words = "En el Festival de Cine de Venecia de este año se presentará una película basada en el libro del famoso escritor americano Jack London \"Martin Eden\".".split()
# words = "Después de la llegada al poder de Napoleón I y la proclamación del Primer Imperio, la Marsellesa perdió su estatus como himno nacional de Francia .".split()
# words = "Elon Musk es el CEO de Tesla y SpaceX .".split()
words = "Los bancos regionales de EE. UU. siguen bajo presión un año después del colapso de Silicon Valley Bank .".split()

# Tags and labels are unknown for raw text, so empty-string placeholders are used
# (presumably mapped to the unknown symbol by the vocabularies -- TODO confirm).
sent = [{"word": word, "tag": "", "label": ""} for word in words]
# Reuse the training vocabularies so that indices match what the model was trained on.
X_sent = SequenceDataset([sent], fields={"word": "input_ids", "tag": "tags", "label": "labels"}, vocabs=X_train.vocabs, add_begin=X_train.add_begin)
y_sent_pred = predict_with_model(model, X_sent)
for word, label in zip(words, y_sent_pred[0]):
    print(word, label)

Los O
bancos O
regionales O
de O
EE. O
UU. O
siguen O
bajo O
presión O
un O
año O
después O
del O
colapso O
de O
Silicon O
Valley I-MISC
Bank O
. O


# Улучшенная версия

## Добавление тегов

Для начала попробуем модель с добавлением частеречных тегов.

In [None]:
class MultilayerRNNTaggerWithTags(BasicNeuralTagger):
    """Bidirectional GRU tagger fed with concatenated word and tag embeddings."""

    def build_network(self, vocab_size, labels_number, tag_vocab_size,
                      embeddings_dim=32, tag_embeddings_dim=32,
                      n_layers=1, n_hidden=128, dropout=0.0):
        """Create the layers of the network.

        Args:
            vocab_size: number of rows of the word embedding table.
            labels_number: size of the output label set.
            tag_vocab_size: number of rows of the tag embedding table.
            embeddings_dim: word embedding size.
            tag_embeddings_dim: tag embedding size.
            n_layers: number of stacked GRU layers.
            n_hidden: GRU hidden size per direction.
            dropout: dropout rate used between GRU layers and on the GRU outputs.
        """
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        # Index 0 is the padding index in both embedding tables.
        self.embedding = nn.Embedding(vocab_size, embeddings_dim, padding_idx=0)
        self.tag_embedding = nn.Embedding(tag_vocab_size, tag_embeddings_dim, padding_idx=0)
        # GRU applies its own dropout only between stacked layers; with a single layer
        # a non-zero value has no effect and only makes PyTorch emit a warning.
        rnn_dropout = dropout if self.n_layers > 1 else 0.0
        self.rnn = torch.nn.GRU(embeddings_dim+tag_embeddings_dim, self.n_hidden, self.n_layers,
                                batch_first=True, bidirectional=True, dropout=rnn_dropout)
        self.dropout = torch.nn.Dropout(dropout)
        # The bidirectional GRU yields 2 * n_hidden features per position.
        self.dense = nn.Linear(2*self.n_hidden, labels_number)
        self.log_softmax = nn.LogSoftmax(dim=-1)

    def forward(self, input_ids, tags, **kwargs):
        """Return per-position log-probabilities and argmax label indices.

        Args:
            input_ids: word indices; moved to `self.device` when it is set.
            tags: tag indices aligned with `input_ids`.
            **kwargs: extra batch entries, ignored here.

        Returns:
            dict with "log_probs" (log-softmax over labels) and "labels" (argmax indices).
        """
        if self.device is not None:
            input_ids = input_ids.to(self.device)
            tags = tags.to(self.device)
        embeddings = self.embedding(input_ids)
        tag_embeddings = self.tag_embedding(tags)
        # Concatenate along the feature axis so each position carries both embeddings.
        all_embeddings = torch.cat([embeddings, tag_embeddings], dim=2)
        rnn_outputs, _ = self.rnn(all_embeddings)
        rnn_outputs = self.dropout(rnn_outputs)
        logits = self.dense(rnn_outputs)
        log_probs = self.log_softmax(logits)
        _, labels = torch.max(log_probs, dim=-1)
        return {"log_probs": log_probs, "labels": labels}

In [None]:
# Instantiate the tag-aware tagger and list every parameter with its device and shape.
model = MultilayerRNNTaggerWithTags(
            vocab_size=len(vocabs["word"].symbols_),
            labels_number=len(vocabs["label"].symbols_),
            tag_vocab_size=len(vocabs["tag"].symbols_),
            embeddings_dim=300, n_hidden=128, tag_embeddings_dim=100,
            n_layers=2, dropout=0.1, device="cuda"
)
for name, elem in model.named_parameters():
    print(name, elem.device, elem.shape)

embedding.weight cuda:0 torch.Size([8684, 300])
tag_embedding.weight cuda:0 torch.Size([59, 100])
rnn.weight_ih_l0 cuda:0 torch.Size([384, 400])
rnn.weight_hh_l0 cuda:0 torch.Size([384, 128])
rnn.bias_ih_l0 cuda:0 torch.Size([384])
rnn.bias_hh_l0 cuda:0 torch.Size([384])
rnn.weight_ih_l0_reverse cuda:0 torch.Size([384, 400])
rnn.weight_hh_l0_reverse cuda:0 torch.Size([384, 128])
rnn.bias_ih_l0_reverse cuda:0 torch.Size([384])
rnn.bias_hh_l0_reverse cuda:0 torch.Size([384])
rnn.weight_ih_l1 cuda:0 torch.Size([384, 256])
rnn.weight_hh_l1 cuda:0 torch.Size([384, 128])
rnn.bias_ih_l1 cuda:0 torch.Size([384])
rnn.bias_hh_l1 cuda:0 torch.Size([384])
rnn.weight_ih_l1_reverse cuda:0 torch.Size([384, 256])
rnn.weight_hh_l1_reverse cuda:0 torch.Size([384, 128])
rnn.bias_ih_l1_reverse cuda:0 torch.Size([384])
rnn.bias_hh_l1_reverse cuda:0 torch.Size([384])
dense.weight cuda:0 torch.Size([13, 256])
dense.bias cuda:0 torch.Size([13])


In [None]:
# Sanity check: train repeatedly on the same `batch` (left over from an earlier cell)
# and print the loss for the first five steps and then every tenth step.
for i in range(50):
    loss = model.train_on_batch(**batch)["loss"].item()
    if i < 5 or (i+1) % 10 == 0:
        print(i, loss)
# Loss on the same batch as reported by the validation routine.
print(model.validate_on_batch(**batch)["loss"].item())

0 2.581226348876953
1 2.043376922607422
2 1.532670497894287
3 1.0817571878433228
4 0.8224672079086304
9 0.8016092777252197
19 0.5088038444519043
29 0.2927111089229584
39 0.13878215849399567
49 0.06240543723106384
0.05438442528247833


In [None]:
# Data loaders that also batch the "tags" field; only the training split is shuffled.
train_dataloader = DataLoader(X_train, batch_size=16, shuffle=True, collate_fn=collate_fn_with_tags)
dev_dataloader = DataLoader(X_dev, batch_size=16, shuffle=False, collate_fn=collate_fn_with_tags)
test_dataloader = DataLoader(X_test, batch_size=16, shuffle=False, collate_fn=collate_fn_with_tags)
NEPOCHS = 10

# NOTE(review): with n_layers=1 the `dropout` value handed to the GRU is inactive
# (PyTorch applies it only between stacked layers); only the output Dropout is used.
model = MultilayerRNNTaggerWithTags(
                            vocab_size=len(vocabs["word"].symbols_),
                            labels_number=len(vocabs["label"].symbols_),
                            tag_vocab_size=len(vocabs["tag"].symbols_),
                            embeddings_dim=200, tag_embeddings_dim=100,
                            n_layers=1, dropout=0.1,
                            n_hidden=192,
                            device="cuda")
# Model selection: keep the weights of the epoch with the highest validation accuracy.
best_val_acc = 0.0
checkpoint = "checkpoint_best.pt"
for epoch in range(NEPOCHS):
    do_epoch(model, train_dataloader, mode="train", epoch=epoch+1)
    epoch_metrics = do_epoch(model, dev_dataloader, mode="validate", epoch=epoch+1)
    # The checkpoint is only written when the validation accuracy strictly improves.
    if epoch_metrics["accuracy"] > best_val_acc:
        best_val_acc = epoch_metrics["accuracy"]
        torch.save(model.state_dict(), checkpoint)

# Restore the best weights, then evaluate on the dev and test splits.
model.load_state_dict(torch.load(checkpoint))
do_epoch(model, dev_dataloader, mode="validate", epoch="evaluate")
do_epoch(model, test_dataloader, mode="validate", epoch="evaluate")



  0%|          | 0/453 [00:00<?, ?it/s]

  0%|          | 0/103 [00:00<?, ?it/s]

  0%|          | 0/453 [00:00<?, ?it/s]

  0%|          | 0/103 [00:00<?, ?it/s]

  0%|          | 0/453 [00:00<?, ?it/s]

  0%|          | 0/103 [00:00<?, ?it/s]

  0%|          | 0/453 [00:00<?, ?it/s]

  0%|          | 0/103 [00:00<?, ?it/s]

  0%|          | 0/453 [00:00<?, ?it/s]

  0%|          | 0/103 [00:00<?, ?it/s]

  0%|          | 0/453 [00:00<?, ?it/s]

  0%|          | 0/103 [00:00<?, ?it/s]

  0%|          | 0/453 [00:00<?, ?it/s]

  0%|          | 0/103 [00:00<?, ?it/s]

  0%|          | 0/453 [00:00<?, ?it/s]

  0%|          | 0/103 [00:00<?, ?it/s]

  0%|          | 0/453 [00:00<?, ?it/s]

  0%|          | 0/103 [00:00<?, ?it/s]

  0%|          | 0/453 [00:00<?, ?it/s]

  0%|          | 0/103 [00:00<?, ?it/s]

  0%|          | 0/103 [00:00<?, ?it/s]

  0%|          | 0/85 [00:00<?, ?it/s]

{'correct': 49420,
 'total': 51371,
 'sent_correct': 663,
 'sent_total': 1355,
 'loss': 0.1693711639765431,
 'n_batches': 85,
 'accuracy': 0.9620213739269238,
 'sent_accuracy': 0.48929889298892987}

In [None]:
def predict_with_model_with_tags(model, X: SequenceDataset, batch_size=32):
    """Predict label strings for every sample of `X` with a tag-aware model.

    The model is switched to eval mode, batches are built with
    `collate_fn_with_tags`, and predicted label indices are mapped back to
    symbols of `X.vocabs["label"]`. Only positions where the batch mask is
    true are kept.

    Returns:
        A list with one array of label symbols per sample, ordered by the
        sample index inside `X`.
    """
    model.eval()
    predictions = [None] * len(X)
    loader = DataLoader(X, batch_size=batch_size, shuffle=False, collate_fn=collate_fn_with_tags)
    for batch in loader:
        with torch.no_grad():
            outputs = model(**batch)
        predicted_ids = outputs["labels"].cpu().numpy()
        keep_masks = batch["mask"].bool().cpu().numpy()
        for sample_index, sample_ids, sample_keep in zip(batch["index"], predicted_ids, keep_masks):
            # Boolean indexing drops padded (masked-out) positions.
            kept_ids = sample_ids[sample_keep]
            predictions[sample_index] = np.take(X.vocabs["label"].symbols_, kept_ids)
    return predictions

In [None]:
# The seqeval metric reports precision/recall/F1 per entity type plus overall scores.
# `trust_remote_code=True` is passed explicitly: the loader warns that it will become
# mandatory, and passing it silences that warning.
metric = load_metric('seqeval', trust_remote_code=True)
# Gold label sequences, one list of label strings per test sentence.
corr_labels = [[elem['label'] for elem in sent] for sent in test_sents]
predictions = predict_with_model_with_tags(model, X_test)
results = metric.compute(references=corr_labels, predictions=predictions)
for key, value in results.items():
    print(key, value)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


LOC {'precision': 0.7893544733861835, 'recall': 0.6429889298892989, 'f1': 0.7086934417895273, 'number': 1084}
MISC {'precision': 0.4391891891891892, 'recall': 0.38235294117647056, 'f1': 0.4088050314465408, 'number': 340}
ORG {'precision': 0.7064093728463129, 'recall': 0.7321428571428571, 'f1': 0.7190459487898981, 'number': 1400}
PER {'precision': 0.7703225806451612, 'recall': 0.8122448979591836, 'f1': 0.790728476821192, 'number': 735}
overall_precision 0.7192364170337738
overall_recall 0.6881146389435234
overall_f1 0.7033314187248707
overall_accuracy 0.9620213739269238


In [None]:
# Manual check on one hand-written sentence; uncomment exactly one `words` line to switch.
# words = "Tony Blair es el Primer Ministro de Gran Bretaña y vive en Londres.".split()
# words = "En el Festival de Cine de Venecia de este año se presentará una película basada en el libro del famoso escritor americano Jack London \"Martin Eden\".".split()
words = "Después de la llegada al poder de Napoleón I y la proclamación del Primer Imperio, la Marsellesa perdió su estatus como himno nacional de Francia .".split()
# words = "Elon Musk es el CEO de Tesla y SpaceX .".split()
# words = "Los bancos regionales de EE. UU. siguen bajo presión un año después del colapso de Silicon Valley Bank .".split()

# NOTE(review): every tag is the empty string here, so the tag-aware model receives no
# real part-of-speech information for this sentence (presumably all tags map to the
# unknown symbol -- TODO confirm against the tag vocabulary).
sent = [{"word": word, "tag": "", "label": ""} for word in words]
# Reuse the training vocabularies so that indices match what the model was trained on.
X_sent = SequenceDataset([sent], fields={"word": "input_ids", "tag": "tags", "label": "labels"}, vocabs=X_train.vocabs, add_begin=X_train.add_begin)
y_sent_pred = predict_with_model_with_tags(model, X_sent)
for word, label in zip(words, y_sent_pred[0]):
    print(word, label)

Después O
de O
la O
llegada O
al O
poder O
de O
Napoleón O
I O
y O
la O
proclamación O
del O
Primer O
Imperio, O
la O
Marsellesa O
perdió O
su O
estatus O
como O
himno O
nacional O
de O
Francia B-LOC
. O


## Посимвольная сеть

Качество у модели с тегами незначительно отличается от результатов рекуррентной сети, поэтому для улучшения качества используем посимвольную сеть. Вероятно, информация о заглавных буквах принесёт больший прирост качества, чем информация о частях речи.

In [None]:
class SymbolVocabulary:
    """Character-level vocabulary: encodes every word as a list of symbol indices.

    Indices 0-3 are reserved for "<PAD>", "<UNK>", "<BEGIN>" and "<END>".
    """

    def __init__(self, add_begin=True, add_end=True, min_count=1):
        """
        Args:
            add_begin: prepend a one-symbol "<BEGIN>" word to every encoded text.
            add_end: append a one-symbol "<END>" word to every encoded text.
            min_count: minimal number of words a symbol must occur in to be kept.
        """
        self.add_begin = add_begin
        self.add_end = add_end
        self.min_count = min_count

    def fit(self, data):
        """Collect symbols from `data` (an iterable of texts, each a sequence of words).

        A symbol is counted once per word containing it. Symbols are indexed in
        order of first occurrence, so the mapping is reproducible between runs.

        Returns:
            self
        """
        self.symbols_ = ["<PAD>", "<UNK>", "<BEGIN>", "<END>"]
        symbol_counts = defaultdict(int)
        for text in data:
            for word in text:
                # dict.fromkeys de-duplicates like set() but keeps first-occurrence
                # order; iterating a set of strings depends on hash randomisation.
                for letter in dict.fromkeys(word):
                    symbol_counts[letter] += 1
        self.symbols_ += [letter for letter, count in symbol_counts.items() if count >= self.min_count]
        self.symbol_codes_ = {letter: index for index, letter in enumerate(self.symbols_)}
        return self

    @property
    def unk(self):
        """Index of the "<UNK>" symbol."""
        return self.symbol_codes_["<UNK>"]

    @property
    def begin(self):
        """Index of the "<BEGIN>" symbol."""
        return self.symbol_codes_["<BEGIN>"]

    @property
    def end(self):
        """Index of the "<END>" symbol."""
        return self.symbol_codes_["<END>"]

    def __call__(self, data):
        """Encode one text (a list of words) or a list of texts.

        An empty list is treated as an empty text. Symbols not seen during
        `fit` are replaced with the "<UNK>" index.

        Returns:
            For one text: a list with one list of symbol indices per word, plus
            the optional "<BEGIN>"/"<END>" pseudo-words. For a list of texts:
            a list of such encodings.
        """
        # The `data` truthiness check keeps `data[0]` from failing on an empty list.
        if isinstance(data, list) and data and not isinstance(data[0], str):
            return [self(text) for text in data]
        # here `data` is a single text
        indexes = [[self.symbol_codes_.get(symbol, self.unk) for symbol in word] for word in data]
        if self.add_begin:
            indexes = [[self.begin]] + indexes
        if self.add_end:
            indexes = indexes + [[self.end]]
        return indexes

In [None]:
class SequenceDataset(Dataset):
    """Dataset of sentences whose items are dicts of index tensors.

    Each sentence is a list of dicts (one per token). The field named by
    `symbol_field` is encoded per character and padded/truncated to
    `max_word_length`; every other field is encoded by its own vocabulary.
    """

    def __init__(self, data, vocabs=None, fields=None,
                 symbol_field="word", max_word_length=16,
                 vocab_params=None, add_begin=False, add_end=False,
                 device="cuda"):
        """
        Args:
            data: list of sentences, each a list of per-token dicts.
            vocabs: mapping field -> fitted vocabulary; fitted from `data` when None.
            fields: mapping field -> key used in the returned item; required when
                `vocabs` is None.
            symbol_field: field that is encoded character by character.
            max_word_length: fixed number of symbol codes per word.
            vocab_params: mapping field -> keyword arguments of its vocabulary.
            add_begin: whether a begin position is added (masked out).
            add_end: whether an end position is added (masked out).
            device: device the item tensors are moved to.

        Raises:
            ValueError: if both `vocabs` and `fields` are None.
        """
        vocab_params = vocab_params or dict()
        self.add_begin = add_begin
        self.add_end = add_end
        self.fields = fields
        self.symbol_field = symbol_field
        self.max_word_length = max_word_length
        # build the vocabularies
        if vocabs is None:
            if fields is None:
                raise ValueError("You should pass `fields` to train `vocabs` if `vocabs` are not available.")
            vocabs = dict()
            for field in fields:
                # Copy the per-field parameters so the caller's dict is not mutated.
                curr_vocab_params = dict(vocab_params.get(field, dict()))
                curr_vocab_params["add_begin"] = add_begin
                curr_vocab_params["add_end"] = add_end
                vocab_cls = SymbolVocabulary if field == self.symbol_field else Vocabulary
                vocab = vocab_cls(**curr_vocab_params)
                data_for_vocab = [[elem[field] for elem in sent] for sent in data]
                vocabs[field] = vocab.fit(data_for_vocab)
        self.vocabs = vocabs
        self.data = data
        self.device = device

    def _make_mask(self, item):
        """Return a per-position mask: True for real tokens, False for begin/end."""
        answer = [True for _ in item]
        if self.add_begin:
            answer = [False] + answer
        if self.add_end:
            answer.append(False)
        return answer

    def _fit_word_length(self, codes):
        """Truncate or zero-pad one word's symbol codes to `max_word_length`."""
        if len(codes) >= self.max_word_length:
            return codes[:self.max_word_length]
        return codes + [0] * (self.max_word_length - len(codes))

    def __getitem__(self, index):
        """Encode sentence `index` into int64 tensors plus "mask" and "index" entries."""
        sentence = self.data[index]
        # `fields` may be None when ready-made vocabs were passed: keep field names as keys.
        field_names = self.fields if isinstance(self.fields, dict) else {}
        answer = dict()
        for field, vocab in self.vocabs.items():
            answer_field = field_names.get(field, field)
            curr_answer = vocab([elem[field] for elem in sentence])
            if field == self.symbol_field:
                curr_answer = [self._fit_word_length(codes) for codes in curr_answer]
            answer[answer_field] = curr_answer

        answer["mask"] = self._make_mask(sentence)
        answer = {key: torch.tensor(value, dtype=torch.int64).to(self.device) for key, value in answer.items()}
        # "index" stays a plain int so predictions can be put back in dataset order.
        answer["index"] = index
        return answer

    def __len__(self):
        """Number of sentences."""
        return len(self.data)

In [None]:
# Settings shared by the train/dev/test datasets of the character-level model.
dataset_params = {
    "fields": {"word": "input_ids", "tag": "tags", "label": "labels"},
    "symbol_field": "word", "max_word_length": 16
}
# The vocabularies are fitted on the training split only. "word" is the symbol field,
# so its `min_count` applies to characters rather than to whole words.
X_train = SequenceDataset(train_sents,
                          vocab_params={
                            "word": {"min_count": 10},
                            "tag": {"min_count": 3}
                          },
                          **dataset_params
                         )
# NOTE(review): the "word" entry is a SymbolVocabulary, so this annotation is only
# approximate -- confirm whether that matters for any tooling.
vocabs: Dict[str, Vocabulary] = X_train.vocabs
for field, vocab in vocabs.items():
    print(field, len(vocab.symbols_))
print(vocabs["label"].symbols_)
print("")

# Dev and test reuse the training vocabularies, so indices are consistent across splits.
X_dev = SequenceDataset(dev_sents, vocabs=vocabs, **dataset_params)
X_test = SequenceDataset(test_sents, vocabs=vocabs, **dataset_params)
# Show one encoded test sentence.
for field, elem in X_test[4].items():
    print(field, elem)

word 89
tag 59
label 13
['<PAD>', '<UNK>', '<BEGIN>', '<END>', 'B-ORG', 'B-LOC', 'O', 'B-PER', 'I-PER', 'B-MISC', 'I-ORG', 'I-LOC', 'I-MISC']

input_ids tensor([[13, 10, 41, 46, 17,  5,  7,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [12,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [13, 46, 15,  5, 17,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [18,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [19,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [20, 74,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [23, 17, 22,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [12,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [25, 24, 25,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [18,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [26,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]],
       device='cuda:0')
tags ten

In [None]:
import numpy as np
from torch.utils.data import DataLoader

def collate_fn(samples, dtype=torch.int64, keys=None):
    """Pad the samples of a batch to a common length and stack them.

    Args:
        samples: list of dataset items; each holds tensors under `keys` and an
            "index" entry.
        dtype: dtype of the zero padding.
        keys: names of the entries to batch; defaults to
            ["input_ids", "labels", "mask"].

    Returns:
        dict with one stacked tensor per key, zero-padded along the first axis
        to the longest "input_ids" in the batch, and "index" as a NumPy array
        of the sample indices.
    """
    if keys is None:
        keys = ["input_ids", "labels", "mask"]
    device = samples[0]["input_ids"].device
    # The target length is always taken from "input_ids", whatever `keys` contains.
    max_len = max(elem["input_ids"].shape[0] for elem in samples)

    answer = dict()
    for key in keys:
        padded = []
        for elem in samples:
            value = elem[key]
            # Only the first axis is padded; trailing axes (e.g. symbols per word) are kept.
            pad_shape = (max_len - len(value),) + tuple(value.shape[1:])
            # Allocate the padding directly on the target device.
            padding = torch.zeros(pad_shape, dtype=dtype, device=device)
            padded.append(torch.cat([value, padding]))
        answer[key] = torch.stack(padded)
    answer["index"] = np.array([elem["index"] for elem in samples])
    return answer

In [None]:
from functools import partial

# Fix the seed so the shuffled batch order below is reproducible.
torch.manual_seed(43)

# Same collation as `collate_fn`, but the "tags" field is batched as well.
collate_fn_with_tags = partial(collate_fn, keys=["input_ids", "tags", "labels", "mask"])

train_dataloader = DataLoader(X_train, batch_size=16, shuffle=True, collate_fn=collate_fn_with_tags)
# dev_dataloader = DataLoader(X_dev, batch_size=16, shuffle=False, collate_fn=collate_fn)
# test_dataloader = DataLoader(X_test, batch_size=16, shuffle=False, collate_fn=collate_fn)
# Print the shapes of the first ten batches; `batch` stays bound to the last batch
# fetched and is reused by later cells.
for i, batch in enumerate(train_dataloader):
    if i >= 10:
        break
    for key, value in batch.items():
        print(key, tuple(value.shape), end=" ")
    print("")

input_ids (16, 78, 16) tags (16, 78) labels (16, 78) mask (16, 78) index (16,) 
input_ids (16, 58, 16) tags (16, 58) labels (16, 58) mask (16, 58) index (16,) 
input_ids (16, 93, 16) tags (16, 93) labels (16, 93) mask (16, 93) index (16,) 
input_ids (16, 55, 16) tags (16, 55) labels (16, 55) mask (16, 55) index (16,) 
input_ids (16, 51, 16) tags (16, 51) labels (16, 51) mask (16, 51) index (16,) 
input_ids (16, 73, 16) tags (16, 73) labels (16, 73) mask (16, 73) index (16,) 
input_ids (16, 79, 16) tags (16, 79) labels (16, 79) mask (16, 79) index (16,) 
input_ids (16, 65, 16) tags (16, 65) labels (16, 65) mask (16, 65) index (16,) 
input_ids (16, 69, 16) tags (16, 69) labels (16, 69) mask (16, 69) index (16,) 
input_ids (16, 68, 16) tags (16, 68) labels (16, 68) mask (16, 68) index (16,) 


In [None]:
import torch.nn as nn

class ConvolutionalNetwork(nn.Module):
    """Stack of 1-D convolution layers followed by pooling over positions.

    Every layer may hold several parallel convolutions with different window
    widths; their outputs are concatenated along the channel axis.
    """

    def __init__(self, n_layers=1, n_input=32, n_hidden=128, window=5,
                 dropout=0.0, use_batch_norm=True, activation="relu",
                 aggregation="max", device="cuda"):
        """
        Args:
            n_layers: number of convolution layers.
            n_input: number of input channels.
            n_hidden: output channels; an int, a tuple (one entry per parallel
                convolution, shared by all layers) or a per-layer sequence.
            window: window widths, given in the same forms as `n_hidden`.
            dropout: dropout rate applied after the activation of each layer.
            use_batch_norm: whether to apply BatchNorm1d after the convolutions.
            activation: name handed to `make_activation`.
            aggregation: "max" for max-pooling over positions, otherwise mean.
            device: accepted but not used by this class.
        """
        super().__init__()
        self.n_layers = n_layers
        self.n_hidden = self._per_layer(n_hidden)
        self.window = self._per_layer(window)
        self.use_batch_norm = use_batch_norm
        # there may be several convolution layers
        self.convolutions = nn.ModuleList()
        in_channels = n_input
        for layer_index in range(self.n_layers):
            # parallel convolutions of this layer, one per (channels, width) pair
            branches = nn.ModuleList()
            out_channels = 0
            layer_spec = zip(self.n_hidden[layer_index], self.window[layer_index])
            for branch_channels, width in layer_spec:
                branches.append(
                    nn.Conv1d(in_channels, branch_channels, width, padding=(width-1)//2)
                )
                out_channels += branch_channels
            layer = {
                "convolutions": branches,
                "activation": make_activation(activation),
                "dropout": nn.Dropout(p=dropout),
            }
            if self.use_batch_norm:
                layer["batch_norm"] = nn.BatchNorm1d(out_channels)
            self.convolutions.append(nn.ModuleDict(layer))
            # the concatenated output feeds the next layer
            in_channels = out_channels
        self.aggregation = aggregation

    def _per_layer(self, value):
        """Expand an int or a tuple into a per-layer list; other values pass through."""
        if isinstance(value, int):
            value = (value,)  # a one-element tuple
        if isinstance(value, tuple):
            value = [value] * self.n_layers
        return value

    def forward(self, inputs):
        """Apply all layers and pool over positions.

        Args:
            inputs: tensor laid out as (batch, length, channels).

        Returns:
            Tensor of shape (batch, channels of the last layer).
        """
        # convolutions expect the channel axis second: B * d_emb * L
        layer_input = inputs.permute([0, 2, 1])
        for layer in self.convolutions:
            branch_outputs = [branch(layer_input) for branch in layer["convolutions"]]
            layer_output = torch.cat(branch_outputs, dim=1)  # B * h_out * L
            if self.use_batch_norm:
                layer_output = layer["batch_norm"](layer_output)
            layer_output = layer["activation"](layer_output)
            layer_output = layer["dropout"](layer_output)
            layer_input = layer_output
        features = layer_output.permute([0, 2, 1])  # back to B * L * h_out
        if self.aggregation == "max":
            pooled, _ = torch.max(features, dim=1)
            return pooled
        return torch.mean(features, dim=1)

In [None]:
class MultilayerSymbolRNNTagger(BasicNeuralTagger):
    """Tagger that builds word vectors from characters with a CNN, then runs a BiGRU."""

    def build_network(self, vocab_size, labels_number,
                      symbol_embeddings_dim=32, word_embeddings_dim=128,
                      symbol_network_layers=1, symbol_network_window=5,
                      symbol_network_dropout=0.1, symbol_network_aggregation="max",
                      n_layers=1, n_hidden=128, dropout=0.0):
        """Create the layers of the network.

        Args:
            vocab_size: number of rows of the symbol embedding table.
            labels_number: size of the output label set.
            symbol_embeddings_dim: symbol embedding size.
            word_embeddings_dim: size of the word vector produced by the symbol network.
            symbol_network_layers: number of convolution layers in the symbol network.
            symbol_network_window: convolution window width(s).
            symbol_network_dropout: dropout rate inside the symbol network.
            symbol_network_aggregation: pooling over symbols ("max", otherwise mean).
            n_layers: number of stacked GRU layers.
            n_hidden: GRU hidden size per direction.
            dropout: dropout rate used between GRU layers and on the GRU outputs.
        """
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.embedding = nn.Embedding(vocab_size, symbol_embeddings_dim, padding_idx=0)
        self.symbol_network = ConvolutionalNetwork(n_layers=symbol_network_layers,
                                                   n_input=symbol_embeddings_dim,
                                                   n_hidden=word_embeddings_dim,
                                                   window=symbol_network_window,
                                                   dropout=symbol_network_dropout,
                                                   aggregation=symbol_network_aggregation
                                                  )
        # there may be several recurrent layers
        # GRU applies its own dropout only between stacked layers; with a single layer
        # a non-zero value has no effect and only makes PyTorch emit a warning.
        rnn_dropout = dropout if self.n_layers > 1 else 0.0
        self.rnn = torch.nn.GRU(word_embeddings_dim, self.n_hidden, self.n_layers,
                                batch_first=True, bidirectional=True, dropout=rnn_dropout)
        self.dropout = torch.nn.Dropout(dropout)
        # The bidirectional GRU yields 2 * n_hidden features per position.
        self.dense = nn.Linear(2*self.n_hidden, labels_number)
        self.log_softmax = nn.LogSoftmax(dim=-1)

    def forward(self, input_ids, **kwargs):
        """Return per-position log-probabilities and argmax label indices.

        Args:
            input_ids: symbol indices laid out as [B, L, w] (batch, words,
                symbols per word); moved to `self.device` when it is set.
            **kwargs: extra batch entries, ignored here.

        Returns:
            dict with "log_probs" (log-softmax over labels) and "labels" (argmax indices).
        """
        if self.device is not None:
            input_ids = input_ids.to(self.device)
        symbol_embeddings = self.embedding(input_ids) # [B, L, w, d]
        # Merge batch and word axes so every word is an independent sequence of symbols.
        symbol_embeddings_reshaped = symbol_embeddings.reshape((-1,)+tuple(symbol_embeddings.shape[-2:])) # [(B * L), w, d]
        word_embeddings_reshaped = self.symbol_network(symbol_embeddings_reshaped) # [B*L, D]
        word_embeddings = word_embeddings_reshaped.reshape(tuple(symbol_embeddings.shape[:2])+(-1,)) # [B, L, D]
        rnn_outputs, _ = self.rnn(word_embeddings)
        rnn_outputs = self.dropout(rnn_outputs)
        # final layer
        logits = self.dense(rnn_outputs)
        log_probs = self.log_softmax(logits)
        _, labels = torch.max(log_probs, dim=-1)
        return {"log_probs": log_probs, "labels": labels}

In [None]:
# Instantiate the character-level tagger (here "word" is the symbol vocabulary) and
# list every parameter with its device and shape.
model = MultilayerSymbolRNNTagger(
            vocab_size=len(vocabs["word"].symbols_),
            labels_number=len(vocabs["label"].symbols_),
            word_embeddings_dim=192,
            device="cuda"
)
for name, elem in model.named_parameters():
    print(name, elem.device, elem.shape)

embedding.weight cuda:0 torch.Size([89, 32])
symbol_network.convolutions.0.convolutions.0.weight cuda:0 torch.Size([192, 32, 5])
symbol_network.convolutions.0.convolutions.0.bias cuda:0 torch.Size([192])
symbol_network.convolutions.0.batch_norm.weight cuda:0 torch.Size([192])
symbol_network.convolutions.0.batch_norm.bias cuda:0 torch.Size([192])
rnn.weight_ih_l0 cuda:0 torch.Size([384, 192])
rnn.weight_hh_l0 cuda:0 torch.Size([384, 128])
rnn.bias_ih_l0 cuda:0 torch.Size([384])
rnn.bias_hh_l0 cuda:0 torch.Size([384])
rnn.weight_ih_l0_reverse cuda:0 torch.Size([384, 192])
rnn.weight_hh_l0_reverse cuda:0 torch.Size([384, 128])
rnn.bias_ih_l0_reverse cuda:0 torch.Size([384])
rnn.bias_hh_l0_reverse cuda:0 torch.Size([384])
dense.weight cuda:0 torch.Size([13, 256])
dense.bias cuda:0 torch.Size([13])


In [None]:
# Sanity check: train repeatedly on the same `batch` (left over from an earlier cell)
# and print the loss for the first five steps and then every tenth step.
for i in range(200):
    loss = model.train_on_batch(**batch)["loss"].item()
    if i < 5 or (i+1) % 10 == 0:
        print(i, loss)
# Loss on the same batch as reported by the validation routine.
print(model.validate_on_batch(**batch)["loss"].item())

0 2.8639614582061768
1 1.4194104671478271
2 0.77518230676651
3 0.686903178691864
4 0.7100348472595215
9 0.6913185119628906
19 0.573301374912262
29 0.5197886824607849
39 0.4690025746822357
49 0.4071226716041565
59 0.3376113176345825
69 0.2594606280326843
79 0.19111256301403046
89 0.13804244995117188
99 0.09743236005306244
109 0.07094649225473404
119 0.05127948150038719
129 0.03909169137477875
139 0.030640283599495888
149 0.02388760820031166
159 0.01988990791141987
169 0.01642182283103466
179 0.014226853847503662
189 0.012228230014443398
199 0.010675170458853245
0.009719926863908768


In [None]:
# The symbol model does not read tags, so the default `collate_fn` keys are enough.
train_dataloader = DataLoader(X_train, batch_size=16, shuffle=True, collate_fn=collate_fn)
dev_dataloader = DataLoader(X_dev, batch_size=16, shuffle=False, collate_fn=collate_fn)
test_dataloader = DataLoader(X_test, batch_size=16, shuffle=False, collate_fn=collate_fn)
NEPOCHS = 5

model = MultilayerSymbolRNNTagger(
    vocab_size=len(vocabs["word"].symbols_),
    labels_number=len(vocabs["label"].symbols_),
    n_hidden=192,
    device="cuda"
)
# Model selection: keep the weights of the epoch with the highest validation accuracy.
best_val_acc = 0.0
checkpoint = "checkpoint_best.pt"
for epoch in range(NEPOCHS):
    do_epoch(model, train_dataloader, mode="train", epoch=epoch+1)
    epoch_metrics = do_epoch(model, dev_dataloader, mode="validate", epoch=epoch+1)
    # The checkpoint is only written when the validation accuracy strictly improves.
    if epoch_metrics["accuracy"] > best_val_acc:
        best_val_acc = epoch_metrics["accuracy"]
        torch.save(model.state_dict(), checkpoint)
        # print("Saving ")
# Restore the best weights, then evaluate on the dev and test splits.
model.load_state_dict(torch.load(checkpoint))
do_epoch(model, dev_dataloader, mode="validate", epoch="evaluate")
do_epoch(model, test_dataloader, mode="validate", epoch="evaluate")

  0%|          | 0/453 [00:00<?, ?it/s]

  0%|          | 0/103 [00:00<?, ?it/s]

  0%|          | 0/453 [00:00<?, ?it/s]

  0%|          | 0/103 [00:00<?, ?it/s]

  0%|          | 0/453 [00:00<?, ?it/s]

  0%|          | 0/103 [00:00<?, ?it/s]

  0%|          | 0/453 [00:00<?, ?it/s]

  0%|          | 0/103 [00:00<?, ?it/s]

  0%|          | 0/453 [00:00<?, ?it/s]

  0%|          | 0/103 [00:00<?, ?it/s]

  0%|          | 0/103 [00:00<?, ?it/s]

  0%|          | 0/85 [00:00<?, ?it/s]

{'correct': 49854,
 'total': 51371,
 'sent_correct': 778,
 'sent_total': 1355,
 'loss': 0.09620788308846599,
 'n_batches': 85,
 'accuracy': 0.9704697202701914,
 'sent_accuracy': 0.574169741697417}

In [None]:
def predict_with_model(model, X: SequenceDataset, batch_size=32, batch_collate_fn=collate_fn):
    """Predict label strings for every sample of `X`.

    The model is switched to eval mode, batches are built with
    `batch_collate_fn`, and predicted label indices are mapped back to symbols
    of `X.vocabs["label"]`. Only positions where the batch mask is true are kept.

    Returns:
        A list with one array of label symbols per sample, ordered by the
        sample index inside `X`.
    """
    model.eval()
    predictions = [None] * len(X)
    loader = DataLoader(X, batch_size=batch_size, shuffle=False, collate_fn=batch_collate_fn)
    for batch in loader:
        with torch.no_grad():
            outputs = model(**batch)
        predicted_ids = outputs["labels"].cpu().numpy()
        keep_masks = batch["mask"].bool().cpu().numpy()
        for sample_index, sample_ids, sample_keep in zip(batch["index"], predicted_ids, keep_masks):
            # boolean indexing keeps only the positions where the mask is True
            kept_ids = sample_ids[sample_keep]
            predictions[sample_index] = np.take(X.vocabs["label"].symbols_, kept_ids)
    return predictions

# Predict labels for the whole test split. The tag-aware collate function also puts
# "tags" into each batch; the model is called with the full batch as keyword arguments.
predictions = predict_with_model(model, X_test, batch_collate_fn=collate_fn_with_tags)
# Optional manual inspection of a single sentence:
# for elem, label in zip(test_sents[39], predictions[39]):
#     print(elem['word'], elem['label'], label)
# print(predictions[39])

In [None]:
from datasets import load_metric

# Load the seqeval metric. trust_remote_code=True is passed explicitly because
# `datasets` warns that the flag will become mandatory for loading this metric;
# passing it also silences that warning.
metric = load_metric('seqeval', trust_remote_code=True)
# Gold label sequences, one list per test sentence, compared against `predictions`.
corr_labels = [[elem['label'] for elem in sent] for sent in test_sents]
results = metric.compute(references=corr_labels, predictions=predictions)
# Print every entry of the result: per-type scores followed by the overall_* figures.
for key, value in results.items():
    print(key, value)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


LOC {'precision': 0.7419659735349716, 'recall': 0.724169741697417, 'f1': 0.7329598506069094, 'number': 1084}
MISC {'precision': 0.3746898263027295, 'recall': 0.4441176470588235, 'f1': 0.4064602960969044, 'number': 340}
ORG {'precision': 0.7207880434782609, 'recall': 0.7578571428571429, 'f1': 0.7388579387186629, 'number': 1400}
PER {'precision': 0.7843373493975904, 'recall': 0.8857142857142857, 'f1': 0.8319488817891374, 'number': 735}
overall_precision 0.703693861280893
overall_recall 0.7440292216914863
overall_f1 0.7232996449057635
overall_accuracy 0.9704697202701914


In [None]:
# Ad-hoc check on a single sentence; uncomment one of the alternatives to try another.
# words = "Tony Blair es el Primer Ministro de Gran Bretaña y vive en Londres.".split()
words = "En el Festival de Cine de Venecia de este año se presentará una película basada en el libro del famoso escritor americano Jack London \"Martin Eden\".".split()
# words = "Después de la llegada al poder de Napoleón I y la proclamación del Primer Imperio, la Marsellesa perdió su estatus como himno nacional de Francia .".split()
# words = "Elon Musk es el CEO de Tesla y SpaceX .".split()
# words = "Los bancos regionales de EE. UU. siguen bajo presión un año después del colapso de Silicon Valley Bank .".split()

# One record per whitespace-separated token; tag and label are left empty.
sent = [dict(word=token, tag="", label="") for token in words]
# Reuse the training vocabularies and the training add_begin setting.
X_sent = SequenceDataset(
    [sent],
    fields={"word": "input_ids", "tag": "tags", "label": "labels"},
    vocabs=X_train.vocabs,
    add_begin=X_train.add_begin,
)
y_sent_pred = predict_with_model(model, X_sent)
# The dataset holds one sentence, so its prediction is the first entry.
for word, label in zip(words, y_sent_pred[0]):
    print(word, label)

En O
el O
Festival B-MISC
de I-MISC
Cine I-MISC
de I-MISC
Venecia I-MISC
de O
este O
año O
se O
presentará O
una O
película O
basada O
en O
el O
libro O
del O
famoso O
escritor O
americano O
Jack B-PER
London I-PER
"Martin I-PER
Eden". I-PER


# Тестирование и вывод

Улучшенная модель (посимвольная сеть) справляется лучше базовых моделей, а также модели, использующей pos-теги. Ниже приведены значения четырёх основных метрик качества на тестовой выборке для каждой из четырёх моделей (жирным выделены лучшие значения).

**MultilayerConvTagger**

overall_precision 0.669

overall_recall 0.643

overall_f1 0.655

overall_accuracy 0.957

**MultilayerRNNTagger**

overall_precision **0.734**

overall_recall 0.669

overall_f1 0.7

overall_accuracy 0.96

**MultilayerRNNTaggerWithTags**

overall_precision 0.719

overall_recall 0.688

overall_f1 0.703

overall_accuracy 0.962

**MultilayerSymbolRNNTagger**

overall_precision 0.693

overall_recall **0.745**

overall_f1 **0.718**

overall_accuracy **0.97**

Заметим, что наибольшая точность (precision) у рекуррентной сети. Она же показывала наилучшие результаты при тестировании на предложениях не из тестовой выборки (в сравнении со свёрточной сетью и сетью с добавлением тегов). Так, она (результаты трёх разных запусков) распознавала как персону Джека Лондона, Тони Блэра и Мартина Идена, выделяла как MISC Венецианский кинофестиваль, чего другие модели сделать не смогли.

Теперь протестируем лучшую модель (посимвольную сеть) на 10 предложениях не из тестовой выборки (3 из них уже использовались выше после обучения и тестирования моделей).

In [None]:
# Ten numbered sentences plus two extra ones; each comment is an English translation.
sent1 = "Después de la llegada al poder de Napoleón I y la proclamación del Primer Imperio, la Marsellesa perdió su estatus como himno nacional de Francia ."  # After Napoleon I came to power and the proclamation of the First Empire, the Marseillaise lost its status as the national anthem of France.
sent2 = "La diosa Afrodita es retratada por Venus de Milo ."  # The goddess Aphrodite is portrayed by Venus de Milo.
sent3 = "Se espera que Apple lance un nuevo iPhone el próximo mes ."  # Apple is expected to release a new iPhone next month.
sent4 = "La Estatua de la Libertad se encuentra en la ciudad de Nueva York ."  # The Statue of Liberty is located in New York City.
sent5 = "Elon Musk es el CEO de Tesla y SpaceX ."  # Elon Musk is the CEO of Tesla and SpaceX.
sent6 = "La Segunda Guerra Mundial fue un conflicto que duró desde 1939 hasta 1945 ."  # The Second World War was a conflict that lasted from 1939 to 1945.
sent7 = "El Día de la Independencia en India se celebra el 15 de agosto cuando entraron en vigor las disposiciones de la Ley de Independencia de la India ."  # The Independence Day in India is celebrated on the 15th of August, when the provisions of the Indian Independence Act came into effect.
sent8 = "Los bancos regionales de EE. UU. siguen bajo presión un año después del colapso de Silicon Valley Bank ."  # U.S. regional banks remain under pressure a year after the collapse of Silicon Valley Bank.
sent9 = "Durante la crisis del Covid, el apoyo del gobierno federal se disparó ."  # During the Covid crisis, federal government support skyrocketed.
sent10 = "Los atletas de atletismo de Brasil, Ecuador, Perú y Portugal se someterán a pruebas con más frecuencia antes de los Juegos Olímpicos de París de este año debido a los programas antidopaje deficientes en el país ."  # Track and field athletes from Brazil, Ecuador, Peru and Portugal will be tested more often ahead of this year's Paris Olympics because of sub-standard anti-doping programs at home.
sent_add = "Sergio gritó: ¡PARA! - y huyó tras sus camaradas."  # Sergio shouted: STOP! - and ran off after his comrades.
sent_add_2 = "Por la noche, se tocaba música en vivo en el Restaurante."  # In the evening, live music was played at the Restaurant.

# Split each sentence on whitespace, predict labels for it, and print one
# "word label" pair per line.
for words in map(str.split, [sent1, sent2, sent3, sent4, sent5, sent6, sent7, sent8, sent9, sent10, sent_add, sent_add_2]):
    sent = [dict(word=token, tag="", label="") for token in words]
    X_sent = SequenceDataset(
        [sent],
        fields={"word": "input_ids", "tag": "tags", "label": "labels"},
        vocabs=X_train.vocabs,
        add_begin=X_train.add_begin,
    )
    y_sent_pred = predict_with_model(model, X_sent)
    for word, label in zip(words, y_sent_pred[0]):
        print(word, label)
    print("\n")


Después O
de O
la O
llegada O
al O
poder O
de O
Napoleón B-LOC
I I-MISC
y O
la O
proclamación O
del O
Primer B-MISC
Imperio, I-MISC
la I-MISC
Marsellesa I-MISC
perdió O
su O
estatus O
como O
himno O
nacional O
de O
Francia B-LOC
. O


La O
diosa O
Afrodita B-ORG
es O
retratada O
por O
Venus B-PER
de I-PER
Milo I-PER
. O


Se O
espera O
que O
Apple B-ORG
lance O
un O
nuevo O
iPhone B-MISC
el O
próximo O
mes O
. O


La O
Estatua B-LOC
de I-LOC
la I-LOC
Libertad I-LOC
se O
encuentra O
en O
la O
ciudad O
de O
Nueva B-LOC
York I-LOC
. O


Elon O
Musk I-PER
es O
el O
CEO B-ORG
de O
Tesla B-LOC
y O
SpaceX B-LOC
. O


La O
Segunda B-MISC
Guerra I-PER
Mundial I-MISC
fue O
un O
conflicto O
que O
duró O
desde O
1939 O
hasta O
1945 O
. O


El O
Día O
de I-MISC
la I-MISC
Independencia I-MISC
en O
India B-LOC
se O
celebra O
el O
15 O
de O
agosto O
cuando O
entraron O
en O
vigor O
las O
disposiciones O
de O
la O
Ley B-MISC
de I-MISC
Independencia I-MISC
de I-MISC
la I-MISC
India I-MISC
. O


Los O
ba

Посимвольная модель хорошо справляется с примерами разного типа. Так, она успешно выделяет именованные сущности, состоящие из нескольких слов, присваивая им одну и ту же метку (*Juegos Olímpicos de París*, *Segunda Guerra Mundial*). На этих и других примерах видно, что модель научилась "соединять" слова, начинающиеся с заглавной буквы и разделённые предлогом типа *de* и артиклем типа *la*, в одну именованную сущность одного типа (*Ley de Independencia de la India*).
Также модель достаточно успешно распознаёт незнакомые ей имена (*Elon Musk*, *SpaceX*, *Covid*\*). Базовые модели и модель с добавлением тегов в таких случаях обычно пропускали незнакомое им имя.

\* модель обучена на новостных статьях 2000 года и этих слов видеть не могла

Выражения, которые пишутся с большой буквы, но при этом не относятся к именованным сущностям, "ловят" модель (например, восклицание *¡PARA!* в последнем примере), однако они определяются моделью как MISC, что хорошо, так как это наиболее нейтральный вариант.

Заглавная буква в начале предложения проблем у модели не вызывает.