In [18]:
# Скачивание данных с kaggle по гайду: https://www.kaggle.com/general/74235
! pip install --upgrade kaggle
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle competitions download -c rupos2018
! mkdir data
! unzip rupos2018.zip -d data

Requirement already up-to-date: kaggle in /usr/local/lib/python3.6/dist-packages (1.5.10)
mkdir: cannot create directory ‘/root/.kaggle’: File exists
rupos2018.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  rupos2018.zip
  inflating: data/sample_submission.csv  
  inflating: data/test.csv           
  inflating: data/train.csv          


In [20]:
# загрузка предобученных эмбеддингов
! wget -O cc.ru.300.vec http://service.hucompute.org/embeddings/api/v1/embeddings/fasttext_crawl_cc.ru.300.vec/download

--2020-12-07 17:24:36--  http://service.hucompute.org/embeddings/api/v1/embeddings/fasttext_crawl_cc.ru.300.vec/download
Resolving service.hucompute.org (service.hucompute.org)... 141.2.89.20
Connecting to service.hucompute.org (service.hucompute.org)|141.2.89.20|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4536408847 (4.2G) [application/octet-stream]
Saving to: ‘cc.ru.300.vec’


2020-12-07 17:31:01 (11.2 MB/s) - ‘cc.ru.300.vec’ saved [4536408847/4536408847]



In [35]:
! pip install allennlp==0.9.0

Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1
[?25l  Downloading https://files.pythonhosted.org/packages/56/aa/4ef5aa67a9a62505db124a5cb5262332d1d4153462eb8fd89c9fa41e5d92/urllib3-1.25.11-py2.py3-none-any.whl (127kB)
[K     |████████████████████████████████| 133kB 14.3MB/s 
[31mERROR: datascience 0.10.6 has requirement folium==0.2.1, but you'll have folium 0.8.3 which is incompatible.[0m
Installing collected packages: urllib3
  Found existing installation: urllib3 1.26.2
    Uninstalling urllib3-1.26.2:
      Successfully uninstalled urllib3-1.26.2
Successfully installed urllib3-1.25.11


Основная часть ноутбука ниже взята из семинарского занятия по AllenNLP.

In [1]:
import torch
from src_allennlp.model import SimpleTagger
from src_allennlp.reader import RuPosReader
from allennlp.data.vocabulary import Vocabulary
from allennlp.common import Params
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding, TokenCharactersEncoder
from allennlp.modules.seq2seq_encoders import PytorchSeq2SeqWrapper
from allennlp.modules.seq2vec_encoders import CnnEncoder
from allennlp.data.iterators import BucketIterator
from allennlp.training.trainer import Trainer

Из всех импортов здесь только два самописных класса — ридер (`RuPosReader`) и сама модель (`SimpleTagger`); всё остальное взято из библиотек `torch` и `allennlp`.

In [2]:
# Read the training CSV with the project-specific reader.
reader = RuPosReader()
dataset = reader.read('data/train.csv')

# Build the vocabulary from the read instances. The `tokens` namespace is also
# given the pretrained embeddings file; words are not restricted to those
# present in that file (only_include_pretrained_words=False).
pretrained_vocab_files = {'tokens': './cc.ru.300.vec'}
vocab = Vocabulary.from_instances(
    dataset,
    pretrained_files=pretrained_vocab_files,
    only_include_pretrained_words=False,
)

# Show the namespace summary and the index -> tag mapping of the label set.
print(vocab)
label_index_to_tag = vocab.get_index_to_token_vocabulary('labels')
print(label_index_to_tag)

48171it [00:04, 11690.76it/s]
100%|██████████| 48171/48171 [00:05<00:00, 8801.97it/s]
100%|██████████| 2000000/2000000 [00:13<00:00, 149789.24it/s]


Vocabulary with namespaces:
 	Non Padded Namespaces: {'*tags', '*labels'}
 	Namespace: tokens, Size: 98882 
 	Namespace: token_characters, Size: 159 
 	Namespace: labels, Size: 17 

{0: 'NOUN', 1: 'PUNCT', 2: 'VERB', 3: 'ADJ', 4: 'ADP', 5: 'ADV', 6: 'PROPN', 7: 'PRON', 8: 'CONJ', 9: 'PART', 10: 'DET', 11: 'SCONJ', 12: 'NUM', 13: 'AUX', 14: 'X', 15: 'INTJ', 16: 'SYM'}


В ячейке выше мы прочитали обучающий датасет и построили по нему словарь (пространства имён для токенов, символов и меток).

Реализована модель из статьи: [End-to-end Sequence Labeling via Bi-directional LSTM-CNNs-CRF](https://arxiv.org/pdf/1603.01354.pdf)

In [3]:
# --- Hyperparameters -------------------------------------------------------
word_emb_dim = 300             # presumably the dimensionality of cc.ru.300.vec - confirm
char_emb_dim = 50              # size of each character embedding
char_repr_dim = 100            # size of the per-token character representation
cnn_encoder_filters_num = 32   # number of filters per n-gram size in the char CNN
hidden_dim = 300               # LSTM hidden size (per direction)

# --- Word-level embeddings: loaded from the pretrained file and kept frozen --
params = Params({
    "pretrained_file": './cc.ru.300.vec',
    "embedding_dim": word_emb_dim,
    "trainable": False,
})
word_embed = Embedding.from_params(vocab, params)

# --- Character-level representation: char embeddings fed through a CNN ------
char_vocab_size = vocab.get_vocab_size('token_characters')
char_embed = Embedding(
    num_embeddings=char_vocab_size,
    embedding_dim=char_emb_dim,
)
chars_cnn_encoder = CnnEncoder(
    embedding_dim=char_emb_dim,
    num_filters=cnn_encoder_filters_num,
    ngram_filter_sizes=(3,3),
    output_dim=char_repr_dim,
)
token_chars_encoder = TokenCharactersEncoder(char_embed, chars_cnn_encoder)

# --- Combine both views of a token into a single text-field embedder --------
# NOTE(review): the keys "tokens" / "chars" presumably have to match the token
# indexer names used by RuPosReader - verify against the reader.
embedder = BasicTextFieldEmbedder({"tokens": word_embed, "chars": token_chars_encoder})

# --- Sequence encoder: 2-layer bidirectional LSTM over the embedded tokens --
lstm = torch.nn.LSTM(
    input_size=embedder.get_output_dim(),
    hidden_size=hidden_dim,
    num_layers=2,
    batch_first=True,
    bidirectional=True,
)
encoder = PytorchSeq2SeqWrapper(lstm)

# --- Final tagger ----------------------------------------------------------
model = SimpleTagger(vocab, embedder, encoder)

100%|██████████| 2000000/2000000 [00:22<00:00, 90230.83it/s] 


In [4]:
# Hold out the last 1000 instances for validation; train on everything before
# them. The split is positional - no shuffling is applied here.
dev_size = 1000
train_dataset = dataset[:-dev_size]
dev_dataset = dataset[-dev_size:]

In [5]:
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# notebook also runs on a runtime without CUDA instead of crashing here.
use_cuda = torch.cuda.is_available()
device = torch.device('cuda' if use_cuda else 'cpu')
model.to(device)

# The optimizer is created after the model has been moved to its device.
optimizer = torch.optim.Adam(model.parameters())

# Batches of 256 instances, sorted on the number of tokens in the "tokens"
# field; the iterator needs the vocabulary to index the instances.
iterator = BucketIterator(batch_size=256, sorting_keys=[("tokens", "num_tokens")], biggest_batch_first=True)
iterator.index_with(vocab)

# Train for at most 20 epochs with patience=5, tracking the validation
# "fscore" metric ("+" presumably marks higher-is-better - confirm against the
# AllenNLP Trainer documentation).
trainer = Trainer(model=model,
                  optimizer=optimizer,
                  iterator=iterator,
                  train_dataset=train_dataset,
                  validation_dataset=dev_dataset,
                  patience=5,
                  num_epochs=20,
                  # AllenNLP uses -1 to denote the CPU.
                  cuda_device=0 if use_cuda else -1,
                  validation_metric="+fscore")

In [6]:
# Run the training loop; the returned metrics dictionary is displayed as the
# cell output.
training_metrics = trainer.train()
training_metrics

accuracy: 0.6111, precision: 0.6111, recall: 0.6111, fscore: 0.6111, loss: 5362.5973 ||: 100%|██████████| 185/185 [00:39<00:00,  4.64it/s]
accuracy: 0.8841, precision: 0.8841, recall: 0.8841, fscore: 0.8841, loss: 1533.7587 ||: 100%|██████████| 4/4 [00:00<00:00,  7.81it/s]
accuracy: 0.9307, precision: 0.9307, recall: 0.9307, fscore: 0.9307, loss: 996.6652 ||: 100%|██████████| 185/185 [00:30<00:00,  6.02it/s]
accuracy: 0.9568, precision: 0.9568, recall: 0.9568, fscore: 0.9568, loss: 591.0114 ||: 100%|██████████| 4/4 [00:00<00:00, 11.99it/s]
accuracy: 0.9649, precision: 0.9649, recall: 0.9649, fscore: 0.9649, loss: 511.4359 ||: 100%|██████████| 185/185 [00:30<00:00,  6.09it/s]
accuracy: 0.9735, precision: 0.9735, recall: 0.9735, fscore: 0.9735, loss: 361.9126 ||: 100%|██████████| 4/4 [00:00<00:00, 11.04it/s]
accuracy: 0.9727, precision: 0.9727, recall: 0.9727, fscore: 0.9727, loss: 387.9391 ||: 100%|██████████| 185/185 [00:30<00:00,  6.03it/s]
accuracy: 0.9790, precision: 0.9790, recall:

{'best_epoch': 14,
 'best_validation_accuracy': 0.9862840018441679,
 'best_validation_fscore': 0.9862840175628662,
 'best_validation_loss': 199.59039306640625,
 'best_validation_precision': 0.9862840175628662,
 'best_validation_recall': 0.9862840175628662,
 'epoch': 18,
 'peak_cpu_memory_MB': 3638.384,
 'peak_gpu_0_memory_MB': 3043,
 'training_accuracy': 0.9965824150373739,
 'training_cpu_memory_MB': 3638.384,
 'training_duration': '0:10:11.443372',
 'training_epochs': 18,
 'training_fscore': 0.9965823888778687,
 'training_gpu_0_memory_MB': 3043,
 'training_loss': 44.51641163181614,
 'training_precision': 0.9965823888778687,
 'training_recall': 0.9965823888778687,
 'training_start_epoch': 0,
 'validation_accuracy': 0.9844398340248963,
 'validation_fscore': 0.9844398498535156,
 'validation_loss': 274.47485542297363,
 'validation_precision': 0.9844398498535156,
 'validation_recall': 0.9844398498535156}

In [7]:
# Switch the model to evaluation mode and tag a single held-out sentence
# (index 1 of the dev set) without tracking gradients.
model.eval()
results = []
sample_instance = dev_dataset[1]
with torch.no_grad():
    sample_output = model.forward_on_instance(sample_instance)
# Predicted tags for the sample sentence, one per token.
labels = sample_output['labels']

In [8]:
# Print each token of the sample sentence next to its predicted tag.
sample_tokens = dev_dataset[1]['tokens'].tokens
for token_and_label in zip(sample_tokens, labels):
    print(*token_and_label)

Сегодня ADV
Великий ADJ
Октябрь PROPN
прибавил VERB
к ADP
своей DET
биографии NOUN
еще ADV
один NUM
год NOUN
. PUNCT
