[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/12ceSx8nxm7vRb4vacsjBHGdz6nXh3tCc?usp=sharing)

# Техническая часть

In [2]:
!pip install navec
!pip install razdel



In [3]:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# 1. Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

train_text_dwld = drive.CreateFile({'id':'11pMg8DdZ56edNBs647k0qDjOPmeFNLPM'})
train_text_dwld.GetContentFile('train.csv')

train_text_dwld = drive.CreateFile({'id':'1o0ruW9qzbHqm1buCQQcknyIghHeRBXym'})
train_text_dwld.GetContentFile('navec_hudlit_v1_12B_500K_300d_100q.tar')

In [26]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import io
import collections
import seaborn as sns
import matplotlib.pyplot as plt
import random

import math
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

from navec import Navec
from razdel import tokenize

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

from torchtext.data import Field, Example, Dataset, BucketIterator

random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Данные

In [5]:
def get_sentences(filename, is_train):
    sentences = []
    with io.open(filename, "r", encoding='utf-8') as r:
        # Пропускаем заголовок
        next(r)
        sentence = [] # будем заполнять список предложений
        for line in r:
            # предложения отделены по '\n'
            if len(line.strip()) == 0:
                if len(sentence) == 0:
                    continue
                sentences.append(sentence)
                sentence = []
                continue
            if is_train:
                # Формат: индекс\tномер_в_предложении\tсловоформа\tPOS#Грамемы
                word = line.strip().split("\t")[2]
                pos = line.strip().split("\t")[3].split("#")[0]
                # gram = line.strip().split("\t")[3].split("#")[1]
                sentence.append((word, pos))
            else:
                word = line.strip().split("\t")[2]
                sentence.append(word)
        if len(sentence) != 0:
            sentences.append(sentence)
    return sentences

train = get_sentences('train.csv', True)

In [6]:
for word, tag in train[0][:10]:
    print('{:15}\t{}'.format(word, tag))

А              	CONJ
ведь           	PART
для            	ADP
конкретных     	ADJ
изделий        	NOUN
зачастую       	ADV
нужен          	ADJ
монокристалл   	NOUN
не             	PART
только         	PART


## Подготовка к torchtext

In [27]:
def razdel_tokenizer(text):
    tokens = list(tokenize(text))
    tokens = [_.text for _ in tokens]
    return tokens

In [7]:
def read_data(sentences):
    examples = []
    fields = {'sentence_labels': ('labels', label_field),
              'sentence_tokens': ('text', text_field)}
    
    for sentence in sentences: 
        tokens = [t[0] for t in sentence]
        labels = [t[1] for t in sentence]
        
        e = Example.fromdict({"sentence_labels": labels, "sentence_tokens": tokens},
                             fields=fields)
        examples.append(e)
    
    return Dataset(examples, fields=[('labels', label_field), ('text', text_field)])

In [30]:
text_field = Field(sequential=True, tokenize=razdel_tokenizer, include_lengths=True)
label_field = Field(sequential=True, is_target=True)

train_data, valid_data = train_test_split(train, test_size=0.1)

train_data = read_data(train_data)
val_data = read_data(valid_data)

print(train_data[0].text)
print(train_data[0].labels)

['Вскоре', 'мы', 'познакомились', ',', 'и', 'я', 'убедился', 'в', 'том', ',', 'что', 'кроме', 'привлекательной', 'внешности', 'она', 'обладает', 'редкой', 'способностью', 'очень', 'здраво', 'рассуждать', 'о', 'самых', 'разных', 'вещах', 'и', 'событиях', '.']
['ADV', 'PRON', 'VERB', 'PUNCT', 'CONJ', 'PRON', 'VERB', 'ADP', 'PRON', 'PUNCT', 'SCONJ', 'ADP', 'ADJ', 'NOUN', 'PRON', 'VERB', 'ADJ', 'NOUN', 'ADV', 'ADV', 'VERB', 'ADP', 'ADJ', 'ADJ', 'NOUN', 'CONJ', 'NOUN', 'PUNCT']


In [41]:
text_field.build_vocab(train_data)
label_field.build_vocab(train_data)

train_iterator, valid_iterator = BucketIterator.splits(
    (train_data, val_data),
    shuffle=True,
    sort_key=lambda x: len(x.text),
    sort_within_batch=True,
    batch_size = 64,
    device = device)

In [32]:
path = 'navec_hudlit_v1_12B_500K_300d_100q.tar'
navec = Navec.load(path)

navec['привет'].shape

(300,)

## Строим embedding

In [33]:
known_count = 0
unk_word = []
embeddings = np.zeros((len(text_field.vocab), 300))
for idx, word in tqdm(enumerate(text_field.vocab.itos), total=len(text_field.vocab)):
    word = word.lower()
    if word in navec:
        embeddings[idx] = navec[word]
        known_count += 1
    else:
        unk_word.append(word)
print()
print(f'navec знает {known_count} слов')

100%|██████████| 101296/101296 [00:01<00:00, 85759.63it/s]


navec знает 89134 слов





In [35]:
unk_word[0:25]

[',',
 '.',
 '"',
 '-',
 ':',
 ')',
 '(',
 '?',
 '!',
 '…',
 '%',
 ';',
 '10',
 'а_также',
 'потому_что',
 'то_есть',
 '1',
 '20',
 '15',
 '2',
 '5',
 'с_помощью',
 'во_время',
 '3',
 '30']

Не понятно, что делать с такими словами.

# Model

В качестве модели, буду использовать обычную bi-GRU с pack_padded_sequence.

Будем дообучать вектора, добавим `freeze=False` в `nn.Embedding.from_pretrained()`  



In [39]:
class GRUTagger(nn.Module):
    def __init__(self, embeddings, embeddings_dim, tagset_size, gru_hidden_dim=64, gru_layers_count=1):
        super().__init__()
        self.embed = nn.Embedding.from_pretrained(torch.FloatTensor(embeddings), freeze=False)
        self.embed_dim = embeddings_dim
        self.dropout = nn.Dropout(0.3)
        self.gru = nn.GRU(self.embed_dim, gru_hidden_dim,
                             num_layers=gru_layers_count,
                             bidirectional=True)
        self.output_layer = nn.Linear(gru_hidden_dim * 2,
                                      tagset_size)

    def forward(self, text, len_text):
        out = self.embed(text)
        out = self.dropout(out)
        out = pack_padded_sequence(out, len_text)
        out, _ = self.gru(out)
        out, _ = pad_packed_sequence(out)
        out = self.output_layer(out)
        return out

In [40]:
def remove_predictions_for_masked_items(predicted_labels, correct_labels): 
    predicted_labels_without_mask = []
    correct_labels_without_mask = []
    for p, c in zip(predicted_labels, correct_labels):
        if c > 1:
            predicted_labels_without_mask.append(p)
            correct_labels_without_mask.append(c)
            
    return predicted_labels_without_mask, correct_labels_without_mask

def do_epoch(model, criterion, data, n_classes, optimizer=None, name=None):
    epoch_loss = 0
    epoch_f1 = 0    
    is_train = not optimizer is None
    name = name or ''
    model.train(is_train)
    with torch.autograd.set_grad_enabled(is_train):
        with tqdm(total=len(data)) as progress_bar:
            for batch in data:
                text_len, cur_batch_size = batch.text[0].shape
                predict = model(batch.text[0].to(device), batch.text[1].to(device)).view(cur_batch_size*text_len, n_classes)
                label = batch.labels.view(cur_batch_size*text_len)

                loss = criterion(predict, label)

                epoch_loss += loss.item()

                if optimizer:
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

                _, pred = torch.max(predict, 1)

                predict_label = list(pred.cpu().numpy())
                correct_label = list(label.cpu().numpy())

                predict_label, correct_label = remove_predictions_for_masked_items(predict_label, 
                                                                                   correct_label)
                f1_s = f1_score(predict_label, correct_label, average="micro")

                epoch_loss += loss.item()
                epoch_f1 += f1_s

                progress_bar.update()
                progress_bar.set_description('{:>5s} Loss = {:.5f}, f1 = {:.2%}'.format(
                    name, loss.item(), f1_s)
                )
                
            progress_bar.set_description('{:>5s} Loss = {:.5f}, f1 = {:.2%}'.format(
                name, epoch_loss / len(data), epoch_f1 / len(data))
            )

    return epoch_loss


def fit(model, criterion, optimizer, train_data, n_classes, epochs_count=1, val_data=None):
    for epoch in range(epochs_count):
        name_prefix = '[{} / {}] '.format(epoch + 1, epochs_count)
        train_loss = do_epoch(model, criterion, train_data, n_classes, optimizer, name_prefix + 'Train:')
        
        if val_data:
            val_loss = do_epoch(model, criterion, val_data, n_classes, None, name_prefix + '  Val:')

Будем игнорировать pad, чтобы не обмануть себя и не предсказывать только pad.

In [43]:
n_classes = len(label_field.vocab)

model = GRUTagger(
    embeddings=embeddings,
    embeddings_dim=300,
    tagset_size=n_classes,
    gru_hidden_dim=256,
    ).to(device)

criterion = nn.CrossEntropyLoss(ignore_index=label_field.vocab.stoi[label_field.pad_token]).to(device)
optimizer = optim.Adam(model.parameters())

fit(model, criterion, optimizer, train_data=train_iterator, n_classes=n_classes, 
    epochs_count=9, val_data=valid_iterator)

[1 / 9] Train: Loss = 0.87004, f1 = 85.84%: 100%|██████████| 678/678 [00:25<00:00, 26.22it/s]
[1 / 9]   Val: Loss = 0.42411, f1 = 92.71%: 100%|██████████| 76/76 [00:00<00:00, 88.55it/s]
[2 / 9] Train: Loss = 0.22525, f1 = 96.26%: 100%|██████████| 678/678 [00:25<00:00, 26.16it/s]
[2 / 9]   Val: Loss = 0.40260, f1 = 93.65%: 100%|██████████| 76/76 [00:00<00:00, 92.34it/s]
[3 / 9] Train: Loss = 0.11970, f1 = 98.03%: 100%|██████████| 678/678 [00:26<00:00, 25.91it/s]
[3 / 9]   Val: Loss = 0.42744, f1 = 93.80%: 100%|██████████| 76/76 [00:00<00:00, 90.36it/s]
[4 / 9] Train: Loss = 0.07792, f1 = 98.71%: 100%|██████████| 678/678 [00:26<00:00, 25.96it/s]
[4 / 9]   Val: Loss = 0.41636, f1 = 94.49%: 100%|██████████| 76/76 [00:00<00:00, 93.45it/s]
[5 / 9] Train: Loss = 0.05467, f1 = 99.11%: 100%|██████████| 678/678 [00:26<00:00, 25.91it/s]
[5 / 9]   Val: Loss = 0.51616, f1 = 93.67%: 100%|██████████| 76/76 [00:00<00:00, 91.98it/s]
[6 / 9] Train: Loss = 0.03948, f1 = 99.36%: 100%|██████████| 678/678 [