In [1]:
!git clone https://github.com/Ryzhtus/morphology-homeworks

Cloning into 'abbyy-nlp-course'...
remote: Enumerating objects: 17, done.[K
remote: Total 17 (delta 0), reused 0 (delta 0), pack-reused 17[K
Unpacking objects: 100% (17/17), done.


In [2]:
!pip install navec

Collecting navec
  Downloading https://files.pythonhosted.org/packages/bc/c1/771ec5565f0ce24874d7fd325b429f9caa80517a40d2e4ce5705120591f3/navec-0.10.0-py3-none-any.whl
Installing collected packages: navec
Successfully installed navec-0.10.0


In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, SequentialSampler, RandomSampler

import numpy as np

from navec import Navec

from tqdm import tqdm
from sklearn.metrics import f1_score

In [4]:
class Sentence():
    def __init__(self):
        self.tokens = []
        self.pos_tags = []
        self.grams = []

In [5]:
def read_dataset(dataset, mode):
    sentences = []
    
    with open(dataset, mode='r', encoding='utf-8') as data:
        # Пропускаем заголовок
        next(data)
        
        sentence = Sentence() # будем заполнять список предложений
        
        for row in data:
            row = row.strip()
            if len(row) != 0:
                row = row.split('\t')

                if mode == 'train':
                    _, _, token, pos_gram = row 
                    pos, gram = pos_gram.split('#')

                else:
                    _, _, token = row
                    pos, gram = '<UNK>', '<UNK>'

                sentence.tokens.append(token)
                sentence.pos_tags.append(pos)
                sentence.grams.append(gram)

            else:
                if len(sentence.tokens) > 0:
                    sentences.append(sentence)
                    sentence = Sentence()
                
        if len(sentence.tokens) > 0:
            sentence.append(sentence)
            
    return sentences

In [6]:
train = read_dataset('/content/morphology-homeworks/pos-tagging/data/train.csv', 'train')
test = read_dataset('/content/morphology-homeworks/pos-tagging/data/test.csv', 'test')

In [7]:
def get_vocabulary_and_indexer(data):
    vocabulary = set()
    vocabulary.add('<PAD>')
    
    token2index = {}
    index2token = {}
    
    token2index['<PAD>'] = 0
    index2token[0] = '<PAD>'
    index = 1
    
    for sentence in data:
        for token in sentence.tokens:
            vocabulary.add(token)
    
    for token in vocabulary:
        if token == '<PAD>':
              pass
        else:
            token2index[token] = index
            index2token[index] = token
            index += 1

    return vocabulary, token2index, index2token

def get_tags_and_indexer(data):
    tags = set()
    tags.add('<PAD>')
    
    tag2index = {}
    tag2index['<PAD>'] = 0
    
    index2tag = {}
    index2tag[0] = '<PAD>'

    index = 1
    for sentence in data:
        for tag in sentence.pos_tags:
            tags.add(tag)

    for tag in tags:
        if tag == '<PAD>':
            pass
        else:
            tag2index[tag] = index
            index2tag[index] = tag 
            index += 1

    return tags, tag2index, index2tag

def get_chars_and_indexer(data):
    chars = set()
    chars.add('<PAD>')
    indexer = {}
    indexer['<PAD>'] = 0
    index = 1
    for sentence in data:
        for token in sentence.tokens:
            for char in token:
                chars.add(char)
                
    for char in chars:     
        if char == '<PAD>':
            pass
        else:   
            indexer[char] = index
            index += 1

    return chars, indexer

In [8]:
train_vocab, train_token_index, train_index_token = get_vocabulary_and_indexer(train)
train_tags, train_tag_index, train_index_tag = get_tags_and_indexer(train)
train_chars, train_char_index = get_chars_and_indexer(train)

test_vocab, test_token_index, test_index_token = get_vocabulary_and_indexer(test)
test_tags, test_tag_index, test_index_tag = get_tags_and_indexer(test)
test_chars, test_char_index = get_chars_and_indexer(test)

In [9]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

class CustomDataset(Dataset):
    def __init__(self, sentences, device, token_indexer, tag_indexer, char_indexer):
        self.sentences = sentences
        self.max_pad_length = 200
        self.device = device
        self.token_indexer = token_indexer
        self.tag_indexer = tag_indexer
        self.char_indexer = char_indexer

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, item):
        return self.get_tensor_from_token(self.sentences[item]), self.get_tensor_from_tag(self.sentences[item]), self.get_tensor_from_char(self.sentences[item])

    def get_tensor_from_token(self, sentence):
        indicies = [self.token_indexer.get(token) for token in sentence.tokens]
        if len(indicies) < self.max_pad_length:
            indicies += [0] * (self.max_pad_length - len(indicies))
        else:
            indicies = indicies[:self.max_pad_length] 

        return torch.LongTensor(indicies)

    def get_tensor_from_tag(self, sentence):
        indicies = [self.tag_indexer.get(tag) for tag in sentence.pos_tags]
        if len(indicies) < self.max_pad_length:
            indicies += [0] * (self.max_pad_length - len(indicies))
        else:
            indicies = indicies[:self.max_pad_length] 

        return torch.LongTensor(indicies)

    def get_tensor_from_char(self, sentence):
        indicies = []
        char_indicies = []
        for token in sentence.tokens:
            for char in token:
                indicies.append(self.char_indexer.get(char))
        
        if len(indicies) < self.max_pad_length:
            indicies += [0] * (self.max_pad_length - len(indicies))
        else:
            indicies = indicies[:self.max_pad_length] 

        return torch.LongTensor(indicies)

In [10]:
dev = train[30000: 40000]
train = train[:30000]

In [11]:
TrainDataset = CustomDataset(train, device, train_token_index, train_tag_index, train_char_index)
TrainDataloader = DataLoader(TrainDataset, 16)

DevDataset = CustomDataset(dev, device, train_token_index, train_tag_index, train_char_index)
DevDataloader = DataLoader(DevDataset, 16)

TestDataset = CustomDataset(test, device, test_token_index, test_tag_index, test_char_index)
TestDataloader = DataLoader(TestDataset, 16)

In [12]:
path = '/content/abbyy-nlp-course/navec_hudlit_v1_12B_500K_300d_100q.tar'
navec = Navec.load(path)

WORD_EMBEDDING_DIM = 300

embeddings = np.zeros((len(train_vocab), WORD_EMBEDDING_DIM))
for idx, word in enumerate(train_vocab):
    word = word.lower()
    if word in navec:
        embeddings[idx] = navec[word]

In [13]:
class DualLSTMTagger(nn.Module):
    def __init__(self, embeddings, word_embedding_dim, word_hidden_dim, char_embedding_dim, char_hidden_dim, word_vocab_size, char_vocab_size, tag_vocab_size):
        super(DualLSTMTagger, self).__init__()

        self.word_embedding = nn.Embedding.from_pretrained(torch.FloatTensor(embeddings), freeze=False)
        
        self.char_embedding = nn.Embedding(char_vocab_size, char_embedding_dim)
        self.char_lstm = nn.LSTM(char_embedding_dim, char_hidden_dim)
        
        self.lstm = nn.LSTM(word_embedding_dim + char_hidden_dim, word_hidden_dim, bidirectional=True)
        self.hidden2tag = nn.Linear(word_hidden_dim * 2, tag_vocab_size)
        
    def forward(self, words, chars):
        word_embeddings = self.word_embedding(words)
        char_embeddings = self.char_embedding(chars)

        char_lstm_out, (char_lstm_hidden, char_cell_hidden) = self.char_lstm(char_embeddings)

        combined = torch.cat((word_embeddings, char_lstm_out), 2)
        
        lstm_out, _ = self.lstm(combined)
        tag_space = self.hidden2tag(lstm_out)
      
        tag_scores = F.log_softmax(tag_space, dim=1)
        
        return tag_scores

In [14]:
def remove_predictions_for_masked_items(predicted_labels, correct_labels): 
    predicted_labels_without_mask = []
    correct_labels_without_mask = []
    predicted_labels_one_array = []
    correct_labels_one_array = []
    for i in range(len(predicted_labels)):
        predicted_labels_one_array += list(predicted_labels[i])
        correct_labels_one_array += list(correct_labels[i])

    for p, c in zip(predicted_labels_one_array, correct_labels_one_array):
        if c > 1:
            predicted_labels_without_mask.append(p)
            correct_labels_without_mask.append(c)
            
    return predicted_labels_without_mask, correct_labels_without_mask

In [15]:
class Trainer:
    def __init__(self, model: nn.Module, train_iterator, dev_iterator, lr=2e-5, device=device):
        self.criterion = nn.CrossEntropyLoss().to(device)
        self.optimizer = torch.optim.Adam(model.parameters(), lr=lr)
        self.train_iterator = train_iterator
        self.dev_iterator = dev_iterator
        self.model = model.to(device)

    def train_epoch(self):
        self.model.train()
        total_loss = 0
        total = 0
        correct = 0
        f1_epoch = 0
        for batch_idx, (tokens, pos_tags, chars) in enumerate(self.train_iterator):
            self.optimizer.zero_grad()
            tokens = tokens.to(device)
            chars = chars.to(device)
            pos_tags = pos_tags.to(device)

            logits = self.model(tokens, chars)
            loss = self.criterion(logits.view(-1, logits.size(-1)), pos_tags.view(-1))
            loss.backward()
            self.optimizer.step()
            total_loss += loss.item()

            mask = (tokens != 0).to(torch.long)
            pred = torch.argmax(logits, dim=-1)
            correct += ((pred == pos_tags)*mask).sum().item()
            total += mask.sum().item()

            predict_label = list(pred.cpu().numpy())
            correct_label = list(pos_tags.cpu().numpy())

            predict_label, correct_label = remove_predictions_for_masked_items(predict_label, correct_label)
            f1_batch = f1_score(predict_label, correct_label, average="micro")
            f1_epoch += f1_batch

            if (batch_idx + 1) % 400 == 0:
                print('Train-Batch-Loss: {:.4f}, Accuracy {:.4f}, F1-Score {:.4f}, Batch: {}/{}'.format(total_loss / (batch_idx + 1), correct / total, 
                                                                                                    f1_batch, batch_idx + 1, len(self.train_iterator)))
        print('-' * 10)
        print('Train-Loss: {:.4f}, Accuracy {:.4f}, F1-Score {:.4f}'.format(total_loss / (batch_idx + 1), correct / total, f1_epoch / len(self.train_iterator)))

    def test_epoch(self):
        with torch.no_grad():
            self.model.eval()
            total_loss = 0
            total = 0
            correct = 0
            f1_epoch = 0
            for batch_idx, (tokens, pos_tags, chars) in enumerate(self.dev_iterator):
                tokens = tokens.to(device)
                chars = chars.to(device)
                pos_tags = pos_tags.to(device)
                logits = self.model(tokens, chars)
                loss = self.criterion(logits.view(-1, logits.size(-1)), pos_tags.view(-1))
                total_loss += loss.item()

                mask = (tokens != 0).to(torch.long)
                pred = torch.argmax(logits, dim=-1)
                correct += ((pred == pos_tags) * mask).sum().item()
                total += mask.sum().item()

                predict_label = list(pred.cpu().numpy())
                correct_label = list(pos_tags.cpu().numpy())

                predict_label, correct_label = remove_predictions_for_masked_items(predict_label, correct_label)
                f1_batch = f1_score(predict_label, correct_label, average="micro")
                f1_epoch += f1_batch

            print('-' * 10)
            print('Eval-Loss: {:.4f}, Accuracy {:.4f}, F1-Score {:.4f}'.format(total_loss / (batch_idx + 1), correct / total, f1_epoch / len(self.dev_iterator)))

In [16]:
WORD_EMBEDDING_DIM = 300
CHAR_EMBEDDING_DIM = 128
WORD_HIDDEN_DIM = 1024
CHAR_HIDDEN_DIM = 1024
EPOCHS = 70

model = DualLSTMTagger(embeddings, WORD_EMBEDDING_DIM, WORD_HIDDEN_DIM, CHAR_EMBEDDING_DIM, CHAR_HIDDEN_DIM, len(train_vocab), len(train_chars), len(train_tags))
trainer = Trainer(model, TrainDataloader, DevDataloader)

In [17]:
for epoch in tqdm(range(10)):
    print()
    trainer.train_epoch()
    trainer.test_epoch()

  0%|          | 0/10 [00:00<?, ?it/s]


Train-Batch-Loss: 0.8850, Accuracy 0.1599, F1-Score 0.2636, Batch: 400/1875
Train-Batch-Loss: 0.5600, Accuracy 0.1861, F1-Score 0.3067, Batch: 800/1875
Train-Batch-Loss: 0.4375, Accuracy 0.2311, F1-Score 0.4458, Batch: 1200/1875
Train-Batch-Loss: 0.3715, Accuracy 0.2759, F1-Score 0.4938, Batch: 1600/1875
----------
Train-Loss: 0.3397, Accuracy 0.3010, F1-Score 0.3374


 10%|█         | 1/10 [05:48<52:19, 348.86s/it]

----------
Eval-Loss: 0.1516, Accuracy 0.4664, F1-Score 0.5223

Train-Batch-Loss: 0.1462, Accuracy 0.4958, F1-Score 0.5969, Batch: 400/1875
Train-Batch-Loss: 0.1384, Accuracy 0.5189, F1-Score 0.5879, Batch: 800/1875
Train-Batch-Loss: 0.1318, Accuracy 0.5371, F1-Score 0.6750, Batch: 1200/1875
Train-Batch-Loss: 0.1262, Accuracy 0.5540, F1-Score 0.7068, Batch: 1600/1875
----------
Train-Loss: 0.1225, Accuracy 0.5639, F1-Score 0.6267


 20%|██        | 2/10 [11:59<47:23, 355.38s/it]

----------
Eval-Loss: 0.0996, Accuracy 0.6341, F1-Score 0.6934

Train-Batch-Loss: 0.0969, Accuracy 0.6477, F1-Score 0.7132, Batch: 400/1875
Train-Batch-Loss: 0.0930, Accuracy 0.6581, F1-Score 0.6645, Batch: 800/1875
Train-Batch-Loss: 0.0899, Accuracy 0.6678, F1-Score 0.7500, Batch: 1200/1875
Train-Batch-Loss: 0.0873, Accuracy 0.6774, F1-Score 0.7747, Batch: 1600/1875
----------
Train-Loss: 0.0856, Accuracy 0.6824, F1-Score 0.7312


 30%|███       | 3/10 [18:08<41:56, 359.55s/it]

----------
Eval-Loss: 0.0753, Accuracy 0.7147, F1-Score 0.7524

Train-Batch-Loss: 0.0738, Accuracy 0.7241, F1-Score 0.7907, Batch: 400/1875
Train-Batch-Loss: 0.0718, Accuracy 0.7276, F1-Score 0.7125, Batch: 800/1875
Train-Batch-Loss: 0.0703, Accuracy 0.7317, F1-Score 0.8000, Batch: 1200/1875
Train-Batch-Loss: 0.0691, Accuracy 0.7357, F1-Score 0.7840, Batch: 1600/1875
----------
Train-Loss: 0.0682, Accuracy 0.7379, F1-Score 0.7708


 40%|████      | 4/10 [24:18<36:15, 362.65s/it]

----------
Eval-Loss: 0.0640, Accuracy 0.7510, F1-Score 0.7763

Train-Batch-Loss: 0.0626, Accuracy 0.7607, F1-Score 0.8256, Batch: 400/1875
Train-Batch-Loss: 0.0615, Accuracy 0.7626, F1-Score 0.7316, Batch: 800/1875
Train-Batch-Loss: 0.0605, Accuracy 0.7647, F1-Score 0.8125, Batch: 1200/1875
Train-Batch-Loss: 0.0599, Accuracy 0.7672, F1-Score 0.7994, Batch: 1600/1875
----------
Train-Loss: 0.0593, Accuracy 0.7683, F1-Score 0.7916


 50%|█████     | 5/10 [30:30<30:26, 365.33s/it]

----------
Eval-Loss: 0.0576, Accuracy 0.7725, F1-Score 0.7918

Train-Batch-Loss: 0.0559, Accuracy 0.7827, F1-Score 0.8372, Batch: 400/1875
Train-Batch-Loss: 0.0551, Accuracy 0.7842, F1-Score 0.7572, Batch: 800/1875
Train-Batch-Loss: 0.0544, Accuracy 0.7857, F1-Score 0.8125, Batch: 1200/1875
Train-Batch-Loss: 0.0539, Accuracy 0.7878, F1-Score 0.8210, Batch: 1600/1875
----------
Train-Loss: 0.0535, Accuracy 0.7886, F1-Score 0.8070


 60%|██████    | 6/10 [36:41<24:28, 367.10s/it]

----------
Eval-Loss: 0.0530, Accuracy 0.7891, F1-Score 0.8050

Train-Batch-Loss: 0.0510, Accuracy 0.8005, F1-Score 0.8450, Batch: 400/1875
Train-Batch-Loss: 0.0503, Accuracy 0.8017, F1-Score 0.7732, Batch: 800/1875
Train-Batch-Loss: 0.0497, Accuracy 0.8031, F1-Score 0.8125, Batch: 1200/1875
Train-Batch-Loss: 0.0493, Accuracy 0.8048, F1-Score 0.8210, Batch: 1600/1875
----------
Train-Loss: 0.0490, Accuracy 0.8056, F1-Score 0.8210


 70%|███████   | 7/10 [42:50<18:23, 367.81s/it]

----------
Eval-Loss: 0.0494, Accuracy 0.8034, F1-Score 0.8167

Train-Batch-Loss: 0.0469, Accuracy 0.8162, F1-Score 0.8527, Batch: 400/1875
Train-Batch-Loss: 0.0464, Accuracy 0.8170, F1-Score 0.7891, Batch: 800/1875
Train-Batch-Loss: 0.0459, Accuracy 0.8182, F1-Score 0.8167, Batch: 1200/1875
Train-Batch-Loss: 0.0456, Accuracy 0.8197, F1-Score 0.8302, Batch: 1600/1875
----------
Train-Loss: 0.0453, Accuracy 0.8205, F1-Score 0.8337


 80%|████████  | 8/10 [49:00<12:16, 368.49s/it]

----------
Eval-Loss: 0.0465, Accuracy 0.8153, F1-Score 0.8271

Train-Batch-Loss: 0.0436, Accuracy 0.8299, F1-Score 0.8721, Batch: 400/1875
Train-Batch-Loss: 0.0431, Accuracy 0.8302, F1-Score 0.8019, Batch: 800/1875
Train-Batch-Loss: 0.0426, Accuracy 0.8311, F1-Score 0.8167, Batch: 1200/1875
Train-Batch-Loss: 0.0424, Accuracy 0.8324, F1-Score 0.8549, Batch: 1600/1875
----------
Train-Loss: 0.0421, Accuracy 0.8330, F1-Score 0.8443


 90%|█████████ | 9/10 [55:14<06:09, 369.96s/it]

----------
Eval-Loss: 0.0441, Accuracy 0.8242, F1-Score 0.8349

Train-Batch-Loss: 0.0407, Accuracy 0.8408, F1-Score 0.8837, Batch: 400/1875
Train-Batch-Loss: 0.0403, Accuracy 0.8411, F1-Score 0.8147, Batch: 800/1875
Train-Batch-Loss: 0.0399, Accuracy 0.8418, F1-Score 0.8333, Batch: 1200/1875
Train-Batch-Loss: 0.0397, Accuracy 0.8430, F1-Score 0.8704, Batch: 1600/1875
----------
Train-Loss: 0.0395, Accuracy 0.8436, F1-Score 0.8536


100%|██████████| 10/10 [1:01:25<00:00, 368.55s/it]

----------
Eval-Loss: 0.0421, Accuracy 0.8318, F1-Score 0.8414



