In [0]:
import pandas as pd
import numpy as np

In [0]:
from collections import namedtuple
# Lightweight record for one token: its surface form and its part-of-speech tag.
WordPos = namedtuple("WordPos", "word pos")


def get_sentences(filename, is_train=True):
    """Read a tab-separated corpus file and group its tokens into sentences.

    The first line of the file is treated as a header and skipped. Blank
    lines separate sentences; consecutive blank lines do not produce empty
    sentences. On every other line the word is taken from the third
    tab-separated column and the tag from the fourth column, keeping only
    the part before the first ``#``.

    Args:
        filename: Path of the UTF-8 encoded file to read.
        is_train: When true, tokens are collected as described above. When
            false, non-blank lines are ignored, so the result is always an
            empty list.

    Returns:
        A list of sentences, each a non-empty list of ``WordPos`` tuples.

    Raises:
        IndexError: If a non-blank line has fewer than four tab-separated
            columns (only when ``is_train`` is true).
    """
    sentences = []
    with open(filename, "r", encoding='utf-8') as r:
        # Skip the header row; the default keeps an empty file from raising
        # StopIteration.
        next(r, None)
        sentence = []
        for line in r:
            stripped = line.strip()
            if not stripped:
                # Sentence boundary: flush only if something was collected.
                if sentence:
                    sentences.append(sentence)
                    sentence = []
                continue
            if is_train:
                # Split once and reuse the columns for both word and tag.
                columns = stripped.split("\t")
                word = columns[2]
                pos = columns[3].split("#")[0]
                sentence.append(WordPos(word, pos))
        # The file may end without a trailing blank line.
        if sentence:
            sentences.append(sentence)
    return sentences

# Parse train.csv into a list of sentences, each a list of WordPos(word, pos) tuples.
all_sentences = get_sentences('train.csv')

In [0]:
# Convert every parsed sentence into a (words, tags) pair and, along the way,
# assign each distinct lower-cased word and each distinct tag an integer id
# in order of first appearance.
data = []

word_to_ix = {}
tag_to_ix = {}

for sentence_tokens in all_sentences:
    # Position 0 of a token is the word, position 1 is the tag.
    sent_word = [token[0].lower() for token in sentence_tokens]
    sent_tag = [token[1] for token in sentence_tokens]
    for word, tag in zip(sent_word, sent_tag):
        # setdefault only inserts unseen keys, so ids stay dense: 0..n-1.
        word_to_ix.setdefault(word, len(word_to_ix))
        tag_to_ix.setdefault(tag, len(tag_to_ix))
    data.append((sent_word, sent_tag))

# Every insertion handed out the next free id, so the counts equal the map sizes.
num_word = len(word_to_ix)
num_tag = len(tag_to_ix)

In [65]:
size = len(data)
# Contiguous 90% / 5% / 5% split in the original sentence order (no shuffling).
train_end = int(size * 0.9)
valid_end = int(size * 0.95)
train_data = data[:train_end]
valid_data = data[train_end:valid_end]
test_data = data[valid_end:]
print([len(part) for part in (train_data, valid_data, test_data)])

[43353, 2409, 2409]


In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchtext.data import Field, BucketIterator
import torchtext
import random

# Fixed seed applied below to Python's `random` module and to PyTorch.
# NumPy's generator is not seeded in this cell.
SEED = 1234

random.seed(SEED)
torch.manual_seed(SEED)
# NOTE(review): presumably set so cuDNN uses deterministic kernels on GPU —
# confirm against the PyTorch reproducibility notes.
torch.backends.cudnn.deterministic = True

In [0]:
# One Field for the word sequences (SRC) and one for the tag sequences (TAG),
# both created with the default Field() arguments.
SRC = Field()
TAG = Field()
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# my_data = torchtext.data.Dataset(data, fields=(('src', SRC), ('tag', TAG)))
# train, test, valid = torchtext.data.Iterator.splits((train_data, valid_data, test_data), batch_sizes=(32, 32, 32),
#                                                     sort_key=lambda x: len(x.SRC), device=device)

In [0]:
# Separate the training pairs into parallel lists of word sequences and tag sequences.
text = [words for words, _ in train_data]
tags = [labels for _, labels in train_data]

# Vocabularies are built from the training split only; min_freq=10 is passed
# for words, while tags use the default frequency threshold.
SRC.build_vocab(text, min_freq=10)
TAG.build_vocab(tags)

In [69]:
# Peek at the first ten entries of the word vocabulary and its total size.
SRC.vocab.itos[:10], len(SRC.vocab.itos)

(['<unk>', '<pad>', ',', '.', 'в', 'и', '"', '-', 'на', 'не'], 8061)

In [70]:
# Show the complete tag vocabulary and its size.
TAG.vocab.itos, len(TAG.vocab.itos)

(['<unk>',
  '<pad>',
  'NOUN',
  'PUNCT',
  'VERB',
  'ADJ',
  'ADP',
  'ADV',
  'PROPN',
  'PRON',
  'CONJ',
  'PART',
  'DET',
  'SCONJ',
  'NUM',
  'AUX',
  'X',
  'INTJ',
  'SYM'],
 19)

In [0]:
class PosTagger(nn.Module):
    """LSTM tagger that scores every token of a single sentence.

    Token ids are embedded, passed through one ``nn.LSTM`` and projected to
    one score per tag; the scores are returned as log-probabilities.

    Args:
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the LSTM hidden state.
        vocab_size: Number of rows in the embedding table.
        tagset_size: Number of output tags.
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.embeddings = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.rnn = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim)
        self.tag = nn.Linear(in_features=hidden_dim, out_features=tagset_size)

    def forward(self, sent):
        """Return per-token log-probabilities over tags.

        Args:
            sent: Tensor of token ids for one sentence; its first dimension
                is used as the sequence length.

        Returns:
            Tensor with one row per token and one column per tag, normalised
            with ``log_softmax`` along the tag dimension.
        """
        seq_len = len(sent)
        # The LSTM is fed (seq_len, batch, features); the sentence is a batch of one.
        lstm_input = self.embeddings(sent).view(seq_len, 1, -1)
        hidden_states, _ = self.rnn(lstm_input)
        # Drop the batch dimension again before the per-token projection.
        logits = self.tag(hidden_states.view(seq_len, -1))
        return F.log_softmax(logits, dim=1)

In [0]:
def create_vector(field, sent):
    """Encode a token sequence as a 1-D tensor of vocabulary ids.

    Args:
        field: Object whose ``vocab.stoi`` mapping is indexed by token.
        sent: Iterable of tokens to look up.

    Returns:
        A ``torch.long`` tensor placed on the notebook-level ``device``,
        holding one id per token in the order given.
    """
    token_to_id = field.vocab.stoi
    ids = [token_to_id[token] for token in sent]
    return torch.tensor(ids, dtype=torch.long, device=device)

In [0]:
from tqdm import tqdm_notebook

# Model hyper-parameters.
EMBEDDING_DIM = 32
HIDDEN_DIM = 32
# Input and output sizes follow the vocabularies held by the two fields.
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TAG.vocab)

model = PosTagger(
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    vocab_size=INPUT_DIM,
    tagset_size=OUTPUT_DIM,
).to(device)
# NLLLoss is paired with the log_softmax output of PosTagger.forward.
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

In [0]:
def accuracy_score(test_data, model):
    """Compute token-level tagging accuracy as a percentage.

    Each sentence is encoded with the notebook-level ``SRC`` / ``TAG`` fields
    via ``create_vector``, the highest-scoring tag is taken for every token,
    and the predictions are compared with the encoded gold tags.

    Args:
        test_data: Iterable of ``(words, tags)`` pairs.
        model: Callable mapping a tensor of word ids to per-token tag scores.

    Returns:
        Percentage of correctly tagged tokens rounded to three decimals, or
        ``0.0`` when ``test_data`` yields no tokens at all (previously this
        case raised ``ZeroDivisionError``).
    """
    true_pred = 0
    all_pred = 0

    # Gradients are never needed for evaluation, so disable them once for the
    # whole pass instead of per sentence.
    with torch.no_grad():
        for words, tags in test_data:
            inputs = create_vector(SRC, words)
            gold_tags = create_vector(TAG, tags)
            tag_scores = model(inputs)
            predict_tags = torch.max(tag_scores, dim=1)[1]

            # Compare on the tensors' own device; no numpy round-trip needed.
            true_pred += int((predict_tags == gold_tags).sum().item())
            all_pred += len(words)

    # Guard against an empty evaluation set.
    if all_pred == 0:
        return 0.0
    return round(true_pred / all_pred * 100, 3)


In [75]:
# Number of (words, tags) sentence pairs in the validation split.
len(valid_data)

2409

In [82]:
# Online training: one optimizer step per sentence, three passes over train_data.
for epoch in range(3):
    for i, (words, gold_tags) in enumerate(tqdm_notebook(train_data)):
        # Clear gradients left over from the previous sentence.
        model.zero_grad()

        sentence = create_vector(SRC, words)
        targets = create_vector(TAG, gold_tags)

        tag_scores = model(sentence)
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()

        # Report validation accuracy every 10,000 sentences, starting at i == 0.
        if i % 10000 == 0:
            print(f'Num of epoch={epoch}, Valid Score={accuracy_score(valid_data, model)}')

HBox(children=(IntProgress(value=0, max=43353), HTML(value='')))

Num of epoch=0, Valid Score=7.785
Num of epoch=0, Valid Score=68.486
Num of epoch=0, Valid Score=72.032
Num of epoch=0, Valid Score=74.713
Num of epoch=0, Valid Score=76.319


HBox(children=(IntProgress(value=0, max=43353), HTML(value='')))

Num of epoch=1, Valid Score=77.133
Num of epoch=1, Valid Score=78.24
Num of epoch=1, Valid Score=79.623
Num of epoch=1, Valid Score=80.39
Num of epoch=1, Valid Score=81.215


HBox(children=(IntProgress(value=0, max=43353), HTML(value='')))

Num of epoch=2, Valid Score=81.678
Num of epoch=2, Valid Score=82.133
Num of epoch=2, Valid Score=82.743
Num of epoch=2, Valid Score=83.155
Num of epoch=2, Valid Score=83.696


In [83]:
# Token-level accuracy (in percent) of the trained model on the test split.
accuracy_score(test_data, model)

84.328