В соревновании на Kaggle эта модель получила score 0.74.

In [1]:
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


if torch.cuda.is_available():
    from torch.cuda import FloatTensor, LongTensor
else:
    from torch import FloatTensor, LongTensor

# Seed every RNG the notebook relies on: NumPy drives batch shuffling,
# PyTorch drives the random initialisation of the model weights.
np.random.seed(42)
torch.manual_seed(42)

In [2]:
# Relative paths of the tab-separated train / test files loaded with get_sentences.
TRAIN_FILENAME = "data/train.csv"
TEST_FILENAME = "data/test.csv"

In [3]:
from collections import namedtuple

# One annotated token: surface form, part-of-speech tag and the
# "|"-separated grammeme string taken from the "POS#grammemes" column.
WordForm = namedtuple("WordForm", "word pos gram")


def get_sentences(filename, is_train):
    """Read a tab-separated corpus file and split it into sentences.

    The first line is skipped as a header and blank lines separate sentences.
    The word is the third tab-separated column; when ``is_train`` is true the
    fourth column is split on ``#`` into the POS tag and the grammeme string.

    Args:
        filename: path of the UTF-8 encoded file to read.
        is_train: if true, tokens are ``WordForm`` tuples; otherwise they are
            plain word strings.

    Returns:
        A list of sentences, each a non-empty list of tokens.
    """
    sentences = []
    with open(filename, "r", encoding='utf-8') as r:
        # Skip the header; the default avoids StopIteration on an empty file.
        next(r, None)
        sentence = []
        for line in r:
            stripped = line.strip()
            if not stripped:
                # Blank line: close the current sentence (if any).
                if sentence:
                    sentences.append(sentence)
                    sentence = []
                continue
            # Split once instead of re-splitting the line for every column.
            columns = stripped.split("\t")
            word = columns[2]
            if is_train:
                tag_parts = columns[3].split("#")
                sentence.append(WordForm(word, tag_parts[0], tag_parts[1]))
            else:
                sentence.append(word)
        # The file may not end with a blank line.
        if sentence:
            sentences.append(sentence)
    return sentences

In [151]:
# Train sentences hold WordForm tuples; test sentences hold bare word strings.
train = get_sentences(TRAIN_FILENAME, True)
test = get_sentences(TEST_FILENAME, False)

In [152]:
# Number of test sentences.
print(len(test))

12380


In [5]:
import nltk
# sklearn.cross_validation was removed in scikit-learn 0.20;
# train_test_split is provided by sklearn.model_selection.
from sklearn.model_selection import train_test_split



Пример размеченного предложения:

In [6]:
# Show the first training sentence: every word next to its grammeme string.
for form in train[0]:
    print('{:15}\t{}'.format(form.word, form.gram))

А              	_
ведь           	_
для            	_
конкретных     	Case=Gen|Degree=Pos|Number=Plur
изделий        	Animacy=Inan|Case=Gen|Gender=Neut|Number=Plur
зачастую       	Degree=Pos
нужен          	Degree=Pos|Gender=Masc|Number=Sing|Variant=Brev
монокристалл   	Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing
не             	_
только         	_
крупный        	Case=Nom|Degree=Pos|Gender=Masc|Number=Sing
,              	_
но             	_
и              	_
заданной       	Aspect=Perf|Case=Gen|Gender=Fem|Number=Sing|Tense=Past|VerbForm=Part|Voice=Pass
формы          	Animacy=Inan|Case=Gen|Gender=Fem|Number=Sing
,              	_
например       	Degree=Pos
"              	_
стакан         	Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing
"              	_
,              	_
"              	_
тройник        	Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing
"              	_
(              	_
элемент        	Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing
трубопровода   	Animacy=Inan|Case=Gen

In [7]:
# Collect the vocabulary, the POS tag set and, for every grammatical
# category, the set of values seen in the training data.
grammems = dict()
pos_tags = set()
words = set()
for sentence in train:
    for word, pos, gram in sentence:
        words.add(word)
        pos_tags.add(pos)
        for pair in gram.split('|'):
            # '_' marks a token without grammemes.
            if pair == '_':
                continue
            parts = pair.split('=')
            grammems.setdefault(parts[0], set()).add(parts[1])

# Index 0 is reserved for '<pad>'. Sets of strings iterate in a hash-dependent
# order that changes between interpreter runs, so sort before enumerating to
# get the same indices on every run.
word2ind = {word: ind + 1 for ind, word in enumerate(sorted(words))}
word2ind['<pad>'] = 0

tag2ind = {tag: ind + 1 for ind, tag in enumerate(sorted(pos_tags))}
tag2ind['<pad>'] = 0

In [8]:
# Give every (category, value) grammeme pair its own index; index 0 is the
# ('_', '') entry that stands for "no grammemes".
i = 1
gram2ind = {}
gram2ind[('_','')] = 0
# Sort categories and values: the values are stored in sets, whose iteration
# order for strings is not stable across interpreter runs.
for gram in sorted(grammems):
    for value in sorted(grammems[gram]):
        gram2ind[(gram, value)] = i
        i += 1

# Number of entries in gram2ind, i.e. the length of a grammeme vector.
gram_space = i
gram_space

36

In [81]:
def convert_data(data, word2ind, tag2ind, gram2ind):
    """Turn annotated sentences into index sequences and multi-hot grammeme vectors.

    Args:
        data: list of sentences, each a list of ``(word, pos, gram)`` triples
            where ``gram`` is a ``|``-separated string of ``key=value`` pairs.
        word2ind: word -> index; words missing from it map to 0.
        tag2ind: POS tag -> index; every tag in ``data`` must be a key.
        gram2ind: ``(key, value)`` -> position in the grammeme vector.

    Returns:
        ``(X, y, gramms)`` where ``X`` and ``y`` are lists of lists of ints and
        ``gramms`` is a 1-D object array holding, per sentence, a float array
        of shape ``(len(sentence), len(gram2ind))``.
    """
    X = [[word2ind.get(word, 0) for word, _, _ in sample] for sample in data]
    y = [[tag2ind[tag] for _, tag, _ in sample] for sample in data]

    gram_size = len(gram2ind)
    # Sentences differ in length, so np.array() over the nested list would be
    # ragged (an error in NumPy >= 1.24). Fill an explicit object array instead.
    gramms = np.empty(len(data), dtype=object)
    for sample_ind, sample in enumerate(data):
        vectors = np.zeros((len(sample), gram_size))
        for word_ind, (_, _, grams) in enumerate(sample):
            for gram in grams.split('|'):
                parts = gram.split('=')
                # A pair that is not a key of gram2ind (including the bare '_'
                # marker, which splits to ('_', '_')) falls back to position 0.
                vectors[word_ind, gram2ind.get((parts[0], parts[-1]), 0)] = 1
        gramms[sample_ind] = vectors
    return X, y, gramms

# Encode the whole training set: word indices, tag indices and grammeme vectors.
X_train, y_train, g_train = convert_data(train, word2ind, tag2ind, gram2ind)

In [82]:
def iterate_batches(data, batch_size):
    """Yield shuffled, zero-padded mini-batches laid out as (position, sentence).

    ``data`` is an ``(X, y, gramms)`` triple. Each yielded
    ``(X_batch, y_batch, g_batch)`` has shapes ``(max_len, batch)``,
    ``(max_len, batch)`` and ``(max_len, batch, gram_size)``, where ``max_len``
    is the longest sentence of that batch; cells past a sentence's end stay 0.
    The order is drawn from NumPy's global random state.
    """
    X, y, gramms = data
    total = len(X)

    order = np.arange(total)
    np.random.shuffle(order)

    for first in range(0, total, batch_size):
        # Slicing past the end is clipped, so the last batch may be smaller.
        chosen = order[first:first + batch_size]
        width = len(chosen)
        height = max(len(X[k]) for k in chosen)

        X_batch = np.zeros((height, width))
        y_batch = np.zeros((height, width))
        g_batch = np.zeros((height, width, len(gramms[0][0])))
        for column, k in enumerate(chosen):
            X_batch[:len(X[k]), column] = X[k]
            tag_count = len(y[k])
            y_batch[:tag_count, column] = y[k]
            g_batch[:tag_count, column] = gramms[k]
        yield X_batch, y_batch, g_batch

In [83]:
# Pull one shuffled batch of 4 sentences to sanity-check the batching code.
X_batch, y_batch, g_batch = next(iterate_batches((X_train, y_train, g_train), 4))

# Display the grammeme array of that batch.
g_batch

array([[[1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 1., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       ...,

       [[0., 1., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 1., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[1., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]])

In [159]:
class LSTMTagger(nn.Module):
    """Bidirectional LSTM tagger with a POS head and a grammeme head.

    The POS head is a linear layer over the LSTM output. The grammeme head
    receives the POS scores concatenated with the LSTM output and ends in a
    sigmoid, so it returns one value in (0, 1) per grammeme.
    """

    def __init__(self, vocab_size, tagset_size, gramm_size, word_emb_dim=100, lstm_hidden_dim=128, lstm_layers_count=1):
        super().__init__()

        # A bidirectional nn.LSTM emits 2 * hidden_size features per step
        # regardless of num_layers, so the heads must not scale their input
        # size with lstm_layers_count (doing so breaks for more than 1 layer).
        rnn_out_dim = lstm_hidden_dim * 2

        self._embs = nn.Embedding(vocab_size, word_emb_dim)
        self._rnn = nn.LSTM(word_emb_dim, lstm_hidden_dim, num_layers=lstm_layers_count, bidirectional=True)
        self._tags = nn.Linear(rnn_out_dim, tagset_size)
        self._gramms = nn.Sequential(
            nn.Linear(tagset_size + rnn_out_dim, gramm_size),
            nn.Sigmoid()
        )

    def forward(self, inputs):
        """Return ``(pos_scores, gram_probs)`` for a tensor of word indices.

        ``pos_scores`` are the raw outputs of the POS linear layer;
        ``gram_probs`` are the sigmoid outputs of the grammeme head. Both keep
        the leading dimensions of ``inputs``.
        """
        emb = self._embs(inputs)

        output, _ = self._rnn(emb)
        pos = self._tags(output)
        # Feed the POS scores to the grammeme head alongside the LSTM features.
        output = torch.cat((pos, output), -1)
        return pos, self._gramms(output)

In [85]:
# Display the grammeme array of the sample batch again.
g_batch

array([[[1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 1., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       ...,

       [[0., 1., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 1., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[1., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]])

In [86]:
# Smoke test: run an untrained model on the sample batch and report accuracy.
model = LSTMTagger(
    vocab_size=len(word2ind),
    tagset_size=len(tag2ind),
    gramm_size=len(gram2ind)
)
# The tensor classes imported at the top fall back to CPU when CUDA is
# missing; keep the model on the same device instead of always calling .cuda().
if torch.cuda.is_available():
    model = model.cuda()

X_batch, y_batch, g_batch = LongTensor(X_batch), LongTensor(y_batch), FloatTensor(g_batch)

logits, gr_logits = model(X_batch)

# POS accuracy over non-padding positions only (label 0 is padding).
pos_preds = torch.argmax(logits, dim=-1)
mask = (y_batch != 0).float()
pos_correct_count = ((pos_preds == y_batch).float() * mask).sum()
pos_total_score = (pos_correct_count / mask.sum()).item()
print(pos_total_score)

# Grammeme accuracy over every cell of the multi-hot tensor, padding included.
gr_preds = (gr_logits > 0.5).float()
gr_correct_count = ((gr_preds == g_batch).float()).sum()
gr_total_score = (gr_correct_count / gr_preds.numel()).item()
print(gr_total_score)



0.01666666753590107
0.567375898361206


In [87]:
# Cross-entropy over all flattened (position, sentence) cells of the sample batch.
# NOTE(review): no ignore_index is set, so padding cells (label 0) also
# contribute to this loss — confirm that is intended.
pos_criterion = nn.CrossEntropyLoss()
pos_criterion(logits.view(-1, logits.shape[-1]), y_batch.view(-1))

tensor(2.9223, device='cuda:0', grad_fn=<NllLossBackward>)

In [88]:
# Mean squared error between the sigmoid outputs and the multi-hot targets.
# NOTE(review): binary cross-entropy is the more common loss for multi-label
# targets — consider whether MSE is a deliberate choice.
gr_criterion = nn.MSELoss()
gr_criterion(gr_logits, g_batch)

tensor(0.2482, device='cuda:0', grad_fn=<MseLossBackward>)

In [113]:
import math
from tqdm import tqdm


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float('nan')


def do_epoch(model, criterion, data, batch_size, optimizer=None, name=None):
    """Run one pass over ``data``; train when an optimizer is given.

    Args:
        model: module returning ``(pos_scores, gram_scores)`` for a batch.
        criterion: ``(pos_criterion, gr_criterion)`` pair of loss callables.
        data: ``(X, y, gramms)`` triple as accepted by ``iterate_batches``.
        batch_size: number of sentences per batch.
        optimizer: when given, the weights are updated after every batch;
            when ``None`` the pass runs in eval mode with gradients disabled.
        name: prefix shown in the progress-bar description.

    Returns:
        ``(mean grammeme loss per batch, POS accuracy over non-padding
        tokens)`` as Python floats (NaN when there is nothing to average).
    """
    epoch_loss = 0
    pos_correct_count = 0
    pos_sum_count = 0
    gr_correct_count = 0
    gr_sum_count = 0

    pos_criterion, gr_criterion = criterion
    is_train = optimizer is not None
    name = name or ''
    model.train(is_train)

    batches_count = math.ceil(len(data[0]) / batch_size)

    with torch.autograd.set_grad_enabled(is_train):
        with tqdm(total=batches_count) as progress_bar:
            for X_batch, y_batch, g_batch in iterate_batches(data, batch_size):
                X_batch, y_batch, g_batch = LongTensor(X_batch), LongTensor(y_batch), FloatTensor(g_batch)

                logits, gr_logits = model(X_batch)

                pos_loss = pos_criterion(logits.view(-1, logits.shape[-1]), y_batch.view(-1))
                gr_loss = gr_criterion(gr_logits, g_batch)
                # NOTE(review): the epoch summary averages the grammeme loss
                # only, while the per-batch description shows the POS loss.
                epoch_loss += gr_loss.item()

                if is_train:
                    optimizer.zero_grad()
                    # One backward pass over the summed loss accumulates the
                    # same gradients as two passes with retain_graph=True.
                    (pos_loss + gr_loss).backward()
                    optimizer.step()

                # POS accuracy counts non-padding positions only (label 0 is padding).
                pos_preds = torch.argmax(logits, dim=-1)
                mask = (y_batch != 0).float()
                # .item() keeps the running totals as plain Python numbers
                # instead of device tensors.
                cur_pos_correct_count = ((pos_preds == y_batch).float() * mask).sum().item()
                cur_pos_sum = mask.sum().item()

                # Grammeme accuracy skips cells where prediction and target are
                # both 0, so the many true negatives do not inflate the score.
                gr_preds = (gr_logits > 0.5).float()
                mask = (gr_preds != 0) | (g_batch != 0)
                cur_gr_correct_count = ((gr_preds == g_batch) & mask).float().sum().item()
                cur_gr_sum = mask.sum().item()

                pos_correct_count += cur_pos_correct_count
                pos_sum_count += cur_pos_sum
                gr_correct_count += cur_gr_correct_count
                gr_sum_count += cur_gr_sum

                progress_bar.update()
                progress_bar.set_description('{:>5s} Loss = {:.5f}, PosAccuracy = {:.2%}, GrAccuracy = {:.2%}'.format(
                    name, pos_loss.item(),
                    _safe_ratio(cur_pos_correct_count, cur_pos_sum),
                    _safe_ratio(cur_gr_correct_count, cur_gr_sum))
                )

            progress_bar.set_description('{:>5s} Loss = {:.5f}, PosAccuracy = {:.2%}, GrAccuracy = {:.2%}'.format(
                name, _safe_ratio(epoch_loss, batches_count),
                _safe_ratio(pos_correct_count, pos_sum_count),
                _safe_ratio(gr_correct_count, gr_sum_count))
            )

    return _safe_ratio(epoch_loss, batches_count), _safe_ratio(pos_correct_count, pos_sum_count)


def fit(model, criterion, optimizer, train_data, epochs_count=1, batch_size=32,
        val_data=None, val_batch_size=None):
    """Train ``model`` for ``epochs_count`` epochs via ``do_epoch``.

    When ``val_data`` is given, an evaluation pass (no optimizer) follows every
    training epoch; ``val_batch_size`` defaults to ``batch_size``. Returns None.
    """
    has_val = val_data is not None
    if has_val and val_batch_size is None:
        val_batch_size = batch_size

    for epoch_number in range(1, epochs_count + 1):
        name_prefix = '[{} / {}] '.format(epoch_number, epochs_count)
        train_loss, train_acc = do_epoch(
            model, criterion, train_data, batch_size, optimizer, name_prefix + 'Train:')

        if has_val:
            val_loss, val_acc = do_epoch(
                model, criterion, val_data, val_batch_size, None, name_prefix + '  Val:')

In [160]:
# Build a fresh model and train it on the full training set.
model = LSTMTagger(
    vocab_size=len(word2ind),
    tagset_size=len(tag2ind),
    gramm_size=len(gram2ind)
)

pos_criterion = nn.CrossEntropyLoss()
gr_criterion = nn.MSELoss()
# Move to the GPU only when one exists, matching the CPU fallback used for the
# tensor classes imported at the top of the notebook.
if torch.cuda.is_available():
    model = model.cuda()
    pos_criterion = pos_criterion.cuda()
# Create the optimizer after the model has been placed on its final device.
optimizer = optim.Adam(model.parameters())

# val_data=None: no validation pass is run, so val_batch_size has no effect here.
fit(model, (pos_criterion, gr_criterion), optimizer, train_data=(X_train, y_train, g_train), epochs_count=50,
    batch_size=64, val_data=None, val_batch_size=512)

[1 / 50] Train: Loss = 0.02222, PosAccuracy = 67.44%, GrAccuracy = 21.08%: 100%|█████| 753/753 [00:23<00:00, 32.66it/s]
[2 / 50] Train: Loss = 0.01485, PosAccuracy = 85.99%, GrAccuracy = 36.28%: 100%|█████| 753/753 [00:22<00:00, 33.60it/s]
[3 / 50] Train: Loss = 0.01344, PosAccuracy = 91.28%, GrAccuracy = 42.14%: 100%|█████| 753/753 [00:22<00:00, 33.48it/s]
[4 / 50] Train: Loss = 0.01248, PosAccuracy = 94.22%, GrAccuracy = 46.12%: 100%|█████| 753/753 [00:22<00:00, 33.48it/s]
[5 / 50] Train: Loss = 0.01180, PosAccuracy = 96.19%, GrAccuracy = 49.21%: 100%|█████| 753/753 [00:21<00:00, 34.64it/s]
[6 / 50] Train: Loss = 0.01121, PosAccuracy = 97.53%, GrAccuracy = 51.73%: 100%|█████| 753/753 [00:22<00:00, 32.75it/s]
[7 / 50] Train: Loss = 0.01078, PosAccuracy = 98.45%, GrAccuracy = 53.87%: 100%|█████| 753/753 [00:22<00:00, 33.04it/s]
[8 / 50] Train: Loss = 0.01032, PosAccuracy = 99.03%, GrAccuracy = 55.91%: 100%|█████| 753/753 [00:22<00:00, 32.95it/s]
[9 / 50] Train: Loss = 0.00979, PosAccur

In [153]:
def convert_test_data(data, word2ind):
    """Encode test sentences as one zero-padded ``(max_len, n_sentences)`` array.

    Column ``i`` holds the word indices of ``data[i]``; words missing from
    ``word2ind`` become 0, the same value used for padding.
    """
    longest = max(len(sentence) for sentence in data)
    X = np.zeros((longest, len(data)))
    for column, sentence in enumerate(data):
        X[:len(sentence), column] = [word2ind.get(token, 0) for token in sentence]
    return X
    
# Encode the whole test set as a single padded array.
X_test = convert_test_data(test, word2ind)

In [154]:
# (longest test sentence, number of test sentences)
X_test.shape

(138, 12380)

In [155]:
# Inverse lookups for decoding predictions; index 0 is rendered as '_'.
ind2tag = {ind: tag for tag, ind in tag2ind.items()}
ind2tag[0] = '_'
ind2gram = {ind: pair for pair, ind in gram2ind.items()}
ind2gram[0] = ('_','')

In [162]:
def get_tag(pos, gr, ind2tag, ind2gram):
    """Format one prediction as ``POS#grammemes``.

    Args:
        pos: predicted POS index.
        gr: sequence of per-grammeme flags; non-zero means "present".
        ind2tag: POS index -> tag string.
        ind2gram: grammeme position -> ``(key, value)`` pair.

    Returns:
        ``'_'`` when the POS tag itself is ``'_'``; otherwise the tag, ``#``
        and the sorted ``key=value`` pairs joined by ``|`` (``_`` if none).
    """
    tag_name = ind2tag[pos]
    if tag_name == '_':
        return '_'

    active = []
    for position, flag in enumerate(gr):
        if flag == 0:
            continue
        category, value = ind2gram[position][0], ind2gram[position][1]
        # An active '_' entry stops the scan of the remaining positions.
        if category == '_':
            break
        active.append(category + '=' + value)

    features = '|'.join(sorted(active)) if active else '_'
    return tag_name + '#' + features

In [163]:
# Write one "Id,Prediction" row per test token, running the model on
# fixed-size groups of sentences.
# NOTE(review): X_test is padded to the longest test sentence, whereas training
# batches are padded to their own longest sentence — confirm the extra padding
# does not hurt the bidirectional LSTM's predictions.
sentences_per_pass = 50

# Switch to eval mode once, and disable autograd so no graph is built.
model.eval()
with open("submission.csv", "w") as f, torch.no_grad():
    f.write("Id,Prediction\n")
    index = 0
    size = X_test.shape[1]
    for offset in range(0, size, sentences_per_pass):
        part = min(sentences_per_pass, size - offset)

        test_input = LongTensor(X_test[:, offset:offset + part])
        logits, gr_logits = model(test_input)

        # Move the predictions to host memory once per pass rather than once per token.
        pos_preds = torch.argmax(logits, dim=-1).cpu().numpy()
        gr_preds = (gr_logits > 0.5).float().cpu().numpy()
        for i in range(part):
            # Only the real tokens of each sentence are written; padding is skipped.
            for j in range(len(test[offset + i])):
                f.write("%d,%s\n" % (index, get_tag(pos_preds[j, i].item(), gr_preds[j, i], ind2tag, ind2gram)))
                index += 1