In [2]:
# для совместимости со вторым питоном
from __future__ import print_function
import io

In [3]:
# Имена файлов с данными.
TRAIN_FILENAME = "data/input/train.csv"
TEST_FILENAME = "data/input/test.csv"

In [4]:
# Считывание файлов.
from collections import namedtuple
WordForm = namedtuple("WordForm", "word pos gram")

def get_sentences(filename, is_train):
    sentences = []
    with io.open(filename, "r", encoding='utf-8') as r:
        # Пропускаем заголовок
        next(r)
        sentence = [] # будем заполнять список предложений
        for line in r:
            # предложения отделены по '\n'
            if len(line.strip()) == 0:
                if len(sentence) == 0:
                    continue
                sentences.append(sentence)
                sentence = []
                continue
            if is_train:
                # Формат: индекс\tномер_в_предложении\tсловоформа\tPOS#Грамемы
                word = line.strip().split("\t")[2]
                pos = line.strip().split("\t")[3].split("#")[0]
                gram = line.strip().split("\t")[3].split("#")[1]
                sentence.append(WordForm(word, pos, gram))
            else:
                word = line.strip().split("\t")[2]
                sentence.append(word)
        if len(sentence) != 0:
            sentences.append(sentence)
    return sentences

In [5]:
train = get_sentences(TRAIN_FILENAME, True)
test = get_sentences(TEST_FILENAME, False)

In [6]:
# Выыедем, что получилось
for wordform in train[0][:10]:
    print(wordform.word, '\t', wordform.pos, '\t', wordform.gram)

А 	 CONJ 	 _
ведь 	 PART 	 _
для 	 ADP 	 _
конкретных 	 ADJ 	 Case=Gen|Degree=Pos|Number=Plur
изделий 	 NOUN 	 Animacy=Inan|Case=Gen|Gender=Neut|Number=Plur
зачастую 	 ADV 	 Degree=Pos
нужен 	 ADJ 	 Degree=Pos|Gender=Masc|Number=Sing|Variant=Brev
монокристалл 	 NOUN 	 Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing
не 	 PART 	 _
только 	 PART 	 _


In [7]:
#запомним все уникальные слова и POS-теги в корпусе
word_set = set()
pos_set = set()
for sent in train:
    for wordform in sent:
        word_set.add(wordform.word.lower())
        pos_set.add(wordform.pos)

In [8]:
for word in list(word_set)[:10]: 
    print(word, end=', ')
print()
print(pos_set)

полюдье, ревнителями, лексусовским, увеличились, сравнялось, накопил, сбивается, бабами, томографы, освобожденные, 
{'NOUN', 'ADV', 'PART', 'SYM', 'PROPN', 'ADJ', 'NUM', 'SCONJ', 'CONJ', 'PRON', 'PUNCT', 'DET', 'AUX', 'X', 'VERB', 'INTJ', 'ADP'}


In [9]:
#Загрузите эмбеддинги c https://nlp.stanford.edu/projects/glove/ или другие, которые вам нравятся и пропишите путь к ним
import numpy as np

word_embeddings_path = 'data\glove.6B.50d.txt'
word2idx = {}
word_embeddings = []
embedding_size = None
#Загружаем эмбеддинги
with io.open(word_embeddings_path, 'r', encoding="utf-8") as f_em:
    for line in f_em:
        split = line.strip().split(" ")
        # Совсем короткие строки пропускаем
        if len(split) <= 2:
            continue
        # Встретив первую подходящую строку, фиксируем размер эмбеддингов
        if embedding_size is None:
            embedding_size = len(split) - 1
            # Также нициализируем эмбеддинги для паддингов и неизвестных слов
            word2idx["PADDING_TOKEN"] = len(word2idx)
            word_embeddings.append(np.zeros(embedding_size))

            word2idx["UNKNOWN_TOKEN"] = len(word2idx)
            word_embeddings.append(np.random.uniform(-0.25, 0.25, embedding_size))
        # После этого все эмбеддинги должны быть одинаковой длины
        if len(split) - 1 != embedding_size:
            continue
            
        #Если слова нет в корпусе, то не будем для него запоминать эмбеддинг        
        if (split[0] not in word_set):
            continue
        
        word_embeddings.append(np.asarray(split[1:], dtype='float32'))
        word2idx[split[0]] = len(word2idx)

word_embeddings = np.array(word_embeddings, dtype='float32')

In [10]:
len(word_set & set(word2idx.keys()))

1948

In [11]:
len(word_set)

98880

In [12]:
word_to_index = {'PAD' : 0, 'UNK' : 1}
for word in word_set:
    word_to_index[word] = len(word_to_index)

In [13]:
len(word_to_index)

98882

In [14]:
pos_to_index = {"PAD": 0}
index_to_pos = {0: "PAD"}
for pos in pos_set:
    pos_to_index[pos] = len(pos_to_index)
    index_to_pos[len(index_to_pos)] = pos

In [15]:
# для полносвязной сетки просто захреначим все в один список
data_X = []
data_Y = []
for sent in train:
    for wordform in sent:
        data_X.append(word_to_index[wordform.word.lower()])
        data_Y.append(pos_to_index[wordform.pos])

In [16]:
X = []
y = []
for sent in train:
    X.append([word_to_index[w.word.lower()] for w in sent])
    y.append([pos_to_index[w.pos] for w in sent])
X = np.array(X)
y = np.array(y)

In [17]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 1000 / len(data_X))

In [18]:
lens = np.array([len(x) for x in X_train])
s_inds = np.argsort(lens)
# sorting X by lengths
X_train = X_train[s_inds]
y_train = y_train[s_inds]

In [50]:
def get_batch(X, y, batch_size):
    n = np.random.randint(len(X) - batch_size)
    inds = np.arange(n, n + batch_size)
    np.random.shuffle(inds)
    X_sent = X[inds]
    y_sent = y[inds]
    max_l = np.max([len(x) for x in X_sent])
    X_batch = np.array([x + [word_to_index["PAD"]] * (max_l - len(x)) for x in X_sent])
    tags = np.array([x + [pos_to_index["PAD"]] * (max_l - len(x)) for x in y_sent])
    
    y_batch = np.zeros((batch_size, X_batch.shape[1], len(pos_to_index)))
    for j, row in enumerate(tags):
        for k, pos in enumerate(row):
            y_batch[j, k, pos] = 1
    return X_batch, y_batch

In [52]:
a, b = get_batch(X_train, y_train, 10)
b.shape

(10, 32, 18)

# Модель

In [69]:
import keras.layers as L
from keras.models import Sequential

In [70]:
from keras import backend as K
 
def ignore_class_accuracy(to_ignore=0):
    def ignore_accuracy(y_true, y_pred):
        y_true_class = K.argmax(y_true, axis=-1)
        y_pred_class = K.argmax(y_pred, axis=-1)
 
        ignore_mask = K.cast(K.not_equal(y_pred_class, to_ignore), 'int32')
        matches = K.cast(K.equal(y_true_class, y_pred_class), 'int32') * ignore_mask
        accuracy = K.sum(matches) / K.maximum(K.sum(ignore_mask), 1)
        return accuracy
    return ignore_accuracy

In [71]:
model = Sequential([
    L.InputLayer(input_shape=(None,)),
    L.Embedding(len(word_to_index), 50, trainable=False),
    L.Bidirectional(L.LSTM(100, return_sequences=True)),
    L.TimeDistributed(L.Dense(len(pos_to_index))),
    L.Activation('softmax')
])

model.compile(optimizer='adam',
              loss='categorical_crossentropy', metrics=['accuracy', ignore_class_accuracy(0)])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_16 (Embedding)     (None, None, 50)          4944100   
_________________________________________________________________
bidirectional_16 (Bidirectio (None, None, 200)         120800    
_________________________________________________________________
time_distributed_16 (TimeDis (None, None, 18)          3618      
_________________________________________________________________
activation_9 (Activation)    (None, None, 18)          0         
Total params: 5,068,518
Trainable params: 124,418
Non-trainable params: 4,944,100
_________________________________________________________________


In [76]:
batch_size = 64

for i in range(200):
    features, targets = get_batch(X, y, batch_size)
    if i % 50 == 0:
        print(f"\nbatch #{i}, sent length {features.shape[1]}")
    else:
        print(end=".")
    model.train_on_batch(features, targets)


batch #0, sent length 51
.................................................
batch #50, sent length 46
.................................................
batch #100, sent length 56
.................................................
batch #150, sent length 50
.................................................

In [77]:
eva_X, eva_y = get_batch(X_test, y_test, len(X_test) - 1)
print(eva_X.shape, eva_y.shape)
len(X_test)

(56, 49) (56, 49, 18)


57

In [78]:
ev = model.evaluate(eva_X, eva_y)



In [79]:
for name, val in zip(model.metrics_names, ev):
    print(name, val)

loss 0.7737555844443185
acc 0.7656705634934562
ignore_accuracy 0.37185587879676113
