In [1]:
# Names of the data files.
# Despite the .csv extension, get_sentences() parses both files as
# tab-separated columns with a blank line between sentences.
TRAIN_FILENAME = "train.csv"
TEST_FILENAME = "test.csv"

In [2]:
# Считывание файлов.
from collections import namedtuple
WordForm = namedtuple("WordForm", "word pos gram")


def get_sentences(filename, is_train):
    """Read a tab-separated corpus file and group its tokens into sentences.

    Blank (whitespace-only) lines separate sentences; consecutive blank lines
    never produce an empty sentence. The word is taken from the third
    tab-separated column. When ``is_train`` is true the fourth column is
    expected to look like ``POS#GRAM`` and each token becomes a ``WordForm``;
    otherwise each token is the bare word string.

    :param filename: path of the UTF-8 encoded file to read.
    :param is_train: whether the file carries gold tags in its fourth column.
    :return: list of sentences, each a list of ``WordForm`` or ``str``.
    :raises IndexError: if a non-blank line lacks the expected columns or the
        tag column has no ``#`` separator.
    """
    collected = []
    current = []
    with open(filename, "r", encoding='utf-8') as source:
        for raw_line in source:
            stripped = raw_line.strip()
            if not stripped:
                # Sentence boundary: flush only if something was gathered.
                if current:
                    collected.append(current)
                    current = []
                continue
            fields = stripped.split("\t")
            token = fields[2]
            if is_train:
                tag_parts = fields[3].split("#")
                current.append(WordForm(token, tag_parts[0], tag_parts[1]))
            else:
                current.append(token)
    # The file may end without a trailing blank line.
    if current:
        collected.append(current)
    return collected

In [3]:
# Parse both corpora into lists of sentences: training tokens are WordForm
# tuples (word, pos, gram), test tokens are plain word strings.
train = get_sentences(TRAIN_FILENAME, True)
test = get_sentences(TEST_FILENAME, False)

In [4]:
# Класс для удобной векторизации грамматических значений.
import jsonpickle
import os
from collections import defaultdict
from typing import Dict, List, Set

def process_gram_tag(gram: str):
    """Normalize a grammeme string by ordering its ``|``-separated parts.

    Surrounding whitespace is removed, the parts are sorted lexicographically
    and joined back with ``|``, so tags that differ only in grammeme order
    map to the same string.
    """
    parts = gram.strip().split("|")
    parts.sort()
    return "|".join(parts)


def get_empty_category():
    """Default factory for a grammeme category.

    Returns a new set whose only member is ``GrammemeVectorizer.UNKNOWN_VALUE``.
    It is a module-level function (not a lambda) and is passed to
    ``defaultdict`` by ``GrammemeVectorizer.__init__``.
    """
    initial_values = set()
    initial_values.add(GrammemeVectorizer.UNKNOWN_VALUE)
    return initial_values


class GrammemeVectorizer(object):
    """Maps ``POS#grammemes`` tags to integer indices and indicator vectors.

    Tags are registered with :meth:`add_grammemes`; every category seen
    (plus the synthetic ``POS`` category) collects its possible values,
    always including ``UNKNOWN_VALUE``. :meth:`init_possible_vectors` then
    builds one 0/1 vector per registered tag, with one slot per
    ``category=value`` pair in sorted order.

    The whole object can be dumped to / restored from ``dump_filename`` with
    jsonpickle; an existing dump is loaded automatically on construction.
    """
    UNKNOWN_VALUE = "Unknown"

    def __init__(self, dump_filename: str):
        """Create an empty vectorizer, or restore it if ``dump_filename`` exists."""
        self.all_grammemes = defaultdict(get_empty_category)  # type: Dict[str, Set]
        self.vectors = []  # type: List[List[int]]
        self.name_to_index = {}  # type: Dict[str, int]
        self.dump_filename = dump_filename  # type: str
        if os.path.exists(self.dump_filename):
            self.load()

    def add_grammemes(self, pos_tag: str, gram: str) -> int:
        """Register a tag and return its index (existing index if already known).

        :param pos_tag: part-of-speech tag.
        :param gram: ``|``-separated ``category=value`` pairs, or ``"_"`` for none.
        :return: index of the normalized ``pos_tag#gram`` name.
        :raises IndexError: if a grammeme has no ``=``; nothing is registered
            in that case.
        """
        gram = process_gram_tag(gram)
        vector_name = pos_tag + '#' + gram
        if vector_name not in self.name_to_index:
            # Parse every grammeme before touching any state, so a malformed
            # pair cannot leave a half-registered tag behind.
            pairs = []
            for grammeme in (gram.split("|") if gram != "_" else []):
                parts = grammeme.split("=")
                pairs.append((parts[0], parts[1]))
            self.name_to_index[vector_name] = len(self.name_to_index)
            self.all_grammemes["POS"].add(pos_tag)
            for category, value in pairs:
                self.all_grammemes[category].add(value)
        return self.name_to_index[vector_name]

    def init_possible_vectors(self) -> None:
        """Rebuild ``self.vectors`` so that ``vectors[i]`` encodes the tag with index ``i``."""
        self.vectors = []
        for grammar_val, index in sorted(self.name_to_index.items(), key=lambda x: x[1]):
            pos_tag, grammemes = grammar_val.split('#')
            grammemes = grammemes.split("|") if grammemes != "_" else []
            vector = self.__build_vector(pos_tag, grammemes)
            self.vectors.append(vector)

    def get_vector(self, vector_name: str) -> List[int]:
        """Return the vector of a registered tag name, or an all-zero vector if unknown."""
        if vector_name not in self.name_to_index:
            return self.__zero_vector()
        return self.vectors[self.name_to_index[vector_name]]

    def get_vector_by_index(self, index: int) -> List[int]:
        """Return ``vectors[index]``, or an all-zero vector if ``index`` is out of range."""
        if 0 <= index < len(self.vectors):
            return self.vectors[index]
        return self.__zero_vector()

    def get_ordered_grammemes(self) -> List[str]:
        """Return every ``category=value`` pair, sorted by category and then by value."""
        flat = []
        sorted_grammemes = sorted(self.all_grammemes.items(), key=lambda x: x[0])
        for category, values in sorted_grammemes:
            for value in sorted(list(values)):
                flat.append(category+"="+value)
        return flat

    def save(self) -> None:
        """Serialize this object to ``dump_filename`` with jsonpickle."""
        with open(self.dump_filename, "w", encoding="utf-8") as f:
            # encode() returns the JSON string; its second positional
            # parameter is ``unpicklable``, not a file handle.
            f.write(jsonpickle.encode(self))

    def load(self):
        """Replace this object's state with the one stored in ``dump_filename``."""
        # SECURITY: jsonpickle.decode can instantiate arbitrary classes and
        # run code; only load dump files that come from a trusted source.
        with open(self.dump_filename, "r", encoding="utf-8") as f:
            vectorizer = jsonpickle.decode(f.read())
            self.__dict__.update(vectorizer.__dict__)

    def size(self) -> int:
        """Number of built vectors."""
        return len(self.vectors)

    def grammemes_count(self) -> int:
        """Number of ``category=value`` slots, i.e. the vector length."""
        return len(self.get_ordered_grammemes())

    def is_empty(self) -> bool:
        """True while no vectors have been built or loaded."""
        return len(self.vectors) == 0

    def get_name_by_index(self, index):
        """Return the tag name registered under ``index``.

        :raises KeyError: if no tag has that index.
        """
        # Scan instead of materializing a full reverse dictionary per call.
        for name, registered_index in self.name_to_index.items():
            if registered_index == index:
                return name
        raise KeyError(index)

    def get_index_by_name(self, name):
        """Return the index of ``name`` (``POS#grammemes``), normalizing grammeme order first.

        :raises KeyError: if the normalized name was never registered.
        """
        parts = name.split("#")
        pos = parts[0]
        gram = process_gram_tag(parts[1])
        return self.name_to_index[pos + "#" + gram]

    def __zero_vector(self) -> List[int]:
        """All-zero vector matching the built vectors' length.

        Falls back to ``grammemes_count()`` while no vectors exist, so lookups
        on a vectorizer without built vectors do not raise ``IndexError``.
        """
        width = len(self.vectors[0]) if self.vectors else self.grammemes_count()
        return [0] * width

    def __build_vector(self, pos_tag: str, grammemes: List[str]) -> List[int]:
        """Build the 0/1 vector for one tag.

        For every category (sorted), exactly the slot of the tag's value is
        set; categories absent from the tag select ``UNKNOWN_VALUE``.
        """
        vector = []
        gram_tags = {pair.split("=")[0]: pair.split("=")[1] for pair in grammemes}
        gram_tags["POS"] = pos_tag
        sorted_grammemes = sorted(self.all_grammemes.items(), key=lambda x: x[0])
        for category, values in sorted_grammemes:
            # Categories the tag does not mention fall back to "Unknown".
            target = gram_tags.get(category, GrammemeVectorizer.UNKNOWN_VALUE) \
                if category not in gram_tags else gram_tags[category]
            vector += [1 if value == target else 0 for value in sorted(list(values))]
        return vector

In [5]:
from pymorphy2 import MorphAnalyzer
from russian_tagsets import converters

# Shared pymorphy2 analyzer; the cells below call morph.parse(word) to get
# the candidate parses of a word.
morph = MorphAnalyzer()
# Converter callable from the 'opencorpora-int' tagset to 'ud14'; it is
# invoked as to_ud(tag_string, word_text) in convert_from_opencorpora_tag.
to_ud = converters.converter('opencorpora-int', 'ud14')

def convert_from_opencorpora_tag(tag, text):
    """Convert a tag to the target tagset and split it into POS and grammemes.

    ``tag`` is stringified and passed with ``text`` to the module-level
    ``to_ud`` converter; the converted string is split on whitespace.

    :return: ``(pos, gram)`` - the first and second whitespace-separated fields.
    :raises IndexError: if the converted tag has fewer than two fields.
    """
    fields = to_ud(str(tag), text).split()
    return fields[0], fields[1]

def fill_all_variants(word, vectorizer):
    """Register every morphological parse variant of ``word`` in ``vectorizer``.

    Each parse returned by ``morph.parse(word)`` is converted with
    ``convert_from_opencorpora_tag``, its grammemes are normalized with
    ``process_gram_tag`` and the result is passed to
    ``vectorizer.add_grammemes``.
    """
    for variant in morph.parse(word):
        pos, raw_gram = convert_from_opencorpora_tag(variant.tag, variant.word)
        vectorizer.add_grammemes(pos, process_gram_tag(raw_gram))

# Input-side vectorizer. The constructor restores it from "vectorizer.json"
# when that file exists; otherwise it is filled here with every parse variant
# of every training and test word, its vectors are built, and it is dumped.
vectorizer = GrammemeVectorizer("vectorizer.json")
if vectorizer.is_empty():
    for sentence in train:
        for form in sentence:
            fill_all_variants(form.word, vectorizer) 
    for sentence in test:
        for word in sentence:
            fill_all_variants(word, vectorizer)
    # Vectors must be built only after all tags are registered, because each
    # vector has one slot per known category=value pair.
    vectorizer.init_possible_vectors()
    vectorizer.save()

In [23]:
# Output-side vectorizer: built from the gold pos/gram tags of the training
# tokens (not from pymorphy2 parses). Its tag indices are what the training
# cell stores in `answers`, and get_name_by_index() maps predictions back to
# tag names when the submission is written.
vectorizer_output = GrammemeVectorizer("vectorizer_output.json")
if vectorizer_output.is_empty():
    for sentence in train:
        for form in sentence:
            gram = process_gram_tag(form.gram)
            vectorizer_output.add_grammemes(form.pos, gram)
    vectorizer_output.init_possible_vectors()
    vectorizer_output.save()

In [6]:
# Build the feature vector for one context window.
def get_context_features(i, parse_sentence, context_len):
    """Concatenate the grammeme vectors of the tokens around position ``i``.

    The window covers positions ``i - (context_len - 1) // 2`` through
    ``i + context_len // 2`` inclusive. Positions that fall outside the
    sentence contribute an all-zero block, so the result always has
    ``context_len * vectorizer.grammemes_count()`` entries.

    :param i: index of the target token inside ``parse_sentence``.
    :param parse_sentence: one parse object per token of the sentence; each
        must expose ``.tag`` and ``.word``.
    :param context_len: total window width in tokens.
    :return: flat list of 0/1 features.
    :raises AssertionError: if the assembled vector has an unexpected length.
    """
    # Hoisted: grammemes_count() re-sorts every grammeme on each call.
    width = vectorizer.grammemes_count()
    # Use the sentence that was passed in, not a same-named global.
    sentence_len = len(parse_sentence)
    left = i - (context_len - 1) // 2
    right = i + context_len // 2

    sample = []
    if left < 0:
        # Window starts before the sentence: pad on the left.
        sample += [0] * (width * -left)
    for parse in parse_sentence[max(left, 0): min(right + 1, sentence_len)]:
        pos, gram = convert_from_opencorpora_tag(parse.tag, parse.word)
        gram = process_gram_tag(gram)
        sample += vectorizer.get_vector(pos + "#" + gram)
    if right > sentence_len - 1:
        # Window runs past the sentence end: pad on the right.
        sample += [0] * (width * (right - sentence_len + 1))
    assert len(sample) == context_len * width
    return sample

In [7]:
# Загрузка обучающей выборки.
import numpy as np
import os

# Cache files for the training feature matrix and the label vector.
TRAIN_SAMPLES_PATH = "samples.npy"
ANSWERS_PATH = "answers.npy"
# Features are recomputed only when either cache file is missing; otherwise
# both arrays are loaded from disk.
if not os.path.exists(TRAIN_SAMPLES_PATH) or not os.path.exists(ANSWERS_PATH):
    # NOTE(review): context_len is assigned only on this branch; when the
    # caches exist it stays undefined until a later cell sets it.
    context_len = 5
    # Total number of training tokens = number of rows.
    n = sum([1 for sentence in train for word in sentence])
    samples = np.zeros((n, context_len*vectorizer.grammemes_count()), dtype='bool_')
    answers = np.zeros((n, ), dtype='int')
    index = 0
    # NOTE(review): the loop variable `sentence` is a module-level name in a
    # notebook; verify no other cell reads it before renaming.
    for sentence in train:
        # Only the first pymorphy2 parse of each word is used for features.
        parse_sentence = [morph.parse(form.word)[0] for form in sentence]
        for i, form in enumerate(sentence):
            samples[index] = get_context_features(i, parse_sentence , context_len)
            gram = process_gram_tag(form.gram)
            # The label is the index of the gold tag in the output vectorizer.
            answers[index] = vectorizer_output.get_index_by_name(form.pos+"#"+gram)
            index += 1
            # Progress indicator.
            if index % 100000 == 0:
                print(index)
    np.save(TRAIN_SAMPLES_PATH, samples)
    np.save(ANSWERS_PATH, answers)
else:
    samples = np.load(TRAIN_SAMPLES_PATH)
    answers = np.load(ANSWERS_PATH)
print(samples[0], answers[0])

[False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False  True False False  True False False
 False False False False  True False False  True False False False  True
 False False  True False  True False False False  True False False False
  True False False False False False False False Fa

In [None]:
# Inspect the dimensions of the training feature matrix.
np.shape(samples)

In [None]:
# Largest label index present in the training answers.
np.max(answers)

In [8]:
# Context window width in tokens, passed to get_context_features() when the
# test features are built. The training cell uses the same value (5).
context_len = 5

In [9]:
# Load (or build and cache) the test feature matrix.
TEST_SAMPLES_PATH = "test_samples.npy"
# NOTE(review): ANSWERS_PATH is reassigned here but not used in this cell.
ANSWERS_PATH = "answers.npy"
if not os.path.exists(TEST_SAMPLES_PATH):
    # Total number of test tokens = number of rows.
    n = sum([1 for sentence in test for word in sentence])
    test_samples = np.zeros((n, context_len*vectorizer.grammemes_count()), dtype='bool_')
    index = 0
    # NOTE(review): the outer `i` is never used and is shadowed by the inner
    # loop's `i`; only the inner (token position) value reaches
    # get_context_features.
    for i, sentence in enumerate(test):
        # Only the first pymorphy2 parse of each word is used for features.
        parse_sentence = [morph.parse(word)[0] for word in sentence]
        for i, word in enumerate(sentence):
            test_samples[index] = get_context_features(i, parse_sentence, context_len)
            index += 1
    np.save(TEST_SAMPLES_PATH, test_samples)
else:
    test_samples = np.load(TEST_SAMPLES_PATH)

In [10]:
from keras.layers import Dense
from keras.models import Sequential
from keras.layers import Activation

Using Theano backend.
 https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29

Using gpu device 0: GeForce GT 740M (CNMeM is enabled with initial size: 60.0% of memory, cuDNN 5110)


In [50]:
# Feed-forward classifier: one hidden ReLU layer of 64 units followed by a
# softmax over 581 outputs.
# The input width 310 matches the second dimension of test_samples printed
# later in this notebook, (217794, 310).
# NOTE(review): 581 is presumably the number of output tag classes - confirm
# against vectorizer_output.size() rather than hard-coding it.
model = Sequential([
    Dense(64, input_shape=(310,)),
    Activation('relu'),
    Dense(581),
    Activation('softmax'),
])

In [51]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 64)                19904     
_________________________________________________________________
activation_3 (Activation)    (None, 64)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 581)               37765     
_________________________________________________________________
activation_4 (Activation)    (None, 581)               0         
Total params: 57,669.0
Trainable params: 57,669.0
Non-trainable params: 0.0
_________________________________________________________________


In [52]:
# The sparse loss takes integer class indices as targets, which is how
# `answers` is stored (one int label per sample).
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy')

In [None]:
# Single update on the first 100 samples (quick check that training runs).
model.train_on_batch(samples[:100], answers[:100])

In [None]:
# Predict class scores for the first 100 training samples.
ans = model.predict(samples[:100])

In [62]:
def shuffle_butch(dataX, dataY):
    """Return ``dataX`` and ``dataY`` reordered by one shared random permutation.

    The permutation covers ``len(dataX)`` positions and is drawn with
    ``np.random.shuffle``, so it follows NumPy's global random state. Row
    ``k`` of both results comes from the same original position.

    :param dataX: array indexable by an integer index array (features).
    :param dataY: array indexable by the same index array (labels).
    :return: ``(shuffled_x, shuffled_y)`` as new arrays.
    """
    # np.arange always yields an integer array, even for length 0;
    # np.array(range(0)) is float64 and cannot be used as an index.
    order = np.arange(len(dataX))
    np.random.shuffle(order)
    return dataX[order], dataY[order]

In [65]:
# Number of passes over the training data.
num_of_epoches = 25
# Hard-coded slice boundaries into samples/answers.
# NOTE(review): the final boundary 850688 is presumably the number of
# training samples - confirm against len(samples).
parts = [0,50000,100000,150000,200000,250000,300000,350000,400000,450000,500000,550000,600000,650000,700000,750000,800000,850688]
for epoch in range(num_of_epoches):
    print("epoch: ", epoch)
    for i in range(len(parts) - 1):
#         print("parts: ",parts[i],' - ', parts[i+1])
        # Each slice is shuffled and passed to one train_on_batch call.
        X, Y = shuffle_butch(samples[parts[i]:parts[i+1]], answers[parts[i]:parts[i+1]])
        model.train_on_batch(X, Y )

epoch:  0
epoch:  1
epoch:  2
epoch:  3
epoch:  4
epoch:  5
epoch:  6
epoch:  7
epoch:  8
epoch:  9
epoch:  10
epoch:  11
epoch:  12
epoch:  13
epoch:  14
epoch:  15
epoch:  16
epoch:  17
epoch:  18
epoch:  19
epoch:  20
epoch:  21
epoch:  22
epoch:  23
epoch:  24


In [15]:
# Inspect the dimensions of the test feature matrix.
np.shape(test_samples)

(217794, 310)

In [66]:
# Model outputs for every test sample; the argmax of each row is taken below.
answer = model.predict(test_samples)

In [19]:
# Index of the highest-scoring class for the first test sample.
np.argmax(answer[0])

5

In [67]:
# Reduce each row of model outputs to the index of its highest-scoring class.
answer_indexes = []
for answer_part in answer:
    answer_indexes.append(np.argmax(answer_part))

In [21]:
# Peek at the first ten predicted class indices.
answer_indexes[:10]

[5, 5, 2, 88, 9, 9, 22, 2, 11, 11]

In [None]:
# Train the classifier.
# NOTE(review): `clf` is not defined anywhere in the visible notebook and this
# cell has no execution count; it looks like a leftover from an earlier
# approach - confirm before running.
X = samples[:200000]
y = answers[:200000]
clf.fit(X, y)

In [None]:
# Predictions.
# NOTE(review): this rebinds `answers`, replacing the training label array
# created earlier; `clf` is also undefined in the visible notebook.
answers = []
batch_size = 1000
n_batches = len(test_samples)//batch_size
# Predict in fixed-size batches, then handle the remaining tail.
for i in range(n_batches):
    answers += list(clf.predict(test_samples[i*batch_size: i*batch_size+batch_size]))
answers += list(clf.predict(test_samples[n_batches*batch_size:]))

In [68]:
# Save the submission.
# Invert the name -> index mapping once, instead of rebuilding it for every
# row through get_name_by_index().
index_to_name = {idx: name for name, idx in vectorizer_output.name_to_index.items()}
with open("subm.csv", "w", encoding="utf-8") as f:
    f.write("Id,Prediction\n")
    # The loop variable is deliberately not called `answer`: that name holds
    # the model's prediction matrix, and overwriting it would break re-running
    # the argmax cell.
    for index, answer_index in enumerate(answer_indexes):
        f.write(str(index) + "," + index_to_name[answer_index] + "\n")