In [1]:
import gensim
import pymorphy2
import os
import time
import numpy as np
import tensorflow as tf
import unidecode
from keras_preprocessing.text import Tokenizer
 
tf.enable_eager_execution()

In [2]:
#Now we load 
model = gensim.models.KeyedVectors.load_word2vec_format("G:\\New folder\\ruwikiruscorpora_tokens_elmo_1024_2019\\ruwikiruscorpora_upos_skipgram_300_2_2019\\model.bin", binary=True)
model.init_sims(replace=True)

morph = pymorphy2.MorphAnalyzer()

In [3]:
morph.parse('холодный')

[Parse(word='холодный', tag=OpencorporaTag('ADJF,Qual masc,sing,nomn'), normal_form='холодный', score=0.5, methods_stack=((<DictionaryAnalyzer>, 'холодный', 86, 0),)),
 Parse(word='холодный', tag=OpencorporaTag('ADJF,Qual inan,masc,sing,accs'), normal_form='холодный', score=0.5, methods_stack=((<DictionaryAnalyzer>, 'холодный', 86, 4),))]

In [4]:
model.most_similar(['ледяной_ADJ'], topn=20)

[('лед_NOUN', 0.7199968099594116),
 ('снежный_ADJ', 0.6809204816818237),
 ('снег_NOUN', 0.6226842403411865),
 ('холодный_ADJ', 0.6221423745155334),
 ('льдистый_ADJ', 0.6192911863327026),
 ('льдинка_NOUN', 0.6153559684753418),
 ('сугроб_NOUN', 0.5984904766082764),
 ('холод_NOUN', 0.594094455242157),
 ('оледенелый_ADJ', 0.5915398001670837),
 ('сосулька_NOUN', 0.5900148749351501),
 ('торос_NOUN', 0.5898441076278687),
 ('замерзнуть_VERB', 0.5872437953948975),
 ('иней_NOUN', 0.5848214626312256),
 ('глыба_NOUN', 0.5659131407737732),
 ('замерзать_VERB', 0.5654045939445496),
 ('морозный_ADJ', 0.5636944770812988),
 ('обледенелый_ADJ', 0.563029408454895),
 ('студеный_ADJ', 0.5602612495422363),
 ('снеговой_ADJ', 0.5562052130699158),
 ('полынья_NOUN', 0.5557663440704346)]

In [5]:
def search_neighbour(word, pos, gend='masc'):
    word = word.replace('ё', 'е')
    lex = word + '_' + cotags[pos]
    if lex in model:
        neighbs = model.most_similar([lex], topn=20)
        for nei in neighbs:
            lex_n, ps_n = nei[0].split('_')
            if '::' in lex_n:
                continue
            if cotags[pos] == ps_n:
                if pos == 'NOUN':
                    parse_result = morph.parse(lex_n)
                    for ana in parse_result:
                        if ana.normal_form == lex_n:
                            if ana.tag.gender == gend:
                                return lex_n
                elif cotags[pos] == 'VERB' and word[-2:] == 'ся':
                    if lex_n[-2:] == 'ся':
                        return lex_n
                elif cotags[pos] == 'VERB' and word[-2:] != 'ся':
                    if lex_n[-2:] != 'ся':
                        return lex_n
                else:
                    return lex_n
    return None

In [6]:
def flection(lex_neighb, tags):
    tags = str(tags)
    tags = re.sub(',[AGQSPMa-z-]+? ', ',', tags)
    tags = tags.replace("impf,", "")
    tags = re.sub('([A-Z]) (plur|masc|femn|neut|inan)', '\\1,\\2', tags)
    tags = tags.replace("Impe neut", "")
    tags = tags.split(',')
    tags_clean = []
    for t in tags:
        if t:
            if ' ' in t:
                t1, t2 = t.split(' ')
                t = t2
            tags_clean.append(t)
    tags = frozenset(tags_clean)
    prep_for_gen = morph.parse(lex_neighb)
    ana_array = []
    for ana in prep_for_gen:
        if ana.normal_form == lex_neighb:
            ana_array.append(ana)
    for ana in ana_array:
        try:
            flect = ana.inflect(tags)
        except:
            print(tags)
            return None
        if flect:
            word_to_replace = flect.word
            return word_to_replace
    return None

In [7]:
cotags = {
    'ADJF':'ADJ', # pymorphy2: word2vec 
    'ADJS' : 'ADJ', 
    'ADVB' : 'ADV', 
    'COMP' : 'ADV', 
    'GRND' : 'VERB', 
    'INFN' : 'VERB', 
    'NOUN' : 'NOUN', 
    'PRED' : 'ADV', 
    'PRTF' : 'ADJ', 
    'PRTS' : 'VERB', 
    'VERB' : 'VERB'
}

In [8]:
file_path = "G:\\New folder\\month-2011-12-qtraf_small"
 
text = open(file_path).read()
#print(text)
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
 
encoded = tokenizer.texts_to_sequences([text])[0]
#print(encoded) 
vocab_size = len(tokenizer.word_index) + 1
#print("vocab_size %s" %(vocab_size))
word2idx = tokenizer.word_index
idx2word = tokenizer.index_word
#print("word_index %s" %( tokenizer.word_index))
#print("index_word %s" %( tokenizer.index_word))

sequences = list()

for i in range(1, len(encoded)):
    sequence = encoded[i - 1:i + 1]
    sequences.append(sequence)
sequences = np.array(sequences)
#print(word2idx)
X, Y = sequences[:, 0], sequences[:, 1]
X = np.expand_dims(X, 1)
Y = np.expand_dims(Y, 1)

In [14]:
#This function returns only similar words that contains in train dataset
def sortSimilarListByDataset(words_list):
    ret_list = []
    for word in words_list:
        try:
            if word2idx[word]:
                ret_list.append(word)
        except KeyError:
            continue
    return ret_list
#Returns Top N words, that similars with
def getSimilarsForWord(word, top=10):
    parsed = morph.parse(word)
    pos = cotags[parsed[0].tag.POS]
    gensim_find_word = word + "_" + pos
    most_similars = model.most_similar([gensim_find_word], topn=top)
    return_list = []
    for sim in most_similars:
        sim_parsed = sim[0].split("_")
        if sim_parsed[1] == pos:
            return_list.append(sim_parsed[0])
    return return_list
my_word = "линия"
#find similars
sim_list = getSimilarsForWord(my_word, 40)
print(sim_list)
dataset_list = sortSimilarListByDataset(sim_list)
print(dataset_list)

['линя', 'контур', 'электропередача', 'направление', 'зигзаг', 'полоса', 'пунктир', 'вертикаль', 'горизонталя', 'плоскость', 'точка']
['направление', 'полоса', 'плоскость', 'точка']


In [10]:
BUFFER_SIZE = 100
BATCH_SIZE = 100
dataset = tf.data.Dataset.from_tensor_slices((X, Y)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)
embedding_dim = 100
units = 2048

In [11]:
class Model(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, units, batch_size):
        super(Model, self).__init__()
        self.units = units
        self.batch_size = batch_size
 
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
 
        self.gru = tf.keras.layers.GRU(self.units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_activation='sigmoid',
                                       recurrent_initializer='glorot_uniform')
        self.fc = tf.keras.layers.Dense(vocab_size)
 
    def call(self, inputs, hidden):
        inputs = self.embedding(inputs)
        #print(inputs)
        output, states = self.gru(inputs, initial_state=hidden)
 
        output = tf.reshape(output, (-1, output.shape[2]))
 
        x = self.fc(output)
 
        return x, states

keras_model = Model(vocab_size, embedding_dim, units, BATCH_SIZE)

optimizer = tf.train.AdamOptimizer()
 
#checkpoint_dir = '.\\training_checkpoints_wordstat'
checkpoint_dir = '.\\training_checkpoints_wordstat_small2048'

checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=keras_model)

def loss_function(labels, logits):
    return tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)

EPOCHS = 10
# for epoch in range(EPOCHS):
#     start = time.time()
 
#     hidden = keras_model.reset_states()
 
#     for (batch, (input, target)) in enumerate(dataset):
#         with tf.GradientTape() as tape:
#             predictions, hidden = model(input, hidden)
 
#             target = tf.reshape(target, (-1,))
#             loss = loss_function(target, predictions)
 
#             grads = tape.gradient(loss, keras_model.variables)
#             optimizer.apply_gradients(zip(grads, keras_model.variables))
 
#             if batch % 100 == 0:
#                 print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1, batch, loss))
 
#     if (epoch + 1) % 3 == 0:
#         checkpoint.save(file_prefix=checkpoint_prefix)

In [21]:
checkpoint_dir = '.\\training_checkpoints_wordstat_small2048'
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))
current_word = "мишка"
input_eval = [word2idx[current_word]]
input_eval = tf.expand_dims(input_eval, 0)

print("UNITS: %s" %(units))
hidden = [tf.zeros((1, units))]

#Now we find similars for start word
similar_words = getSimilarsForWord(current_word, 10)
similar_words.append(current_word)
dataset_words_list = sortSimilarListByDataset(similar_words)
print("dataset_words_list %s" %(dataset_words_list))

sequences_lists = [[word] for word in dataset_words_list]
print(sequences_lists)
for sequence in sequences_lists:
    for i in range(4):
        input_eval = [word2idx[sequence[i]]]
        input_eval = tf.expand_dims(input_eval, 0)    

        predictions, hidden = keras_model(input_eval, hidden)
#         print("PREDICTIONS")
#         print(predictions)

        predicted_id = tf.argmax(predictions[-1]).numpy()

        sequence.append(idx2word[predicted_id])
        
for sequence in sequences_lists:
    print(" ".join(sequence))

UNITS: 2048
dataset_words_list ['медвежонок', 'мишка']
[['медвежонок'], ['мишка']]
медвежонок колки шва бабышек газового
мишка окл информатики пиона секущей
