In [1]:
import matplotlib.pyplot as plt

import re, os, time, random
import pandas as pd
import numpy as np

import tensorflow as tf
import pickle as pkl

from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.models import model_from_json
from keras.models import Model, load_model
from keras.layers import LSTM, GRU, Input, Dense, Embedding
from keras.preprocessing.sequence import pad_sequences

In [2]:
SEED = 1337

random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# 1. Подготовка даных

In [3]:
def preprocess_sentence(w, punctuation=False, toSeq=False):
    """
        Функция для предобработки 
    """
    
    # Уменьшаем регистр и убираем лишние пробелы
    w = w.lower().strip()
    
    # Замена всех символов 'æ' на однотипный
    w = re.sub(r"ӕ", r"æ", w)
    
    # Удаление апострофом
    w = re.sub("'", '', w)
    
    if punctuation:
        # Делаем между словом и знаком пунктуации отступ 'слово! -> слово !'
        w = re.sub(r"([?.!,])", r" \1 ", w)
        w = re.sub(r'[" "]+', " ", w)
    else:
        # Удаляет все знаки пунктуации
        w = re.sub(r"[^\w\s]", r"", w)
    
    # Выкидываем все остальные символы из рассмотрения 
    w = re.sub(r"[^a-яА-Яa-zA-Z?.!,æё]+", " ", w)
    w = w.rstrip().strip()
    
    # Добавляем токены для начала и конца предложения
    if toSeq:
        w = f'<sos> {w} <eos>'
        
    return w

In [4]:
def load_dataset(path):
    """
        Функция, которая создаёт датасет
    """    
    new_path = f'Date/{path}'
    with open(new_path, encoding='utf-8') as f:
        data = f.read()
    
    uncleaned_data_list = data.split('\n')
    
    source_word = []
    target_word = []
    for word in uncleaned_data_list:
        source_word.append(preprocess_sentence(word.split('\t')[0], punctuation=False, toSeq=False))
        target_word.append(preprocess_sentence(word.split('\t')[1], punctuation=False, toSeq=True))
        
    language_data = pd.DataFrame(columns=['Source','Target'])
    language_data['Source'] = source_word
    language_data['Target'] = target_word
    
    return language_data

In [5]:
def tokenizer_(text_data):
    '''
        Токенайзер
    '''
    
    tokenizer = Tokenizer(filters='"#$%&()*+-/:;=@[\\]^_`{|}~\t\n')
    tokenizer.fit_on_texts(text_data)
    return tokenizer

In [6]:
def max_length(data):
    max_length_ = max([len(x.split(' ')) for x in data])
    return max_length_

In [7]:
def get_preparing_data(input_seq, output_seq):
    tokenizer_input, tokenizer_output = tokenizer_(input_seq), tokenizer_(output_seq)
    input_max_length, output_max_length = len(tokenizer_input.word_index) + 1, len(tokenizer_output.word_index) + 1
    
    return tokenizer_input, tokenizer_output, input_max_length, output_max_length

# 2. Построение модели Encoder-Decoder

In [16]:
class Encoder(tf.keras.Model):
    """
        Энкодер 
    """
    
    def __init__(self, vocab_size_input, HIDDEN_DIM):
        super(Encoder, self).__init__()
        
        self.inputs = Input(shape=(None,), name="encoder_inputs")
        self.embedding = Embedding(vocab_size_input, HIDDEN_DIM, mask_zero=True, name="encoder_embedding")(self.inputs)
        
        encoder = GRU(HIDDEN_DIM, return_state=True, name="encoder_gru")
        self.outputs, state_h = encoder(self.embedding)
        self.states = [state_h]
        
def getEncoder(model_loaded):
    encoder_inputs_inf = model_loaded.input[0]
    encoder_outputs_inf, inf_state_h = model_loaded.layers[4].output
    encoder_inf_states = [inf_state_h]

    return Model(encoder_inputs_inf,encoder_inf_states, name='Encoder')

In [17]:
class Decoder(tf.keras.Model):
    """
        Декодер 
    """
    
    def __init__(self, vocab_size_output, HIDDEN_DIM, encoder_states):
        super(Decoder, self).__init__()
        
        self.inputs = Input(shape=(None,), name="decoder_inputs")
        self.embedding = Embedding(vocab_size_output, HIDDEN_DIM, mask_zero=True, name="decoder_embedding")(self.inputs)
        
        decoder = GRU(HIDDEN_DIM, return_sequences=True, return_state=True, name="decoder_gru")
        self.outputs, _ = decoder(self.embedding, initial_state=encoder_states)
        self.dense = Dense(vocab_size_output, activation='softmax', name="dense_gru")
        self.outputs = self.dense(self.outputs)
        
def getDecoder(model_loaded):
    decoder_state_h_input = Input(shape=(HIDDEN_DIM,))
    decoder_state_input = [decoder_state_h_input]

    decoder_input_inf = model_loaded.input[1]
    decoder_emb_inf = model_loaded.layers[3](decoder_input_inf)
    decoder_gru_inf = model_loaded.layers[5]
    decoder_output_inf, decoder_state_h_inf = decoder_gru_inf(decoder_emb_inf, initial_state=decoder_state_input)
    decoder_state_inf = [decoder_state_h_inf]
    dense_inf = model_loaded.layers[6]
    decoder_output_final = dense_inf(decoder_output_inf)

    return Model([decoder_input_inf]+decoder_state_input,[decoder_output_final]+decoder_state_inf, name='Decoder')

In [18]:
def Batch(X, Y, batch_size):
    while True:
        for j in range(0, len(X), batch_size):
            encoder_data_input = np.zeros((batch_size,max_lenght_source),dtype='float32') #metrix of batch_size*max_length_english
            decoder_data_input = np.zeros((batch_size,max_lenght_target),dtype='float32') #metrix of batch_size*max_length_marathi
            decoder_target_input = np.zeros((batch_size,max_lenght_target,vocab_size_target),dtype='float32') # 3d array one hot encoder decoder target data
            for i, (input_text,target_text) in enumerate(zip(X[j:j+batch_size],Y[j:j+batch_size])):
                for t, word in enumerate(input_text.split()):
                    encoder_data_input[i,t] = tokenizer_input.word_index[word] # Here we are storing the encoder 
                                                                         #seq in row here padding is done automaticaly as 
                                                                         #we have defined col as max_lenght
                for t, word in enumerate(target_text.split()):
                    decoder_data_input[i,t] = tokenizer_output.word_index[word] # same for the decoder sequence
                    if t>0:
                        decoder_target_input[i,t-1,tokenizer_output.word_index[word]] = 1 #target is one timestep ahead of decoder input because it does not have 'start tag'
            yield ([encoder_data_input,decoder_data_input],decoder_target_input)

# 3. Входные параметры

In [19]:
HIDDEN_DIM = 50
batch_size = 6
epochs = 20

start_target = "<sos>"
end_target = "<eos>"

In [20]:
path = 'rus-oss.txt'
data = load_dataset(path)
input_seq, output_seq = data['Source'].values, data['Target'].values
tokenizer_input, tokenizer_output, vocab_size_source, vocab_size_target = get_preparing_data(input_seq, output_seq)

In [21]:
with open(f'{path[:-4]}-tokenizer_input.pkl','wb') as f:
    pkl.dump(tokenizer_input, f)

with open(f'{path[:-4]}-tokenizer_output.pkl','wb') as f:
    pkl.dump(tokenizer_output, f)

pkl.dump(tokenizer_input, open(f'{path[:-4]}-tokenizer_input.pkl', 'wb'))
pkl.dump(tokenizer_output, open(f'{path[:-4]}-tokenizer_output.pkl', 'wb'))

In [22]:
X_train, X_test, y_train, y_test = train_test_split(input_seq, output_seq, test_size = 0.1)
train_samples = len(X_train)
test_samples = len(X_test)

max_lenght_source = max_length(X_train)
max_lenght_target = max_length(y_train)

In [23]:
encoder = Encoder(vocab_size_source, HIDDEN_DIM)
decoder = Decoder(vocab_size_target, HIDDEN_DIM, encoder.states)

model = Model([encoder.inputs, decoder.inputs], decoder.outputs, name="GRU-Translation")

In [24]:
loss_function = [
    'categorical_crossentropy',
    'binary_crossentropy',
]

now_loss_function = loss_function[0]
model.compile(loss=now_loss_function, optimizer='rmsprop', metrics=['accuracy'])

In [25]:
model.summary()

Model: "GRU-Translation"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_inputs (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 decoder_inputs (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 encoder_embedding (Embedding)  (None, None, 50)     47250       ['encoder_inputs[0][0]']         
                                                                                                  
 decoder_embedding (Embedding)  (None, None, 50)     46100       ['decoder_inputs[0][0]']         
                                                                                    

# 4. Обучение модели

In [26]:
# %%capture
model.fit_generator(generator = Batch(X_train, y_train, batch_size = batch_size), steps_per_epoch = train_samples//batch_size, epochs=epochs)

Epoch 1/20


  model.fit_generator(generator = Batch(X_train, y_train, batch_size = batch_size), steps_per_epoch = train_samples//batch_size, epochs=epochs)


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x22089176970>

# 5. Сохранение модели

In [19]:
# Отрисовка схемы модели
# plot_model(model, to_file=f'{dir}-{epochs}-train_model.png', show_shapes=True)

In [27]:
def model_save_JSON():
    model_json = model.to_json()
    
    with open(f'GRU-[{path[:-4]}]-[Epochs={epochs}]-[LossFunction={now_loss_function}].json', "w") as json_file:
        json_file.write(model_json)
    
    model.save_weights(f'GRU-[{path[:-4]}]-[Epochs={epochs}]-[LossFunction={now_loss_function}]-[weight].h5')
    print("Saved model to disk")

In [28]:
model_save_JSON()

Saved model to disk


# 6. Загрузка модели

In [29]:
def model_load_JSON():
    json_file = open(f'GRU-[{path[:-4]}]-[Epochs={epochs}]-[LossFunction={now_loss_function}].json')
    loaded_model_json = json_file.read()
    json_file.close()
    model_loaded = model_from_json(loaded_model_json)

    model_loaded.load_weights(f'GRU-[{path[:-4]}]-[Epochs={epochs}]-[LossFunction={now_loss_function}]-[weight].h5')
    print("Model loaded")
    
    return model_loaded

In [30]:
model_loaded = model_load_JSON()

Model loaded


In [31]:
model_loaded.summary()

Model: "GRU-Translation"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_inputs (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 decoder_inputs (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 encoder_embedding (Embedding)  (None, None, 50)     47250       ['encoder_inputs[0][0]']         
                                                                                                  
 decoder_embedding (Embedding)  (None, None, 50)     46100       ['decoder_inputs[0][0]']         
                                                                                    

In [32]:
encoder_model = getEncoder(model_loaded)

In [33]:
decoder_model = getDecoder(model_loaded)

In [27]:
with open(f'{path[:-4]}-tokenizer_input.pkl','rb') as f:
    tokenizer_input = pkl.load(f)
with open(f'{path[:-4]}-tokenizer_output.pkl','rb') as f:
    tokenizer_output = pkl.load(f)

reverse_word_map_input = dict(map(reversed, tokenizer_input.word_index.items()))
reverse_word_map_target = dict(map(reversed, tokenizer_output.word_index.items()))

# 7. Проверка

In [28]:
encoder_model.summary()

Model: "Encoder"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 encoder_inputs (InputLayer)  [(None, None)]           0         
                                                                 
 encoder_embedding (Embeddin  (None, None, 50)         47250     
 g)                                                              
                                                                 
 encoder_gru (GRU)           [(None, 50),              15300     
                              (None, 50)]                        
                                                                 
Total params: 62,550
Trainable params: 62,550
Non-trainable params: 0
_________________________________________________________________


In [29]:
decoder_model.summary()

Model: "Decoder"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 decoder_inputs (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 decoder_embedding (Embedding)  (None, None, 50)     46100       ['decoder_inputs[0][0]']         
                                                                                                  
 input_1 (InputLayer)           [(None, 50)]         0           []                               
                                                                                                  
 decoder_gru (GRU)              [(None, None, 50),   15300       ['decoder_embedding[1][0]',      
                                 (None, 50)]                      'input_1[0][0]']          

In [40]:
def decode_seq(input_seq):
    state_values_encoder = encoder_model.predict(input_seq)
    target_seq = np.zeros((1,1))
    target_seq[0,0] = tokenizer_output.word_index[start_target]
    stop_condition = False
    decoder_sentance = ''
    
    print([target_seq] + state_values_encoder)
    print( decoder_model.predict([target_seq] + state_values_encoder))
    while not stop_condition:
        sample_word, decoder_h = decoder_model.predict([target_seq] + state_values_encoder)
        sample_word_index = np.argmax(sample_word[0,-1,:])
        decoder_word = reverse_word_map_target[sample_word_index]
        decoder_sentance += ' ' + decoder_word
        if (decoder_word == end_target or 
            len(decoder_sentance) > 70):
            stop_condition = True
        target_seq[0, 0] = sample_word_index
        state_values_encoder = [decoder_h]
    return decoder_sentance

In [41]:
for i in range(10):
    sentance = X_test[i]
    original_target = y_test[i]
    input_seq = tokenizer_input.texts_to_sequences([sentance])
    pad_sequence = pad_sequences(input_seq, maxlen= 30, padding='post')
    predicted_target = decode_seq(pad_sequence)
    print("Test sentance: ",i+1)
    print("sentance: ",sentance)
    print("origianl translate:",original_target[5:-5])
    print("predicted Translate:",predicted_target[:-5])
    print("=="*50)

[[[0.97279694 0.91261477 1.32319808 1.03823457 1.02584164 1.17029333
   1.23843992 0.76908514 0.76150651 0.79735586 1.13605289 0.91015819
   1.42006937 0.84848908 1.01905345 1.16627207 1.1340186  0.94678755
   0.69204801 1.10904676 1.32606852 1.22133896 1.1362998  0.44371474
   0.57982853 0.60379171 1.04445478 0.49940991 0.84193429 0.61265934
   1.10971969 1.50798321 0.86740947 1.30370775 1.2688795  0.76367854
   1.10739896 0.89647955 1.09840928 1.39756811 0.78056222 0.91183472
   0.92390061 1.51099956 1.09970696 0.94902347 0.69569978 1.19216922
   1.07049995 1.19394989]]]


ValueError: in user code:

    File "D:\Programms\Anaconda\lib\site-packages\keras\engine\training.py", line 1801, in predict_function  *
        return step_function(self, iterator)
    File "D:\Programms\Anaconda\lib\site-packages\keras\engine\training.py", line 1790, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "D:\Programms\Anaconda\lib\site-packages\keras\engine\training.py", line 1783, in run_step  **
        outputs = model.predict_step(data)
    File "D:\Programms\Anaconda\lib\site-packages\keras\engine\training.py", line 1751, in predict_step
        return self(x, training=False)
    File "D:\Programms\Anaconda\lib\site-packages\keras\utils\traceback_utils.py", line 67, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "D:\Programms\Anaconda\lib\site-packages\keras\engine\input_spec.py", line 200, in assert_input_compatibility
        raise ValueError(f'Layer "{layer_name}" expects {len(input_spec)} input(s),'

    ValueError: Layer "Decoder" expects 2 input(s), but it received 1 input tensors. Inputs received: [<tf.Tensor 'IteratorGetNext:0' shape=(None, 1, 50) dtype=float32>]
