In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import string
from string import digits
import re
import os
from sklearn.utils import shuffle
from keras.preprocessing.sequence import pad_sequences
from keras.layers import LSTM, GRU, Input, Dense,Embedding
from keras.models import Model,load_model
from tensorflow.keras.utils import plot_model
from keras.preprocessing.text import one_hot
from keras.preprocessing.text import Tokenizer
from keras.models import model_from_json
from nltk.tokenize import RegexpTokenizer

import pickle as pkl
import numpy as np

In [2]:
dir = 'rus-oss.txt'
cur_path = os.path.abspath('')
new_path = os.path.relpath(f'../Date/{dir}', cur_path)
with open(new_path, encoding='utf-8') as f:
    data = f.read()

In [3]:
uncleaned_data_list = data.split('\n')
uncleaned_data_list = uncleaned_data_list[:38695]

source_word = []
target_word = []
start_target = "sos"
end_target = "eos"

HIDDEN_DIM = 50
WORDS = 1000
LENGTH = 100
DEPTH = 32

batch_size = 6
epochs = 160

for word in uncleaned_data_list:
    source_word.append(word.split('\t')[0])
    target_word.append(word.split('\t')[1])

In [4]:
language_data = pd.DataFrame(columns=['Source','Target'])
language_data['Source'] = source_word
language_data['Target'] = target_word

In [5]:
# saving to csv
language_data.to_csv(f'{dir}-language_data.csv', index=False)

In [6]:
# loading data from csv
language_data = pd.read_csv(f'{dir}-language_data.csv')

In [7]:
language_data.head()

Unnamed: 0,Source,Target
0,Чего ты смеёшься?,Цæуыл худыс?
1,Этот нож очень острый.,Ацы кард тынг цыргъ у.
2,У кошки девять жизней.,Гæдыйæн фараст царды ис.
3,Сегодня облачно.,Абон у асæст.
4,Он был вождём своего племени 35 лет.,Уый йæ знæмы раздзог уыдис 35 азы дæргъы.


In [8]:
language_data.tail()

Unnamed: 0,Source,Target
465,Делать,Кæнын
466,Говорить,Дзурын
467,Работать,Кусын
468,Жить,Цæрын
469,Кушать,Хæрын


In [9]:
source_word = language_data['Source'].values
target_word = language_data['Target'].values

In [10]:
source_word[0], target_word[0]

('Чего ты смеёшься?', 'Цæуыл худыс?')

In [11]:
#lowercasing the setences
source_word_ = [x.lower() for x in source_word]
target_word_ = [x.lower() for x in target_word]

In [12]:
source_word_ = [re.sub("'",'',x) for x in source_word_]
target_word_ = [re.sub("'",'',x) for x in target_word_]

In [13]:
source_word_ = [x.strip() for x in source_word_]
target_word_ = [x.strip() for x in target_word_]

In [14]:
source_word_ = [re.sub(r"[^\w\s]", r"", x) for x in source_word_]
target_word_ = [re.sub(r"[^\w\s]", r"", x) for x in target_word_]

In [15]:
source_word_ = [re.sub(r"\d", r"", x) for x in source_word_]
target_word_ = [re.sub(r"\d", r"", x) for x in target_word_]

In [16]:
source_word_ = [re.sub('"', '', x) for x in source_word_]
target_word_ = [re.sub('"', '', x) for x in target_word_]

In [17]:
source_word_ = [re.sub(r"ӕ", r"æ", x) for x in source_word_]

In [18]:
# source_word_ = [f'{start_target} {x} {end_target}' for x in source_word_]
target_word_ = [f'{start_target} {x} {end_target}' for x in target_word_]

In [19]:
source_word_[0], target_word_[0]

('чего ты смеёшься', 'sos цæуыл худыс eos')

# Data spliting

In [20]:
X = source_word_
Y = target_word_

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.1)
len(X_train),len(y_train), len(X_test), len(y_test)

(423, 423, 47, 47)

In [22]:
X[0], Y[0]

('чего ты смеёшься', 'sos цæуыл худыс eos')

## Data preparing for encoder and decoder

In [23]:
# preparing data for the word embedding
def Max_length(data):
    max_length_ = max([len(x.split(' ')) for x in data])
    return max_length_

In [24]:
#Training data
max_lenght_source = Max_length(X_train)
max_lenght_target = Max_length(y_train)

#Test data
max_lenght_source_test = Max_length(X_test)
max_lenght_target_test = Max_length(y_test)

In [25]:
max_lenght_target, max_lenght_source

(25, 21)

In [26]:
def tokenizer_(text_data):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(text_data)
    return tokenizer

tokenizer_input = tokenizer_(X_train)
vocab_size_input = len(tokenizer_input.word_index) + 1
tokenizer_target = tokenizer_(y_train)
vocab_size_target = len(tokenizer_target.word_index) + 1

In [27]:
with open(f'{dir}-{epochs}-tokenizer_input.pkl','wb') as f:
    pkl.dump(tokenizer_input, f)

with open(f'{dir}-{epochs}-tokenizer_target.pkl','wb') as f:
    pkl.dump(tokenizer_target, f)
    
pkl.dump(tokenizer_input, open(f'{dir}-{epochs}-tokenizer_input.pkl', 'wb'))
pkl.dump(tokenizer_target, open(f'{dir}-{epochs}-tokenizer_target.pkl', 'wb'))

In [28]:
vocab_size_input, vocab_size_target

(879, 891)

In [29]:
def generator_batch(X= X_train,Y=y_train, batch_size=128):
    while True:
        for j in range(0, len(X), batch_size):
            encoder_data_input = np.zeros((batch_size,max_lenght_source),dtype='float32') #metrix of batch_size*max_length_english
            decoder_data_input = np.zeros((batch_size,max_lenght_target),dtype='float32') #metrix of batch_size*max_length_marathi
            decoder_target_input = np.zeros((batch_size,max_lenght_target,vocab_size_target),dtype='float32') # 3d array one hot encoder decoder target data
            for i, (input_text,target_text) in enumerate(zip(X[j:j+batch_size],Y[j:j+batch_size])):
                for t, word in enumerate(input_text.split()):
                    encoder_data_input[i,t] = tokenizer_input.word_index[word] # Here we are storing the encoder 
                                                                         #seq in row here padding is done automaticaly as 
                                                                         #we have defined col as max_lenght
                for t, word in enumerate(target_text.split()):
                    # if word == 'START_':
                    #   word = 'start'
                    # elif word == 'END_':
                    #   word = 'end'
                    decoder_data_input[i,t] = tokenizer_target.word_index[word] # same for the decoder sequence
                    if t>0:
                        decoder_target_input[i,t-1,tokenizer_target.word_index[word]] = 1 #target is one timestep ahead of decoder input because it does not have 'start tag'
            # print(encoder_data_input.shape())
            yield ([encoder_data_input,decoder_data_input],decoder_target_input)

In [30]:
# Define an input sequence and process it.
encoder_inputs = Input(shape=(None,),name="encoder_inputs")
emb_layer_encoder = Embedding(vocab_size_input, HIDDEN_DIM, mask_zero=True)(encoder_inputs)
encoder = GRU(HIDDEN_DIM, return_state=True)
encoder_outputs, state_h = encoder(emb_layer_encoder)
# encoder_outputs, state_h, state_c = encoder(emb_layer_encoder)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None,),name="decoder_inputs")
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
emb_layer_decoder = Embedding(vocab_size_target,HIDDEN_DIM, mask_zero=True)(decoder_inputs)
decoder_gru = GRU(HIDDEN_DIM, return_sequences=True, return_state=True)
decoder_outputs, _ = decoder_gru(emb_layer_decoder, initial_state=encoder_states)
decoder_dense = Dense(vocab_size_target, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [31]:
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

In [32]:
plot_model(model, to_file=f'{dir}-{epochs}-train_model.png', show_shapes=True)

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model/model_to_dot to work.


In [33]:
train_samples = len(X_train)
val_samples = len(X_test)

In [34]:
# %%capture
model.fit_generator(generator = generator_batch(X_train, y_train, batch_size = batch_size), steps_per_epoch = train_samples//batch_size, epochs=epochs)

Epoch 1/160


  model.fit_generator(generator = generator_batch(X_train, y_train, batch_size = batch_size), steps_per_epoch = train_samples//batch_size, epochs=epochs)


Epoch 2/160
Epoch 3/160
Epoch 4/160
Epoch 5/160
Epoch 6/160
Epoch 7/160
Epoch 8/160
Epoch 9/160
Epoch 10/160
Epoch 11/160
Epoch 12/160
Epoch 13/160
Epoch 14/160
Epoch 15/160
Epoch 16/160
Epoch 17/160
Epoch 18/160
Epoch 19/160
Epoch 20/160
Epoch 21/160
Epoch 22/160
Epoch 23/160
Epoch 24/160
Epoch 25/160
Epoch 26/160
Epoch 27/160
Epoch 28/160
Epoch 29/160
Epoch 30/160
Epoch 31/160
Epoch 32/160
Epoch 33/160
Epoch 34/160
Epoch 35/160
Epoch 36/160
Epoch 37/160
Epoch 38/160
Epoch 39/160
Epoch 40/160
Epoch 41/160
Epoch 42/160
Epoch 43/160
Epoch 44/160
Epoch 45/160
Epoch 46/160
Epoch 47/160
Epoch 48/160
Epoch 49/160
Epoch 50/160
Epoch 51/160
Epoch 52/160
Epoch 53/160
Epoch 54/160
Epoch 55/160
Epoch 56/160
Epoch 57/160
Epoch 58/160
Epoch 59/160
Epoch 60/160
Epoch 61/160
Epoch 62/160
Epoch 63/160
Epoch 64/160
Epoch 65/160
Epoch 66/160
Epoch 67/160
Epoch 68/160
Epoch 69/160
Epoch 70/160
Epoch 71/160
Epoch 72/160
Epoch 73/160
Epoch 74/160
Epoch 75/160
Epoch 76/160
Epoch 77/160
Epoch 78/160
Epoch 7

<keras.callbacks.History at 0x292e8062550>

In [44]:
model_json = model.to_json()
with open(f'{dir}-{epochs}-model.json', "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights(f'{dir}-{epochs}-model_weight.h5')
print("Saved model to disk")

Saved model to disk


In [45]:
json_file = open(f'{dir}-{epochs}-model.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
model_loaded = model_from_json(loaded_model_json)

model_loaded.load_weights(f'{dir}-{epochs}-model_weight.h5')

In [62]:
encoder_inputs_inf = model_loaded.input[0]
encoder_outputs_inf, inf_state_h = model_loaded.layers[4].output
encoder_inf_states = [inf_state_h]
encoder_model = Model(encoder_inputs_inf,encoder_inf_states)

In [63]:
decoder_state_h_input = Input(shape=(HIDDEN_DIM,))
decoder_state_c_input = Input(shape=(HIDDEN_DIM,))
decoder_state_input = [decoder_state_h_input]

decoder_input_inf = model_loaded.input[1]
decoder_emb_inf = model_loaded.layers[3](decoder_input_inf)
decoder_gru_inf = model_loaded.layers[5]
decoder_output_inf, decoder_state_h_inf = decoder_gru_inf(decoder_emb_inf, initial_state=decoder_state_input)
decoder_state_inf = [decoder_state_h_inf]
dense_inf = model_loaded.layers[6]
decoder_output_final = dense_inf(decoder_output_inf)

decoder_model = Model([decoder_input_inf] + decoder_state_input, [decoder_output_final] + decoder_state_inf)

In [64]:
with open(f'{dir}-{epochs}-tokenizer_input.pkl','rb') as f:
    tokenizer_input = pkl.load(f)
with open(f'{dir}-{epochs}-tokenizer_target.pkl','rb') as f:
    tokenizer_target = pkl.load(f)

reverse_word_map_input = dict(map(reversed, tokenizer_input.word_index.items()))
reverse_word_map_target = dict(map(reversed, tokenizer_target.word_index.items()))

In [69]:
def decode_seq(input_seq):
    state_values_encoder = encoder_model.predict(input_seq)
    target_seq = np.zeros((1,1))
    target_seq[0, 0] = tokenizer_target.word_index[start_target]
    stop_condition = False
    decoder_sentance = ''

    while not stop_condition:
        sample_word, decoder_h = decoder_model.predict([target_seq] + state_values_encoder)
        sample_word_index = np.argmax(sample_word[0,-1,:])
        decoder_word = reverse_word_map_target[sample_word_index]
        decoder_sentance += ' '+ decoder_word
        if (decoder_word == end_target or 
            len(decoder_sentance) > 70):
            stop_condition = True
        target_seq[0, 0] = sample_word_index
        state_values_encoder = [decoder_h]
        
    return decoder_sentance

In [70]:
for i in range(4):
    sentance = X_test[i]
    original_target = y_test[i]
    input_seq = tokenizer_input.texts_to_sequences([sentance])
    pad_sequence = pad_sequences(input_seq, maxlen= 30, padding='post')
    print(sentance)
    print(pad_sequence)
    predicted_target = decode_seq(pad_sequence)
    print("Test sentance: ",i+1)
    print("sentance: ",sentance)
    print("origianl translate:",original_target[3:-3])
    print("predicted Translate:",predicted_target[:-3])
    print("=="*50)

она говорит порусски
[[ 28  56 513   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0]]


ValueError: in user code:

    File "D:\Programms\Anaconda\lib\site-packages\keras\engine\training.py", line 1801, in predict_function  *
        return step_function(self, iterator)
    File "D:\Programms\Anaconda\lib\site-packages\keras\engine\training.py", line 1790, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "D:\Programms\Anaconda\lib\site-packages\keras\engine\training.py", line 1783, in run_step  **
        outputs = model.predict_step(data)
    File "D:\Programms\Anaconda\lib\site-packages\keras\engine\training.py", line 1751, in predict_step
        return self(x, training=False)
    File "D:\Programms\Anaconda\lib\site-packages\keras\utils\traceback_utils.py", line 67, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "D:\Programms\Anaconda\lib\site-packages\keras\engine\input_spec.py", line 200, in assert_input_compatibility
        raise ValueError(f'Layer "{layer_name}" expects {len(input_spec)} input(s),'

    ValueError: Layer "model_8" expects 2 input(s), but it received 1 input tensors. Inputs received: [<tf.Tensor 'IteratorGetNext:0' shape=(None, 50) dtype=float32>]


In [None]:
for i in range(20):
    sentance = X_train[i]
    original_target = y_train[i]
    input_seq = tokenizer_input.texts_to_sequences([sentance])
    pad_sequence = pad_sequences(input_seq, maxlen= 30, padding='post')
    predicted_target = decode_seq(pad_sequence)
    print("Test sentance: ",i+1)
    print("sentance: ",sentance)
    print("origianl translate:",original_target[3:-3])
    print("predicted Translate:",predicted_target[:-3])
    print("=="*50)

In [None]:
sentance = str(input())
input_seq = tokenizer_input.texts_to_sequences([sentance])
pad_sequence = pad_sequences(input_seq, maxlen= 30, padding='post')
predicted_target = decode_seq(pad_sequence)
print("predicted Translate:",predicted_target[:-3])