In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import string
from string import digits
import re
import os
from sklearn.utils import shuffle
from keras.preprocessing.sequence import pad_sequences
from keras.layers import LSTM, GRU, Input, Dense,Embedding
from keras.models import Model,load_model
from tensorflow.keras.utils import plot_model
from keras.preprocessing.text import one_hot
from keras.preprocessing.text import Tokenizer
from keras.models import model_from_json
from nltk.tokenize import RegexpTokenizer

import pickle as pkl
import numpy as np

In [2]:
dir = 'rus-oss.txt'
new_path = f'Date/{dir}'
with open(new_path, encoding='utf-8') as f:
    data = f.read()

In [3]:
uncleaned_data_list = data.split('\n')
uncleaned_data_list = uncleaned_data_list[:38695]

source_word = []
target_word = []
start_target = "sos"
end_target = "eos"

HIDDEN_DIM = 50
batch_size = 6
epochs = 20

for word in uncleaned_data_list:
    source_word.append(word.split('\t')[0])
    target_word.append(word.split('\t')[1])

In [4]:
language_data = pd.DataFrame(columns=['Source','Target'])
language_data['Source'] = source_word
language_data['Target'] = target_word

In [5]:
# saving to csv
language_data.to_csv(f'{dir}-language_data.csv', index=False)

In [6]:
# loading data from csv
language_data = pd.read_csv(f'{dir}-language_data.csv')

In [7]:
language_data.head()

Unnamed: 0,Source,Target
0,Чего ты смеёшься?,Цæуыл худыс?
1,Этот нож очень острый.,Ацы кард тынг цыргъ у.
2,У кошки девять жизней.,Гæдыйæн фараст царды ис.
3,Сегодня облачно.,Абон у асæст.
4,Он был вождём своего племени 35 лет.,Уый йæ знæмы раздзог уыдис 35 азы дæргъы.


In [8]:
language_data.tail()

Unnamed: 0,Source,Target
465,Делать,Кæнын
466,Говорить,Дзурын
467,Работать,Кусын
468,Жить,Цæрын
469,Кушать,Хæрын


In [9]:
source_word = language_data['Source'].values
target_word = language_data['Target'].values

In [10]:
source_word[0], target_word[0]

('Чего ты смеёшься?', 'Цæуыл худыс?')

In [11]:
#lowercasing the setences
source_word_ = [x.lower() for x in source_word]
target_word_ = [x.lower() for x in target_word]

In [12]:
source_word_ = [re.sub("'",'',x) for x in source_word_]
target_word_ = [re.sub("'",'',x) for x in target_word_]

In [13]:
source_word_ = [x.strip() for x in source_word_]
target_word_ = [x.strip() for x in target_word_]

In [14]:
source_word_ = [re.sub(r"[^\w\s]", r"", x) for x in source_word_]
target_word_ = [re.sub(r"[^\w\s]", r"", x) for x in target_word_]

In [15]:
source_word_ = [re.sub(r"\d", r"", x) for x in source_word_]
target_word_ = [re.sub(r"\d", r"", x) for x in target_word_]

In [16]:
source_word_ = [re.sub('"', '', x) for x in source_word_]
target_word_ = [re.sub('"', '', x) for x in target_word_]

In [17]:
source_word_ = [re.sub(r"ӕ", r"æ", x) for x in source_word_]

In [18]:
# source_word_ = [f'{start_target} {x} {end_target}' for x in source_word_]
target_word_ = [f'{start_target} {x} {end_target}' for x in target_word_]

In [19]:
source_word_[0], target_word_[0]

('чего ты смеёшься', 'sos цæуыл худыс eos')

# Data spliting

In [20]:
X = source_word_
Y = target_word_

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.1)
len(X_train),len(y_train), len(X_test), len(y_test)

(423, 423, 47, 47)

In [22]:
X[0], Y[0]

('чего ты смеёшься', 'sos цæуыл худыс eos')

## Data preparing for encoder and decoder

In [23]:
# preparing data for the word embedding
def Max_length(data):
    max_length_ = max([len(x.split(' ')) for x in data])
    return max_length_

In [24]:
#Training data
max_lenght_source = Max_length(X_train)
max_lenght_target = Max_length(y_train)

#Test data
max_lenght_source_test = Max_length(X_test)
max_lenght_target_test = Max_length(y_test)

In [25]:
max_lenght_target, max_lenght_source

(25, 21)

In [26]:
def tokenizer_(text_data):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(text_data)
    return tokenizer

tokenizer_input = tokenizer_(X_train)
vocab_size_input = len(tokenizer_input.word_index) + 1
tokenizer_target = tokenizer_(y_train)
vocab_size_target = len(tokenizer_target.word_index) + 1

In [27]:
with open(f'{dir}-{epochs}-tokenizer_input.pkl','wb') as f:
    pkl.dump(tokenizer_input, f)

with open(f'{dir}-{epochs}-tokenizer_target.pkl','wb') as f:
    pkl.dump(tokenizer_target, f)
    
pkl.dump(tokenizer_input, open(f'{dir}-{epochs}-tokenizer_input.pkl', 'wb'))
pkl.dump(tokenizer_target, open(f'{dir}-{epochs}-tokenizer_target.pkl', 'wb'))

In [28]:
vocab_size_input, vocab_size_target

(874, 886)

In [29]:
def generator_batch(X= X_train,Y=y_train, batch_size=128):
    while True:
        for j in range(0, len(X), batch_size):
            encoder_data_input = np.zeros((batch_size,max_lenght_source),dtype='float32') #metrix of batch_size*max_length_english
            decoder_data_input = np.zeros((batch_size,max_lenght_target),dtype='float32') #metrix of batch_size*max_length_marathi
            decoder_target_input = np.zeros((batch_size,max_lenght_target,vocab_size_target),dtype='float32') # 3d array one hot encoder decoder target data
            for i, (input_text,target_text) in enumerate(zip(X[j:j+batch_size],Y[j:j+batch_size])):
                for t, word in enumerate(input_text.split()):
                    encoder_data_input[i,t] = tokenizer_input.word_index[word] # Here we are storing the encoder 
                                                                         #seq in row here padding is done automaticaly as 
                                                                         #we have defined col as max_lenght
                for t, word in enumerate(target_text.split()):
                    # if word == 'START_':
                    #   word = 'start'
                    # elif word == 'END_':
                    #   word = 'end'
                    decoder_data_input[i,t] = tokenizer_target.word_index[word] # same for the decoder sequence
                    if t>0:
                        decoder_target_input[i,t-1,tokenizer_target.word_index[word]] = 1 #target is one timestep ahead of decoder input because it does not have 'start tag'
            # print(encoder_data_input.shape())
            yield ([encoder_data_input,decoder_data_input],decoder_target_input)

In [30]:
# Define an input sequence and process it.
encoder_inputs = Input(shape=(None,),name="encoder_inputs")
emb_layer_encoder = Embedding(vocab_size_input, HIDDEN_DIM, mask_zero=True)(encoder_inputs)
encoder = LSTM(HIDDEN_DIM, return_state=True)
encoder_outputs, state_h, state_c = encoder(emb_layer_encoder)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None,),name="decoder_inputs")
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
emb_layer_decoder = Embedding(vocab_size_target,HIDDEN_DIM, mask_zero=True)(decoder_inputs)
decoder_lstm = LSTM(HIDDEN_DIM, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(emb_layer_decoder, initial_state=encoder_states)
decoder_dense = Dense(vocab_size_target, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [31]:
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

In [32]:
plot_model(model, to_file=f'{dir}-{epochs}-train_model.png', show_shapes=True)

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model/model_to_dot to work.


In [33]:
train_samples = len(X_train)
val_samples = len(X_test)

In [34]:
# %%capture
model.fit_generator(generator = generator_batch(X_train, y_train, batch_size = batch_size), steps_per_epoch = train_samples//batch_size, epochs=epochs)

Epoch 1/20


  model.fit_generator(generator = generator_batch(X_train, y_train, batch_size = batch_size), steps_per_epoch = train_samples//batch_size, epochs=epochs)


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x229cce88790>

In [35]:
model_json = model.to_json()
with open(f'{dir}-{epochs}-model.json', "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights(f'{dir}-{epochs}-model_weight.h5')
print("Saved model to disk")

Saved model to disk


In [36]:
json_file = open(f'{dir}-{epochs}-model.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
model_loaded = model_from_json(loaded_model_json)

model_loaded.load_weights(f'{dir}-{epochs}-model_weight.h5')

In [37]:
encoder_inputs_inf = model_loaded.input[0]
encoder_outputs_inf, inf_state_h, inf_state_c = model_loaded.layers[4].output
encoder_inf_states = [inf_state_h,inf_state_c]
encoder_model = Model(encoder_inputs_inf,encoder_inf_states)

In [38]:
decoder_state_h_input = Input(shape=(HIDDEN_DIM,))
decoder_state_c_input = Input(shape=(HIDDEN_DIM,))
decoder_state_input = [decoder_state_h_input,decoder_state_c_input]

decoder_input_inf = model_loaded.input[1]
decoder_emb_inf = model_loaded.layers[3](decoder_input_inf)
decoder_lstm_inf = model_loaded.layers[5]
decoder_output_inf, decoder_state_h_inf, decoder_state_c_inf = decoder_lstm_inf(decoder_emb_inf, initial_state =decoder_state_input)
decoder_state_inf = [decoder_state_h_inf,decoder_state_c_inf]
dense_inf = model_loaded.layers[6]
decoder_output_final = dense_inf(decoder_output_inf)

decoder_model = Model([decoder_input_inf]+decoder_state_input,[decoder_output_final]+decoder_state_inf)

In [39]:
with open(f'{dir}-{epochs}-tokenizer_input.pkl','rb') as f:
    tokenizer_input = pkl.load(f)
with open(f'{dir}-{epochs}-tokenizer_target.pkl','rb') as f:
    tokenizer_target = pkl.load(f)

reverse_word_map_input = dict(map(reversed, tokenizer_input.word_index.items()))
reverse_word_map_target = dict(map(reversed, tokenizer_target.word_index.items()))

In [40]:
def decode_seq(input_seq):
    state_values_encoder = encoder_model.predict(input_seq)
    target_seq = np.zeros((1,1))
    target_seq[0, 0] = tokenizer_target.word_index[start_target]
    stop_condition = False
    decoder_sentance = ''

    while not stop_condition:
        sample_word , decoder_h,decoder_c= decoder_model.predict([target_seq] + state_values_encoder)
        sample_word_index = np.argmax(sample_word[0,-1,:])
        decoder_word = reverse_word_map_target[sample_word_index]
        decoder_sentance += ' '+ decoder_word
        if (decoder_word == end_target or 
            len(decoder_sentance) > 70):
            stop_condition = True
        target_seq[0, 0] = sample_word_index
        state_values_encoder = [decoder_h,decoder_c]
    return decoder_sentance

In [41]:
for i in range(4):
    sentance = X_test[i]
    original_target = y_test[i]
    input_seq = tokenizer_input.texts_to_sequences([sentance])
    pad_sequence = pad_sequences(input_seq, maxlen= 30, padding='post')
    predicted_target = decode_seq(pad_sequence)
    print("Test sentance: ",i+1)
    print("sentance: ",sentance)
    print("origianl translate:",original_target[3:-3])
    print("predicted Translate:",predicted_target[:-3])
    print("=="*50)

Test sentance:  1
sentance:  завтра надо идти на работу
origianl translate:  райсом мæ куыстмæ цæуын хъæуы 
predicted Translate:  байрай кæдæм 
Test sentance:  2
sentance:  знания просто так  к человеку не приходят
origianl translate:  зонындзинæдтæ адæймагмæ сæхæдæг не рцæуынц 
predicted Translate:  кæм ис ис 
Test sentance:  3
sentance:  всего хорошего
origianl translate:  дзæбæх ут 
predicted Translate:  кæсын 
Test sentance:  4
sentance:  родители купили ацамазу портфель
origianl translate:  ацæмæзæн ныййарджытæ балхæдтой пъартфел 
predicted Translate:  батырбег 


In [42]:
for i in range(20):
    sentance = X_train[i]
    original_target = y_train[i]
    input_seq = tokenizer_input.texts_to_sequences([sentance])
    pad_sequence = pad_sequences(input_seq, maxlen= 30, padding='post')
    predicted_target = decode_seq(pad_sequence)
    print("Test sentance: ",i+1)
    print("sentance: ",sentance)
    print("origianl translate:",original_target[3:-3])
    print("predicted Translate:",predicted_target[:-3])
    print("=="*50)

Test sentance:  1
sentance:  спасибо кола а сам ты нигде не был
origianl translate:  бузныг къола æмæ дæхæдæг та никуы уыдтæ 
predicted Translate:  мæ фыд æмæ æмæ æмæ æмæ æмæ æмæ фæнды хъæбулы хъæбулы хъæбулы хъæбулы хъæб
Test sentance:  2
sentance:  каждый год там бываешь
origianl translate:  уырдæм фæцæуыс алы аз дæр 
predicted Translate:  æз дæр хорз у 
Test sentance:  3
sentance:  спеть вам песенку
origianl translate:  азарон уын зарæг 
predicted Translate:  дзæуджыхъæу кæдæм 
Test sentance:  4
sentance:  все хорошо спасибо
origianl translate:  бузныг ницы мын у 
predicted Translate:  æз 
Test sentance:  5
sentance:  она была моей первой любовью
origianl translate:  уый уыди мæ фыццаг уарзондзинат 
predicted Translate:  мæ фыд æмæ цæры йæ 
Test sentance:  6
sentance:  у тебя есть я
origianl translate:  дӕумӕ ӕз дӕн 
predicted Translate:  том фæнды 
Test sentance:  7
sentance:  этот нож очень острый
origianl translate:  ацы кард тынг цыргъ у 
predicted Translate:  том нæ ис 
Test se

In [44]:
# sentance = str(input())
# input_seq = tokenizer_input.texts_to_sequences([sentance])
# pad_sequence = pad_sequences(input_seq, maxlen= 30, padding='post')
# predicted_target = decode_seq(pad_sequence)
# print("predicted Translate:",predicted_target[:-3])