In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import string
from string import digits
import re
import os
from sklearn.utils import shuffle
from keras.preprocessing.sequence import pad_sequences
from keras.layers import LSTM, Input, Dense,Embedding
from keras.models import Model,load_model
from tensorflow.keras.utils import plot_model
from keras.preprocessing.text import one_hot
from keras.preprocessing.text import Tokenizer
from keras.models import model_from_json
import pickle as pkl
import numpy as np

In [2]:
dir = 'rus-oss.txt'
cur_path = os.path.abspath('')
new_path = os.path.relpath(f'../Date/{dir}', cur_path)
with open(new_path, encoding='utf-8') as f:
    data = f.read()

In [3]:
uncleaned_data_list = data.split('\n')
uncleaned_data_list = uncleaned_data_list[:38695]

rus_word = []
oss_word = []

for word in uncleaned_data_list:
    rus_word.append(word.split('\t')[0])
    oss_word.append(word.split('\t')[1])

In [4]:
language_data = pd.DataFrame(columns=['Russia','Ossetian'])
language_data['Russia'] = rus_word
language_data['Ossetian'] = oss_word

In [5]:
# saving to csv
language_data.to_csv(f'{dir}-language_data.csv', index=False)

In [6]:
# loading data from csv
language_data = pd.read_csv(f'{dir}-language_data.csv')

In [7]:
language_data.head()

Unnamed: 0,Russia,Ossetian
0,Чего ты смеёшься?,Цæуыл худыс?
1,Этот нож очень острый.,Ацы кард тынг цыргъ у.
2,У кошки девять жизней.,Гæдыйæн фараст царды ис.
3,Сегодня облачно.,Абон у асæст.
4,Он был вождём своего племени 35 лет.,Уый йæ знæмы раздзог уыдис 35 азы дæргъы.


In [8]:
language_data.tail()

Unnamed: 0,Russia,Ossetian
425,Сегодня будет дождь?,Абон уардзæн?
426,Какое сегодня число?,Абон кæцы бон у?
427,Какой сегодня день?,Цы бон у абон?
428,Вы говорите по-осетински?,Иронау дзурут?
429,Вы кого-то ищете?,Искæй агурут?


In [9]:
rus_text = language_data['Russia'].values
oss_text = language_data['Ossetian'].values

In [10]:
rus_text[0], oss_text[0]

('Чего ты смеёшься?', 'Цæуыл худыс?')

In [11]:
#lowercasing the setences
rus_text_ = [x.lower() for x in rus_text]
oss_text_ = [x.lower() for x in oss_text]

In [12]:
rus_text_ = [re.sub("'",'',x) for x in rus_text_]
oss_text_ = [re.sub("'",'',x) for x in oss_text_]

In [13]:
# remove puntuation
def remove_punc(text_list):
    table = str.maketrans('', '', string.punctuation)
    removed_punc_text = []
    for sent in text_list:
        sentance = [w.translate(table) for w in sent.split(' ')]
        removed_punc_text.append(' '.join(sentance))
    return removed_punc_text

In [14]:
rus_text_ = remove_punc(rus_text_)
oss_text_ = remove_punc(oss_text_)

In [15]:
# removing the digits from russian sentances
remove_digits = str.maketrans('', '', digits)
removed_digits_text = []

In [16]:
for sent in rus_text_:
    sentance = [w.translate(remove_digits) for w in sent.split(' ')]
    removed_digits_text.append(' '.join(sentance))
    
rus_text_ = removed_digits_text

In [17]:
rus_text_ = [x.strip() for x in rus_text_]
oss_text_ = [x.strip() for x in oss_text_]

In [18]:
rus_text_ = [re.sub(r"([,.!?])", r"", x) for x in rus_text_]
oss_text_ = [re.sub(r"([,.!?])", r"", x) for x in oss_text_]

In [19]:
# rus_text_ = ["<sos> " + x + " <eos>" for x in rus_text_]
oss_text_ = ["start " + x + " end" for x in oss_text_]

In [20]:
oss_text_[0], rus_text_[0]

('start цæуыл худыс end', 'чего ты смеёшься')

# Data spliting

In [21]:
X = rus_text_
Y = oss_text_

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X,Y,test_size = 0.1)
len(X_train),len(y_train), len(X_test), len(y_test)

(387, 387, 43, 43)

In [23]:
X[0], Y[0]

('чего ты смеёшься', 'start цæуыл худыс end')

## Data preparing for encoder and decoder

In [24]:
# preparing data for the word embedding
def Max_length(data):
    max_length_ = max([len(x.split(' ')) for x in data])
    return max_length_

In [25]:
#Training data
max_length_rus = Max_length(X_train)
max_lenght_oss = Max_length(y_train)

#Test data
max_length_rus_test = Max_length(X_test)
max_lenght_oss_test = Max_length(y_test)

In [26]:
max_length_rus, max_lenght_oss

(21, 25)

In [27]:
def tokenizer_(text_data):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(text_data)
    return tokenizer

tokenizer_input = tokenizer_(X_train)
vocab_size_input = len(tokenizer_input.word_index) + 1
tokenizer_target = tokenizer_(y_train)
vocab_size_target = len(tokenizer_target.word_index) + 1

In [28]:
with open(f'{dir}-tokenizer_input.pkl','wb') as f:
    pkl.dump(tokenizer_input,f)

with open(f'{dir}-tokenizer_target.pkl','wb') as f:
    pkl.dump(tokenizer_target,f)
    
pkl.dump(tokenizer_input, open(f'{dir}-tokenizer_input.pkl', 'wb'))
pkl.dump(tokenizer_target, open(f'{dir}-tokenizer_target.pkl', 'wb'))

In [29]:
vocab_size_input, vocab_size_target

(862, 864)

In [30]:
def generator_batch(X= X_train,Y=y_train, batch_size=128):
    while True:
        for j in range(0, len(X), batch_size):
            encoder_data_input = np.zeros((batch_size,max_length_rus),dtype='float32') #metrix of batch_size*max_length_english
            decoder_data_input = np.zeros((batch_size,max_lenght_oss),dtype='float32') #metrix of batch_size*max_length_marathi
            decoder_target_input = np.zeros((batch_size,max_lenght_oss,vocab_size_target),dtype='float32') # 3d array one hot encoder decoder target data
            for i, (input_text,target_text) in enumerate(zip(X[j:j+batch_size],Y[j:j+batch_size])):
                for t, word in enumerate(input_text.split()):
                    encoder_data_input[i,t] = tokenizer_input.word_index[word] # Here we are storing the encoder 
                                                                         #seq in row here padding is done automaticaly as 
                                                                         #we have defined col as max_lenght
                for t, word in enumerate(target_text.split()):
                    # if word == 'START_':
                    #   word = 'start'
                    # elif word == 'END_':
                    #   word = 'end'
                    decoder_data_input[i,t] = tokenizer_target.word_index[word] # same for the decoder sequence
                    if t>0:
                        decoder_target_input[i,t-1,tokenizer_target.word_index[word]] = 1 #target is one timestep ahead of decoder input because it does not have 'start tag'
            # print(encoder_data_input.shape())
        yield ([encoder_data_input,decoder_data_input],decoder_target_input)

In [31]:
latent_dim = 50
# Define an input sequence and process it.
encoder_inputs = Input(shape=(None,),name="encoder_inputs")
emb_layer_encoder = Embedding(vocab_size_input,latent_dim, mask_zero=True)(encoder_inputs)
encoder = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(emb_layer_encoder)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None,),name="decoder_inputs")
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
emb_layer_decoder = Embedding(vocab_size_target,latent_dim, mask_zero=True)(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(emb_layer_decoder, initial_state=encoder_states)
decoder_dense = Dense(vocab_size_target, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [32]:
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

In [33]:
plot_model(model, to_file=f'{dir}-train_model.png', show_shapes=True)

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model/model_to_dot to work.


In [34]:
train_samples = len(X_train)
val_samples = len(X_test)
batch_size = 10
epochs = 100

In [35]:
# %%capture
model.fit_generator(generator = generator_batch(X_train, y_train, batch_size = batch_size), steps_per_epoch = train_samples//batch_size, epochs=epochs)

Epoch 1/100


  model.fit_generator(generator = generator_batch(X_train, y_train, batch_size = batch_size), steps_per_epoch = train_samples//batch_size, epochs=epochs)


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 7

<keras.callbacks.History at 0x1f7489e0d60>

In [36]:
model_json = model.to_json()
with open(f'{dir}-model_2.json', "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights(f'{dir}-model_weight_5.h5')
print("Saved model to disk")

Saved model to disk


In [37]:
# loading the model architecture and asigning the weights
json_file = open(f'{dir}-model_2.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
model_loaded = model_from_json(loaded_model_json)
# load weights into new model
model_loaded.load_weights(f'{dir}-model_weight_5.h5')

In [38]:
latent_dim = 50
#inference encoder
encoder_inputs_inf = model_loaded.input[0] #Trained encoder input layer
encoder_outputs_inf, inf_state_h, inf_state_c = model_loaded.layers[4].output # retoring the encoder lstm output and states
encoder_inf_states = [inf_state_h,inf_state_c]
encoder_model = Model(encoder_inputs_inf,encoder_inf_states)

In [39]:
latent_dim = 50
#inference encoder
encoder_inputs_inf = model_loaded.input[0] #Trained encoder input layer
encoder_outputs_inf, inf_state_h, inf_state_c = model_loaded.layers[4].output # retoring the encoder lstm output and states
encoder_inf_states = [inf_state_h,inf_state_c]
encoder_model = Model(encoder_inputs_inf,encoder_inf_states)

In [40]:
#inference decoder
# The following tensor will store the state of the previous timestep in the "starting the encoder final time step"
decoder_state_h_input = Input(shape=(latent_dim,)) #becase during training we have set the lstm unit to be of 50
decoder_state_c_input = Input(shape=(latent_dim,))
decoder_state_input = [decoder_state_h_input,decoder_state_c_input]

# # inference decoder input
decoder_input_inf = model_loaded.input[1] #Trained decoder input layer
# decoder_input_inf._name='decoder_input'
decoder_emb_inf = model_loaded.layers[3](decoder_input_inf)
decoder_lstm_inf = model_loaded.layers[5]
decoder_output_inf, decoder_state_h_inf, decoder_state_c_inf = decoder_lstm_inf(decoder_emb_inf, initial_state =decoder_state_input)
decoder_state_inf = [decoder_state_h_inf,decoder_state_c_inf]
#inference dense layer
dense_inf = model_loaded.layers[6]
decoder_output_final = dense_inf(decoder_output_inf)# A dense softmax layer to generate prob dist. over the target vocabulary

decoder_model = Model([decoder_input_inf]+decoder_state_input,[decoder_output_final]+decoder_state_inf)

In [41]:
with open(f'{dir}-tokenizer_input.pkl','rb') as f:
    tokenizer_input = pkl.load(f)
with open(f'{dir}-tokenizer_target.pkl','rb') as f:
    tokenizer_target = pkl.load(f)
# Creating the reverse mapping to get the word from the index in the sequence
reverse_word_map_input = dict(map(reversed, tokenizer_input.word_index.items()))
reverse_word_map_target = dict(map(reversed, tokenizer_target.word_index.items()))

In [42]:
def decode_seq(input_seq):
    # print("input_seq=>",input_seq)
    state_values_encoder = encoder_model.predict(input_seq)
    # intialize the target seq with start tag
    target_seq = np.zeros((1,1))
    target_seq[0, 0] = tokenizer_target.word_index['start']
    # print("target_seq:=>",target_seq)
    stop_condition = False
    decoder_sentance = ''
    # print("Beforee the while loop")
    while not stop_condition:
        sample_word , decoder_h,decoder_c= decoder_model.predict([target_seq] + state_values_encoder)
        # print("sample_word: =>",sample_word)
        sample_word_index = np.argmax(sample_word[0,-1,:])
        # print("sample_word_index: ",sample_word_index)
        decoder_word = reverse_word_map_target[sample_word_index]
        decoder_sentance += ' '+ decoder_word
        # print("decoded word:=>",decoder_word)
        # print(len(decoder_sentance))
        # print("len(decoder_sentance) > 70: ",len(decoder_sentance) > 70)
        # print('decoder_word == "end"',decoder_word == 'end')
        # print(decoder_word == 'end' or len(decoder_sentance) > 70)
        # stop condition for the while loop
        if (decoder_word == 'end' or 
            len(decoder_sentance) > 70):
            stop_condition = True
            # print("from if condition")
        # target_seq = np.zeros((1,1))
        target_seq[0, 0] = sample_word_index
        # print(target_seq)
        state_values_encoder = [decoder_h,decoder_c]
    return decoder_sentance

In [43]:
for i in range(10):
    sentance = X_test[i]
    original_target = y_test[i]
    input_seq = tokenizer_input.texts_to_sequences([sentance])
    pad_sequence = pad_sequences(input_seq, maxlen= 30, padding='post')
    # print('input_sequence =>',input_seq)
    # print("pad_seq=>",pad_sequence)
    predicted_target = decode_seq(pad_sequence)
    print("Test sentance: ",i+1)
    print("sentance: ",sentance)
    print("origianl translate:",original_target[6:-4])
    print("predicted Translate:",predicted_target[:-4])
    print("=="*50)

Test sentance:  1
sentance:  столицей чего является владикавказ
origianl translate: дзæуджыхъæу кæй сæйраг горæт у
predicted Translate:  том мæнæй бирæ хистæрдæр у
Test sentance:  2
sentance:  после завтра мне надо пойти в лес
origianl translate: иннæ бон мæ хъæумæ ныууайын хъæуы
predicted Translate:  æхсæрдæс азы
Test sentance:  3
sentance:  он жил здесь десять лет назад
origianl translate: уый ам царди дæс азы размæ
predicted Translate:  том махӕй никӕйы нӕ базыдта
Test sentance:  4
sentance:  я хотел бы ей позвонить у тебя есть её номер телефона
origianl translate: мæн фæндыди йæм бадзурын йæ номыр дæм ис
predicted Translate:  ӕз нӕ зонын кӕм дӕн
Test sentance:  5
sentance:  на чью учебу влияют учителя
origianl translate: кæй ахуырыл аудынц ахуыргæнджытæ
predicted Translate:  том махӕй никӕйы нӕ базыдта
Test sentance:  6
sentance:  он был вождём своего племени  лет
origianl translate: уый йæ знæмы раздзог уыдис 35 азы дæргъы
predicted Translate:  том мæнæй бирæ хистæрдæр у
Test sent