In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import string
from string import digits
import re
from sklearn.utils import shuffle
from keras.preprocessing.sequence import pad_sequences
from keras.layers import LSTM, Input, Dense,Embedding
from keras.models import Model,load_model
from keras.utils.vis_utils import plot_model
from keras.preprocessing.text import one_hot
from keras.preprocessing.text import Tokenizer
from keras.models import model_from_json
import pickle as pkl
import numpy as np

In [2]:
with open('rus-oss.txt', encoding='utf-8') as f:
    data = f.read()

In [3]:
uncleaned_data_list = data.split('\n')
uncleaned_data_list = uncleaned_data_list[:38695]

rus_word = []
oss_word = []

for word in uncleaned_data_list:
    rus_word.append(word.split('\t')[0])
    oss_word.append(word.split('\t')[1])

In [4]:
language_data = pd.DataFrame(columns=['Russia','Ossetian'])
language_data['Russia'] = rus_word
language_data['Ossetian'] = oss_word

In [5]:
# saving to csv
language_data.to_csv('language_data.csv', index=False)

In [6]:
# loading data from csv
language_data = pd.read_csv('language_data.csv')

In [7]:
language_data.head()

Unnamed: 0,Russia,Ossetian
0,Чего ты смеёшься?,Цæуыл худыс?
1,Этот нож очень острый.,Ацы кард тынг цыргъ у.
2,У кошки девять жизней.,Гæдыйæн фараст царды ис.
3,Сегодня облачно.,Абон у асæст.
4,Он был вождём своего племени 35 лет.,Уый йæ знæмы раздзог уыдис 35 азы дæргъы.


In [8]:
language_data.tail()

Unnamed: 0,Russia,Ossetian
425,Сегодня будет дождь?,Абон уардзæн?
426,Какое сегодня число?,Абон кæцы бон у?
427,Какой сегодня день?,Цы бон у абон?
428,Вы говорите по-осетински?,Иронау дзурут?
429,Вы кого-то ищете?,Искæй агурут?


In [9]:
rus_text = language_data['Russia'].values
oss_text = language_data['Ossetian'].values

In [10]:
rus_text[0], oss_text[0]

('Чего ты смеёшься?', 'Цæуыл худыс?')

In [11]:
#lowercasing the setences
rus_text_ = [x.lower() for x in rus_text]
oss_text_ = [x.lower() for x in oss_text]

In [12]:
rus_text_ = [re.sub("'",'',x) for x in rus_text_]
oss_text_ = [re.sub("'",'',x) for x in oss_text_]

In [13]:
# remove puntuation
def remove_punc(text_list):
    table = str.maketrans('', '', string.punctuation)
    removed_punc_text = []
    for sent in text_list:
        sentance = [w.translate(table) for w in sent.split(' ')]
        removed_punc_text.append(' '.join(sentance))
    return removed_punc_text

In [14]:
rus_text_ = remove_punc(rus_text_)
oss_text_ = remove_punc(oss_text_)

In [15]:
# removing the digits from russian sentances
remove_digits = str.maketrans('', '', digits)
removed_digits_text = []

In [16]:
for sent in rus_text_:
    sentance = [w.translate(remove_digits) for w in sent.split(' ')]
    removed_digits_text.append(' '.join(sentance))
    
rus_text_ = removed_digits_text

In [17]:
rus_text_ = [x.strip() for x in rus_text_]
oss_text_ = [x.strip() for x in oss_text_]

In [18]:
# rus_text_ = ["<sos> " + x + " <eos>" for x in rus_text_]
oss_text_ = ["start " + x + " end" for x in oss_text_]

In [19]:
oss_text_[0], rus_text_[0]

('<sos> цæуыл худыс <eos>', 'чего ты смеёшься')

# Data spliting

In [20]:
X = rus_text_
Y = oss_text_

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X,Y,test_size = 0.1)
len(X_train),len(y_train), len(X_test), len(y_test)

(387, 387, 43, 43)

In [22]:
X[0], Y[0]

('чего ты смеёшься', '<sos> цæуыл худыс <eos>')

## Data preparing for encoder and decoder

In [29]:
# preparing data for the word embedding
def Max_length(data):
    max_length_ = max([len(x.split(' ')) for x in data])
    return max_length_

In [30]:
#Training data
max_length_rus = Max_length(X_train)
max_lenght_oss = Max_length(y_train)

#Test data
max_length_rus_test = Max_length(X_test)
max_lenght_oss_test = Max_length(y_test)

In [37]:
max_length_rus, max_lenght_oss

(21, 25)

In [34]:
def tokenizer_(text_data):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(text_data)
    return tokenizer

tokenizer_input = tokenizer_(X_train)
vocab_size_input = len(tokenizer_input.word_index) + 1
tokenizer_target = tokenizer_(y_train)
vocab_size_target = len(tokenizer_target.word_index) + 1

In [35]:
with open('tokenizer_input.pkl','wb') as f:
    pkl.dump(tokenizer_input,f)

with open('tokenizer_target.pkl','wb') as f:
    pkl.dump(tokenizer_target,f)
    
pkl.dump(tokenizer_input, open('tokenizer_input.pkl', 'wb'))
pkl.dump(tokenizer_target, open('tokenizer_target.pkl', 'wb'))

In [36]:
vocab_size_input, vocab_size_target

(846, 856)

In [38]:
def generator_batch(X= X_train,Y=y_train, batch_size=128):
    while True:
        for j in range(0, len(X), batch_size):
            encoder_data_input = np.zeros((batch_size,max_length_english),dtype='float32') #metrix of batch_size*max_length_english
            decoder_data_input = np.zeros((batch_size,max_lenght_marathi),dtype='float32') #metrix of batch_size*max_length_marathi
            decoder_target_input = np.zeros((batch_size,max_lenght_marathi,vocab_size_target),dtype='float32') # 3d array one hot encoder decoder target data
            for i, (input_text,target_text) in enumerate(zip(X[j:j+batch_size],Y[j:j+batch_size])):
                for t, word in enumerate(input_text.split()):
                    encoder_data_input[i,t] = tokenizer_input.word_index[word] # Here we are storing the encoder 
                                                                         #seq in row here padding is done automaticaly as 
                                                                         #we have defined col as max_lenght
                for t, word in enumerate(target_text.split()):
                    # if word == 'START_':
                    #   word = 'start'
                    # elif word == 'END_':
                    #   word = 'end'
                    decoder_data_input[i,t] = tokenizer_target.word_index[word] # same for the decoder sequence
                    if t>0:
                        decoder_target_input[i,t-1,tokenizer_target.word_index[word]] = 1 #target is one timestep ahead of decoder input because it does not have 'start tag'
            # print(encoder_data_input.shape())
        yield ([encoder_data_input,decoder_data_input],decoder_target_input)

In [39]:
latent_dim = 50
# Define an input sequence and process it.
encoder_inputs = Input(shape=(None,),name="encoder_inputs")
emb_layer_encoder = Embedding(vocab_size_input,latent_dim, mask_zero=True)(encoder_inputs)
encoder = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(emb_layer_encoder)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None,),name="decoder_inputs")
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
emb_layer_decoder = Embedding(vocab_size_target,latent_dim, mask_zero=True)(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(emb_layer_decoder, initial_state=encoder_states)
decoder_dense = Dense(vocab_size_target, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [40]:
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

In [43]:
plot_model(model, to_file='train_model.png', show_shapes=True)

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model/model_to_dot to work.


In [44]:
train_samples = len(X_train)
val_samples = len(X_test)
batch_size = 128
epochs = 50

In [45]:
model.fit_generator(generator = generator_batch(X_train, y_train, batch_size = batch_size),
                    steps_per_epoch = train_samples//batch_size,
                    epochs=epochs)

  model.fit_generator(generator = generator_batch(X_train, y_train, batch_size = batch_size),


KeyError: '<sos>'