We import the needeed package

In [48]:
import numpy as np
from keras.preprocessing.text import Tokenizer

from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences

from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed

from keras.callbacks.callbacks import ModelCheckpoint


from nltk.translate.bleu_score import corpus_bleu

We load the data

In [3]:
file = open('yemba_english_sentences.txt', mode='rt', encoding='utf-8')
text = file.read()
lines = text.strip().split('\n')
pairs = [line.split(';') for line in lines]

In [11]:
pairs[10]

['Ŋgik ga á nzókó, ńzéŋɛ́ azēŋ.',
 'My older brother spent the whole day dancing only the azeŋ.']

Tokenization

In [12]:
# fit a tokenizer
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

# max sentence length
def max_length(lines):
    
    return max(len(line.split()) for line in lines)

In [24]:
en_tokenizer = create_tokenizer(pairs[1])
en_vocab_size = len(en_tokenizer.word_index) + 1
en_length = max_length(pairs[1])
print('English Vocabulary Size: %d' % en_vocab_size)
print('English Max Length: %d' % (en_length))

English Vocabulary Size: 8
English Max Length: 4


In [28]:
yb_tokenizer = create_tokenizer(pairs[0])
yb_vocab_size = len(yb_tokenizer.word_index) + 1
yb_length = max_length(pairs[0])
print('Yemba Vocabulary Size: %d' % yb_vocab_size)
print('Yemba Max Length: %d' % (yb_length))

Yemba Vocabulary Size: 9
Yemba Max Length: 4


In [35]:
# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
    # integer encode sequences
    X = tokenizer.texts_to_sequences(lines)
    # pad sequences with 0 values
    X = pad_sequences(X, maxlen=length, padding='post')
    return X

# one hot encode target sequence
def encode_output(sequences, vocab_size):
    ylist = list()
    for sequence in sequences:
        encoded = to_categorical(sequence, num_classes=vocab_size)
        ylist.append(encoded)
    y = np.array(ylist)
    y = y.reshape(sequences.shape[0], sequences.shape[1], vocab_size)
    return y

In [36]:
# shuffle data
dataset = np.array(pairs)
np.random.shuffle(dataset)
train, test = dataset[:40,:], dataset[40:,:]

# prepare training data
trainX = encode_sequences(en_tokenizer, en_length, train[:, 1])
trainY = encode_sequences(yb_tokenizer, yb_length, train[:, 0])
trainY = encode_output(trainY, yb_vocab_size)

# prepare validation data
testX = encode_sequences(en_tokenizer, en_length, test[:, 1])
testY = encode_sequences(yb_tokenizer, yb_length, test[:, 0])
testY = encode_output(testY, yb_vocab_size)

In [38]:
# define seq2seq model
def define_model(src_vocab, tar_vocab, 
                 source_steps, 
                 target_steps, 
                 embedding_dim):
    model = Sequential()
    # encoder
    model.add(Embedding(src_vocab, embedding_dim, 
                input_length=source_steps, mask_zero=True))
    model.add(LSTM(embedding_dim))
    model.add(RepeatVector(target_steps))
    # decoder
    model.add(LSTM(embedding_dim, return_sequences=True))
    model.add(TimeDistributed(Dense(tar_vocab, activation='softmax')))
    # compile model
    model.compile(optimizer='adam', loss='categorical_crossentropy')
    # summarize defined model
    model.summary()
    return model

In [41]:
model = define_model(en_vocab_size, yb_vocab_size, en_length, yb_length, 256)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 4, 256)            2048      
_________________________________________________________________
lstm_1 (LSTM)                (None, 256)               525312    
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 4, 256)            0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 4, 256)            525312    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 4, 9)              2313      
Total params: 1,054,985
Trainable params: 1,054,985
Non-trainable params: 0
_________________________________________________________________


In [47]:
checkpoint = ModelCheckpoint('model_en_yb.h5', monitor='val_loss', 
                              verbose=1, save_best_only=True, mode='min')
history = model.fit(trainX, trainY, epochs=50, batch_size=16, 
                    validation_data=(testX, testY), 
                    callbacks=[checkpoint], verbose=2)

Train on 40 samples, validate on 11 samples
Epoch 1/50
 - 0s - loss: 0.8685 - val_loss: 0.8542

Epoch 00001: val_loss improved from inf to 0.85415, saving model to model_en_yb.h5
Epoch 2/50
 - 0s - loss: 0.8237 - val_loss: 0.7919

Epoch 00002: val_loss improved from 0.85415 to 0.79192, saving model to model_en_yb.h5
Epoch 3/50
 - 0s - loss: 0.7843 - val_loss: 0.7472

Epoch 00003: val_loss improved from 0.79192 to 0.74716, saving model to model_en_yb.h5
Epoch 4/50
 - 0s - loss: 0.7591 - val_loss: 0.7161

Epoch 00004: val_loss improved from 0.74716 to 0.71610, saving model to model_en_yb.h5
Epoch 5/50
 - 0s - loss: 0.7480 - val_loss: 0.6948

Epoch 00005: val_loss improved from 0.71610 to 0.69482, saving model to model_en_yb.h5
Epoch 6/50
 - 0s - loss: 0.7341 - val_loss: 0.6858

Epoch 00006: val_loss improved from 0.69482 to 0.68579, saving model to model_en_yb.h5
Epoch 7/50
 - 0s - loss: 0.7156 - val_loss: 0.6856

Epoch 00007: val_loss improved from 0.68579 to 0.68556, saving model to mo

In [51]:
# map an integer to a word
def word_for_id(integer, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None
  
# generate target given source sequence
def predict_sequence(model, tokenizer, source):
    prediction = model.predict(source, verbose=0)[0]
    integers = [np.argmax(vector) for vector in prediction]
    target = list()
    for i in integers:
        word = word_for_id(i, tokenizer)
        if word is None:
            break
        target.append(word)
    return ' '.join(target)

In [52]:
# evaluate the model
def evaluate_model(model, sources, raw_dataset):
    actual, predicted = list(), list()
    for i, source in enumerate(sources):
        # translate encoded source text
        source = source.reshape((1, source.shape[0]))
        translation = predict_sequence(model, yb_tokenizer, source)
        raw_target, raw_src = raw_dataset[i]
        if i < 10:
            print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_target, translation))
        actual.append(raw_target.split())
        predicted.append(translation.split())
    # calculate BLEU score
    print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
    print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
    print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
    print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))

evaluate_model(model, trainX, train)

src=[His mother prepared the couscous with the vegetable sauce.], target=[Mmá yi á ndā mésāŋ ńzap.], predicted=[á]
src=[The taro is spoiled.], target=[Apā á pek.], predicted=[á]
src=[The pig has bitten its tibia.], target=[Kúna á lo efe yí.], predicted=[á á]
src=[I kept the oil box at the store.], target=[Meŋ á feŋ ndaŋ a ntaŋ.], predicted=[á]
src=[My older brother spent the whole day dancing only the azeŋ.], target=[Ŋgik ga á nzókó, ńzéŋɛ́ azēŋ.], predicted=[á]
src=[God is the only comforter.], target=[Ndem á si ŋgɔ ndaˈ n̄tsa mezeŋɛ.], predicted=[á]
src=[I bit my tongue.], target=[Meŋ á lo ale zá.], predicted=[]
src=[The feather of ŋgú is very long.], target=[Letɔ́ŋ ŋ́gūˈ é sāk tiˈ.], predicted=[á]
src=[Dongmo and his friend spent the whole day singing in the backwater.], target=[Ndɔŋmɔ̄ pɔ́ esó yi é zók á ńdu nzɔ̄pɔ́.], predicted=[á]
src=[The snake is in the basket.], target=[Nnū é ńte tuŋ.], predicted=[á]
BLEU-1: 0.000000
BLEU-2: 0.000000
B