## Import libraries
***

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tqdm import tqdm

## Load data
***

In [2]:
translation = pd.read_csv('data/elder_speech.csv')
sentences = pd.read_csv('data/sentences.csv')

In [3]:
sentences.head()

Unnamed: 0,elder_speech,english
0,Aé esse aecáemm taedh.,I will follow you.
1,Caed ess cáelm an hlaith ess elaine.,The forest is peaceful and the lady is beautiful.
2,Aevon ess cáelm.,The river is calm.
3,Aé mire gwyn blath.,I see the white flower.
4,An luned ess og.,A girl is young.


In [4]:
elder_speech = sentences['elder_speech']

## Preprocess
***

In [5]:
# Get unique characters
characters = list(set(elder_speech.apply(lambda sentence: list(sentence)).sum()))
vocab_size = len(characters)

In [6]:
vocab_size

51

In [7]:
# Tokenize characters
MAX_SEQENCE_LENGTH = elder_speech.apply(lambda sentence: len(sentence)).max()

sequences = []
for sentence in elder_speech.values:
    sequence = []
    for character in sentence:
        sequence.append(characters.index(character))
    sequences.append(sequence)

In [8]:
sequences[0]

[21,
 40,
 50,
 22,
 36,
 36,
 22,
 50,
 9,
 22,
 19,
 23,
 22,
 24,
 24,
 50,
 15,
 9,
 22,
 2,
 11,
 17]

In [9]:
# Create n-gram sequences
n_gram_sequences = []
for sequence in sequences:
    for i in range(len(sequence)-1):
        n_gram_sequences.append(sequence[:i+2])

In [10]:
# Pad sequences
padded_sequences = tf.keras.preprocessing.sequence.pad_sequences(n_gram_sequences, maxlen=MAX_SEQENCE_LENGTH, padding='pre')

In [11]:
padded_sequences.shape

(3047, 131)

In [12]:
padded_sequences[:3]

array([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0, 21, 40],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,

## Model
***

In [13]:
X_train = padded_sequences[:, :-1]
y_train = tf.keras.utils.to_categorical(padded_sequences[:, -1], num_classes=vocab_size)

In [182]:
# Input layer
inputs = tf.keras.layers.Input(shape=(MAX_SEQENCE_LENGTH-1,))

# Embedding layer
x = tf.keras.layers.Embedding(vocab_size, 300, input_length=MAX_SEQENCE_LENGTH-1)(inputs)

# LSTM block 1
x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=True))(x)
x = tf.keras.layers.Dropout(0.2)(x)
x = tf.keras.layers.LayerNormalization()(x)

# LSTM block 2
x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128))(x)
x = tf.keras.layers.Dropout(0.2)(x)
x = tf.keras.layers.LayerNormalization()(x)

# Softmax output layer
output = tf.keras.layers.Dense(vocab_size, activation='softmax')(x)

# Model
model = tf.keras.Model(inputs=inputs, outputs=output)

In [168]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [169]:
model.fit(X_train, y_train, epochs=30, batch_size=128, verbose=1)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7fcc011c4d30>

## Generate the Elder Speech
***

In [170]:
def generate_text(text_start, name_len):
    """Generates Elder Speech of desired length and starting letters."""
    sequence = [0] * (MAX_SEQENCE_LENGTH-len(text_start)) + [characters.index(c) for c in text_start]
    text = text_start

    for i in tqdm(range(name_len)):
        prediction = model.predict(
            np.reshape(sequence[-MAX_SEQENCE_LENGTH+1:], (1, MAX_SEQENCE_LENGTH-1)), # current_sequence is of shape (MAXSEQLEN, ) and input to the model must be of shape (n, MAXSEQLEN-1), n - is a batch size. We want to predict only 1 exaple so batch size is 1.
            verbose=0
        )
        prediction = np.random.choice(range(vocab_size), p=prediction.ravel()) # Pick a number from [0, vocab_size) using probability distribution of a softmax output so that our output isn't the same all the time

        text += characters[prediction]

        sequence.append(prediction)
        
    return text

In [180]:
generated_text = generate_text('Ceádmil', 40)
generated_text

100%|██████████| 40/40 [00:01<00:00, 27.98it/s]


"Ceádmil vatt'ghern nau aé mire aen an blathanan"

In [181]:
translator_dict = dict(zip(translation['elder_speech'], translation['english']))

# Replace multiple spaces with one
print('Translation:', " ".join(translator_dict.get(elder_word, '') for elder_word in generated_text.lower().replace('.', '').split()).replace('  ', ' '))  

Translation: greetings witcher I observe as an 
