In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tqdm import tqdm

## Load data

In [2]:
translation = pd.read_csv('data/elder_speech.csv')
sentences = pd.read_csv('data/sentences.csv')

In [3]:
sentences.head()

Unnamed: 0,elder_speech,english
0,Aé esse aecáemm taedh.,I will follow you.
1,Caed ess cáelm an hlaith ess elaine.,The forest is peaceful and the lady is beautiful.
2,Aevon ess cáelm.,The river is calm.
3,Aé mire gwyn blath.,I see the white flower.
4,An luned ess og.,A girl is young.


In [4]:
elder_speech = sentences['elder_speech']

## Preprocess

In [5]:
# Get unique characters
characters = list(set(elder_speech.apply(lambda sentence: list(sentence)).sum()))
vocab_size = len(characters)

In [6]:
vocab_size

51

In [7]:
# Tokenize characters
MAX_SEQENCE_LENGTH = elder_speech.apply(lambda sentence: len(sentence)).max()

sequences = []
for sentence in elder_speech.values:
    sequence = []
    for character in sentence:
        sequence.append(characters.index(character))
    sequences.append(sequence)

In [8]:
sequences[0]

[27,
 34,
 17,
 35,
 22,
 22,
 35,
 17,
 18,
 35,
 38,
 19,
 35,
 11,
 11,
 17,
 39,
 18,
 35,
 40,
 45,
 8]

In [9]:
# Create n-gram sequences
n_gram_sequences = []
for sequence in sequences:
    for i in range(len(sequence)-1):
        n_gram_sequences.append(sequence[:i+2])

In [10]:
# Pad sequences
padded_sequences = tf.keras.preprocessing.sequence.pad_sequences(n_gram_sequences, maxlen=MAX_SEQENCE_LENGTH, padding='pre')

In [11]:
padded_sequences.shape

(3047, 131)

In [12]:
padded_sequences[:3]

array([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0, 27, 34],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,

## Model

In [13]:
X_train = padded_sequences[:, :-1]
y_train = tf.keras.utils.to_categorical(padded_sequences[:, -1], num_classes=vocab_size)

In [43]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, 64, input_length=MAX_SEQENCE_LENGTH-1),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128)),
    tf.keras.layers.Dense(vocab_size, activation='softmax')
])

In [44]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [62]:
model.fit(X_train, y_train, epochs=25, batch_size=32, verbose=1)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x7fa7c082b4f0>

## Generate the Elder Speech

In [63]:
current_sequence = [0]*(MAX_SEQENCE_LENGTH-1)
generated_text = ''

for i in tqdm(range(40)):
    prediction = model.predict(
        np.reshape(current_sequence[-MAX_SEQENCE_LENGTH+1:], (1, MAX_SEQENCE_LENGTH-1)),
        verbose=0
    )
    prediction = np.random.choice(range(vocab_size), p=prediction.ravel())
    
    generated_text += characters[prediction]
    
    current_sequence.append(prediction)

100%|██████████| 40/40 [00:01<00:00, 27.17it/s]


In [64]:
generated_text

'iade y twe tir ess an uniade y treise. T'

In [65]:
translator_dict = dict(zip(translation['elder_speech'], translation['english']))

print('Translation:', " ".join(translator_dict.get(elder_word, '') for elder_word in generated_text.replace('.', '').split()))  

Translation:  of two country is an merger of power 
