<a href="https://colab.research.google.com/github/Tirabyte/QuenyaTranslate/blob/main/QuenyaEnglishMachineTranslation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
cd /content/drive/MyDrive/Quenya/

/content/drive/MyDrive/Quenya


In [None]:
import unicodedata
import re
import string
import numpy as np
from pickle import load
from pickle import dump
from numpy.random import shuffle
from numpy import array

# load doc into memory
def load_doc(filename):
    """Read an entire UTF-8 text file into memory.

    Args:
        filename: Path of the file to read.

    Returns:
        The complete contents of the file as one string.
    """
    # The context manager closes the handle even when read() raises,
    # which the previous open()/close() pair did not guarantee.
    with open(filename, mode='rt', encoding='utf-8') as file:
        return file.read()

 # split a loaded document into sentences
def to_pairs(doc):
    """Split a loaded document into per-line lists of tab-separated fields.

    Leading and trailing whitespace is stripped from the whole document
    before it is split on newlines; each resulting line is then split on
    tab characters.

    Args:
        doc: The document text.

    Returns:
        A list with one entry per line, each entry being the list of
        fields found on that line.
    """
    rows = []
    for row in doc.strip().split('\n'):
        rows.append(row.split('\t'))
    return rows

 # clean a list of lines
def clean_pairs(lines):
    """Normalise every sentence in a list of sentence groups.

    Each sentence is decomposed with NFD and reduced to ASCII (characters
    that cannot be encoded are dropped), split on whitespace, lower-cased,
    stripped of punctuation and non-printable characters, and finally
    filtered so that only purely alphabetic tokens remain. The surviving
    tokens are re-joined with single spaces.

    Args:
        lines: An iterable of groups (e.g. pairs) of sentence strings.

    Returns:
        A NumPy array built from the cleaned groups. The cleaned list is
        also printed before it is returned.
    """
    # character classes are compiled once and shared by every sentence
    punct_pattern = re.compile('[%s]' % re.escape(string.punctuation))
    nonprint_pattern = re.compile('[^%s]' % re.escape(string.printable))

    def _clean_sentence(text):
        # decompose accented characters, then discard anything non-ASCII
        ascii_bytes = unicodedata.normalize('NFD', text).encode('ascii', 'ignore')
        kept = []
        for token in ascii_bytes.decode('utf-8').split():
            token = token.lower()
            token = punct_pattern.sub('', token)
            token = nonprint_pattern.sub('', token)
            # drops tokens containing digits as well as tokens left empty
            if token.isalpha():
                kept.append(token)
        return ' '.join(kept)

    cleaned = [[_clean_sentence(text) for text in pair] for pair in lines]
    print(cleaned)
    return array(cleaned)

# load a clean dataset
def load_clean_sentences(filename):
    """Load a pickled dataset from disk.

    Args:
        filename: Path of the pickle file to read.

    Returns:
        Whatever object was stored in the pickle file.
    """
    # NOTE: unpickling can execute arbitrary code, so only load files that
    # were produced by this notebook or another trusted source.
    # The context manager makes sure the file handle is closed; the
    # previous load(open(...)) form left it open.
    with open(filename, 'rb') as f:
        return load(f)
# save a list of clean sentences to file
def save_clean_data(sentences, filename):
    """Pickle ``sentences`` into the file at ``filename``.

    The file is opened in binary write mode, so existing content is
    replaced.

    Args:
        sentences: The object to serialise.
        filename: Destination path of the pickle file.
    """
    with open(filename, 'wb') as out_file:
        dump(sentences, out_file)

 # load dataset
# load the raw tab-separated dataset
filename = 'Eldamo Dataset - RawData.tsv'
doc = load_doc(filename)
# split into Quenya-English pairs
pairs = to_pairs(doc)
# clean sentences
# NOTE: the result gets its own name. Assigning it back to ``clean_pairs``
# replaced the function with an array, so re-running this cell failed.
cleaned_pairs = clean_pairs(pairs)
# save clean pairs to file
save_clean_data(cleaned_pairs, 'quenya-english.pkl')

# reload the dataset that was just saved
raw_dataset = load_clean_sentences('quenya-english.pkl')

# 80/20 train/test split taken in file order (the data is not shuffled)
split_at = int(len(raw_dataset) * 0.8)
train = raw_dataset[:split_at]
test = raw_dataset[split_at:]

# save
save_clean_data(train, 'quenya-english-train.pkl')
save_clean_data(test, 'quenya-english-test.pkl')


[['macili cirir orcor', 'swords cut orcs'], ['i elda tuve osto', 'the elf finds a city'], ['silan', 'i am shining'], ['i aran tira aiwi', 'the king is watching birds'], ['i naucor norar', 'the dwarves are running'], ['atani samir coar', 'men have houses'], ['tirin eleni', 'i watch stars'], ['i aiwi lorir', 'the birds sleep'], ['i eleni silir', 'the stars shine'], ['orco cire i taure', 'an orc cuts the forest'], ['i hesto lora', 'the captain is sleeping'], ['aiwi matar', 'birds are eating'], ['tiran i coa', 'i am watching the house'], ['i atani tuvir i eldar', 'the men find or discover the elves'], ['eldar menar', 'elves are going'], ['noras', 'he is running'], ['tiralye coanya', 'you are watching my house'], ['cirin lye', 'i cut you'], ['macilelyar silar', 'your swords are shining'], ['matalyes', 'you are eating it'], ['aranya mena', 'his king is going'], ['tuvinye i taure', 'i find the forest'], ['i elda tiret', 'the elf watches them'], ['samil macilinya', 'you have my sword'], ['i he

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.utils import plot_model
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.callbacks import ModelCheckpoint

# fit a tokenizer
def create_tokenizer(lines):
  """Create a Keras ``Tokenizer`` and fit it on ``lines``.

  Args:
    lines: The texts passed to ``Tokenizer.fit_on_texts``.

  Returns:
    The fitted tokenizer.
  """
  fitted = Tokenizer()
  fitted.fit_on_texts(lines)
  return fitted

# max sentence length
def max_length(lines):
  """Return the largest whitespace-separated token count among ``lines``.

  Args:
    lines: An iterable of sentence strings.

  Returns:
    The number of tokens in the longest sentence.
  """
  token_counts = [len(text.split()) for text in lines]
  return max(token_counts)

# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
  """Integer-encode ``lines`` and pad the result to a fixed length.

  Args:
    tokenizer: Object providing ``texts_to_sequences``.
    length: Value passed to ``pad_sequences`` as ``maxlen``.
    lines: The texts to encode.

  Returns:
    The output of ``pad_sequences`` called with ``padding='post'``,
    i.e. padding is appended after each sequence.
  """
  encoded = tokenizer.texts_to_sequences(lines)
  return pad_sequences(encoded, maxlen=length, padding= 'post')

# one hot encode target sequence
def encode_output(sequences, vocab_size):
  """One-hot encode every sequence of a 2-D integer array.

  Args:
    sequences: Array whose first two dimensions are read through
      ``sequences.shape[0]`` and ``sequences.shape[1]``.
    vocab_size: Number of classes passed to ``to_categorical``.

  Returns:
    An array reshaped to
    ``(sequences.shape[0], sequences.shape[1], vocab_size)``.
  """
  one_hot = array(
      [to_categorical(row, num_classes=vocab_size) for row in sequences]
  )
  return one_hot.reshape(sequences.shape[0], sequences.shape[1], vocab_size)

# define NMT model
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
  """Build and compile the encoder-decoder translation model.

  The stack is: an ``Embedding`` over the source vocabulary, an ``LSTM``
  that returns a single vector, a ``RepeatVector`` that repeats that
  vector ``tar_timesteps`` times, a second ``LSTM`` returning the full
  sequence, and a time-distributed softmax ``Dense`` layer over the
  target vocabulary.

  Args:
    src_vocab: Size of the source vocabulary (embedding input dimension).
    tar_vocab: Size of the target vocabulary (softmax width).
    src_timesteps: Source sequence length given to the embedding.
    tar_timesteps: Number of output timesteps.
    n_units: Embedding width and number of units in both LSTM layers.

  Returns:
    The compiled model. As side effects the summary is printed and a
    diagram is written to ``model.png``.
  """
  model = Sequential()
  stack = (
      Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True),
      LSTM(n_units),
      RepeatVector(tar_timesteps),
      LSTM(n_units, return_sequences=True),
      TimeDistributed(Dense(tar_vocab, activation='softmax')),
  )
  for layer in stack:
    model.add(layer)
  # compile, then report the architecture
  model.compile(optimizer='adam', loss='categorical_crossentropy')
  model.summary()
  plot_model(model, to_file='model.png', show_shapes=True)
  return model

# load datasets
dataset = load_clean_sentences('quenya-english.pkl')
train = load_clean_sentences('quenya-english-train.pkl')
test = load_clean_sentences('quenya-english-test.pkl')
print(dataset.dtype)

# NOTE: in every pair column 0 holds the Quenya sentence and column 1 the
# English one (see the cleaned pairs printed by the preprocessing cell).
# The ``eng_*`` / ``ger_*`` variable names do not match that layout; they
# are kept unchanged in case later cells refer to them:
#   eng_*  -> column 0, Quenya  (the model's target / output side)
#   ger_*  -> column 1, English (the model's source / input side)
# The printed labels below name the language each value really describes.

# prepare Quenya tokenizer (column 0)
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])
print('Quenya Vocabulary Size: %d' % eng_vocab_size)
print('Quenya Max Length: %d' % (eng_length))
# prepare English tokenizer (column 1)
ger_tokenizer = create_tokenizer(dataset[:, 1])
ger_vocab_size = len(ger_tokenizer.word_index) + 1
ger_length = max_length(dataset[:, 1])
print('English Vocabulary Size: %d' % ger_vocab_size)
print('English Max Length: %d' % (ger_length))
# prepare training data: English in (X), one-hot Quenya out (Y)
trainX = encode_sequences(ger_tokenizer, ger_length, train[:, 1])
trainY = encode_sequences(eng_tokenizer, eng_length, train[:, 0])
trainY = encode_output(trainY, eng_vocab_size)
# prepare validation data
testX = encode_sequences(ger_tokenizer, ger_length, test[:, 1])
testY = encode_sequences(eng_tokenizer, eng_length, test[:, 0])
testY = encode_output(testY, eng_vocab_size)
# define model
model = define_model(ger_vocab_size, eng_vocab_size, ger_length, eng_length, 256)
# fit model, keeping only the weights with the lowest validation loss
checkpoint = ModelCheckpoint(
    'model.h5', monitor='val_loss', verbose=1, save_best_only=True, mode='min'
)
model.fit(
    trainX,
    trainY,
    epochs=30,
    batch_size=64,
    validation_data=(testX, testY),
    callbacks=[checkpoint],
    verbose=2,
)

<U77
English Vocabulary Size: 975
English Max Length: 19
German Vocabulary Size: 750
German Max Length: 18
Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 18, 256)           192000    
                                                                 
 lstm_2 (LSTM)               (None, 256)               525312    
                                                                 
 repeat_vector_1 (RepeatVec  (None, 19, 256)           0         
 tor)                                                            
                                                                 
 lstm_3 (LSTM)               (None, 19, 256)           525312    
                                                                 
 time_distributed_1 (TimeDi  (None, 19, 975)           250575    
 stributed)                                                      
             

  saving_api.save_model(



Epoch 2: val_loss improved from 5.14811 to 2.35569, saving model to model.h5
7/7 - 4s - loss: 3.1139 - val_loss: 2.3557 - 4s/epoch - 550ms/step
Epoch 3/30

Epoch 3: val_loss improved from 2.35569 to 2.29965, saving model to model.h5
7/7 - 3s - loss: 2.0957 - val_loss: 2.2997 - 3s/epoch - 415ms/step
Epoch 4/30

Epoch 4: val_loss improved from 2.29965 to 2.11275, saving model to model.h5
7/7 - 3s - loss: 1.8204 - val_loss: 2.1127 - 3s/epoch - 368ms/step
Epoch 5/30

Epoch 5: val_loss improved from 2.11275 to 2.05911, saving model to model.h5
7/7 - 3s - loss: 1.7055 - val_loss: 2.0591 - 3s/epoch - 363ms/step
Epoch 6/30

Epoch 6: val_loss improved from 2.05911 to 2.00834, saving model to model.h5
7/7 - 2s - loss: 1.6180 - val_loss: 2.0083 - 2s/epoch - 355ms/step
Epoch 7/30

Epoch 7: val_loss improved from 2.00834 to 1.97169, saving model to model.h5
7/7 - 3s - loss: 1.5493 - val_loss: 1.9717 - 3s/epoch - 466ms/step
Epoch 8/30

Epoch 8: val_loss improved from 1.97169 to 1.92550, saving mode

<keras.src.callbacks.History at 0x7a78f78ef1f0>