In [None]:
## Importing necessary libraries

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Embedding, GRU, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [None]:


# Initial model testing data 
# data = [
#     {'english': 'I am fine, and you?', 'akan': 'Me ho ye'},
#     {'english': 'Good morning', 'akan': 'Mema wo akye'},
#     {'english': 'Thank you', 'akan': 'Medase'},
#     {'english': 'What is your name?', 'akan': 'Wo din de sen?'},
# ]
# Current data - 25,000 parallel sentences.
# Assumes dataset.json parses into a frame with 'english' and 'akan' columns
# (the same keys the sample records use) - TODO confirm against the file.
data = pd.read_json('dataset.json')

# Extracting sentences into respective variables.
# Iterating a DataFrame yields its column labels rather than its rows, so the
# sentences are taken column-wise. Rows missing either side are dropped as a
# pair so both lists stay aligned, and values are coerced to str because the
# tokenizer expects text.
sentence_pairs = data.dropna(subset=['english', 'akan'])
english_texts = sentence_pairs['english'].astype(str).tolist()
akan_texts = sentence_pairs['akan'].astype(str).tolist()

# English Sentences Tokenization
# filters='' disables the tokenizer's default character stripping.
eng_tokenizer = Tokenizer(filters='')
eng_tokenizer.fit_on_texts(english_texts)
eng_sequences = eng_tokenizer.texts_to_sequences(english_texts)
eng_vocab_size = len(eng_tokenizer.word_index) + 1  # +1 because word ids start at 1; 0 is the padding id
eng_padded = pad_sequences(eng_sequences, padding='post')

# Akan Sentences Tokenization
# Every target sentence is wrapped in explicit boundary markers. Training feeds
# the decoder y[:, :-1] and asks it to predict y[:, 1:], so without a leading
# '<start>' the first real word would never be a prediction target, and without
# a trailing '<end>' the decoder has no learned stop signal. filters='' keeps
# the angle brackets, so both markers survive as single vocabulary entries.
akan_texts_marked = [f'<start> {text} <end>' for text in akan_texts]
akan_tokenizer = Tokenizer(filters='')
akan_tokenizer.fit_on_texts(akan_texts_marked)
akan_sequences = akan_tokenizer.texts_to_sequences(akan_texts_marked)
akan_vocab_size = len(akan_tokenizer.word_index) + 1  # +1 because word ids start at 1; 0 is the padding id
akan_padded = pad_sequences(akan_sequences, padding='post')

# Model hyperparameters
embedding_dim = 64  # width of each token embedding vector
units = 128         # size of the GRU hidden state

# Train / test split: 20% of the sentence pairs are held out, with a fixed
# seed so the partition is repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    eng_padded,
    akan_padded,
    test_size=0.2,
    random_state=42,
)

# Encoder: embeds the English token ids and runs them through a GRU.
# Only the final hidden state is used downstream (return_sequences=False).
encoder_inputs = tf.keras.Input(shape=(None,))
encoder_embedding = Embedding(
    input_dim=eng_vocab_size,
    output_dim=embedding_dim,
)(encoder_inputs)
encoder_gru = GRU(units=units, return_sequences=False, return_state=True)
encoder_outputs, encoder_state = encoder_gru(encoder_embedding)

# Decoder: embeds the Akan token ids, starts its GRU from the encoder state,
# and emits a softmax over the Akan vocabulary at every time step.
decoder_inputs = tf.keras.Input(shape=(None,))
decoder_embedding = Embedding(
    input_dim=akan_vocab_size,
    output_dim=embedding_dim,
)(decoder_inputs)
decoder_gru = GRU(units=units, return_sequences=True, return_state=True)
decoder_outputs, _ = decoder_gru(decoder_embedding, initial_state=encoder_state)
decoder_dense = Dense(units=akan_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Assemble the two-input training model and compile it.
# Targets are integer token ids, hence the sparse cross-entropy loss.
model = tf.keras.Model(
    inputs=[encoder_inputs, decoder_inputs],
    outputs=decoder_outputs,
)
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Teacher forcing: the decoder is fed each target sequence without its last
# position and is trained to predict the same sequence shifted left by one.
decoder_train_in, decoder_train_target = y_train[:, :-1], y_train[:, 1:]
decoder_val_in, decoder_val_target = y_test[:, :-1], y_test[:, 1:]

# Fit the model; the held-out split doubles as the validation set.
model.fit(
    x=[X_train, decoder_train_in],
    y=decoder_train_target,
    batch_size=32,
    epochs=50,
    validation_data=([X_test, decoder_val_in], decoder_val_target),
)

# Persist the trained model to disk
model.save('english_to_akan_gru.h5')




Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [None]:
# Model summary - prints the layer table so we can confirm the layers are
# connected in the order we intended.
model.summary()


Model: "model_6"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_5 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 input_6 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 embedding_4 (Embedding)        (None, None, 64)     896         ['input_5[0][0]']                
                                                                                                  
 embedding_5 (Embedding)        (None, None, 64)     704         ['input_6[0][0]']                
                                                                                            

#### Model Inference 

In [None]:


def load_encoder_decoder(model, units):
    """Split the trained seq2seq model into separate encoder and decoder models.

    Layers are located by type instead of by fixed position in ``model.layers``.
    That list also contains the decoder's ``InputLayer`` ahead of the embedding
    layers, so positional indices such as ``layers[1]`` do not land on the
    encoder embedding.

    Args:
        model: The trained training-time model with two inputs (English ids,
            Akan ids), two Embedding layers, two GRU layers and one Dense layer.
        units: Size of the GRU hidden state; used for the shape of the
            decoder's state input.

    Returns:
        ``(encoder_model, decoder_model)`` where ``encoder_model`` maps English
        token ids to the encoder state, and ``decoder_model`` maps
        ``[token ids, state]`` to ``[vocabulary probabilities, new state]``.

    Raises:
        ValueError: If the model does not contain exactly two Embedding layers,
            two GRU layers and one Dense layer.
    """
    keras_layers = tf.keras.layers
    embeddings = [layer for layer in model.layers if isinstance(layer, keras_layers.Embedding)]
    grus = [layer for layer in model.layers if isinstance(layer, keras_layers.GRU)]
    denses = [layer for layer in model.layers if isinstance(layer, keras_layers.Dense)]

    if len(embeddings) != 2 or len(grus) != 2 or len(denses) != 1:
        raise ValueError(
            "Expected 2 Embedding, 2 GRU and 1 Dense layer, found "
            f"{len(embeddings)}, {len(grus)} and {len(denses)}."
        )

    # The encoder-side layers sit further from the output than their decoder
    # counterparts, so they are expected to come first in model.layers.
    # NOTE(review): this ordering is an assumption about Keras - confirm it
    # with model.summary() if the architecture changes.
    encoder_embedding_layer, decoder_embedding_layer = embeddings
    # The encoder GRU is the one built with return_sequences=False; the stable
    # sort puts it first and keeps list order if the flags happen to match.
    encoder_gru, decoder_gru = sorted(grus, key=lambda gru: gru.return_sequences)
    decoder_dense = denses[0]

    # Encoder Layer
    encoder_inputs = model.input[0]  # English input
    encoder_embedding = encoder_embedding_layer(encoder_inputs)
    _, encoder_state = encoder_gru(encoder_embedding)
    encoder_model = tf.keras.Model(encoder_inputs, encoder_state)

    # Decoder Layer: fresh inputs so the state can be fed in step by step
    decoder_inputs = tf.keras.Input(shape=(None,))
    decoder_state_input = tf.keras.Input(shape=(units,))

    decoder_embedding = decoder_embedding_layer(decoder_inputs)
    decoder_outputs, decoder_state = decoder_gru(decoder_embedding, initial_state=decoder_state_input)
    decoder_outputs = decoder_dense(decoder_outputs)

    decoder_model = tf.keras.Model([decoder_inputs, decoder_state_input], [decoder_outputs, decoder_state])

    return encoder_model, decoder_model

def preprocess_sentence(sentence, tokenizer, max_len):
    """Turn one raw sentence into a padded batch of token ids.

    The sentence is wrapped in a one-element list so the tokenizer returns a
    single sequence, which is then padded at the end ('post') to ``max_len``.
    """
    token_ids = tokenizer.texts_to_sequences([sentence])
    padded_ids = pad_sequences(token_ids, padding='post', maxlen=max_len)
    return padded_ids

def translate_sentence(encoder_model, decoder_model, sentence, eng_tokenizer, akan_tokenizer, max_len):
    """Greedily translate one English sentence into Akan.

    Args:
        encoder_model: Model mapping padded English token ids to a state.
        decoder_model: Model mapping ``[token ids, state]`` to
            ``[vocabulary probabilities, new state]``.
        sentence: The English sentence to translate.
        eng_tokenizer: Tokenizer fitted on the English training sentences.
        akan_tokenizer: Tokenizer fitted on the Akan training sentences.
        max_len: Padded length of the encoder input and also the maximum
            number of decoding steps.

    Returns:
        The decoded Akan words joined by single spaces (possibly empty).
    """
    # Input preprocessing
    encoder_input = preprocess_sentence(sentence, eng_tokenizer, max_len)

    # Encoding input sentence; verbose=0 keeps predict() from printing a
    # progress bar on every call.
    state = encoder_model.predict(encoder_input, verbose=0)

    # Start decoding with the "<start>" token if the vocabulary has one, else use id 1
    start_id = akan_tokenizer.word_index.get('<start>', 1)
    # None when the vocabulary has no "<end>" entry; the comparison below is
    # then simply never true and only the padding id stops decoding.
    end_id = akan_tokenizer.word_index.get('<end>')

    decoder_input = np.zeros((1, 1))
    decoder_input[0, 0] = start_id

    decoded_words = []

    for _ in range(max_len):
        output_tokens, state = decoder_model.predict([decoder_input, state], verbose=0)
        predicted_id = int(np.argmax(output_tokens[0, -1, :]))

        # Stop on the padding id (0) or on the "<end>" token
        if predicted_id == 0 or predicted_id == end_id:
            break

        word = akan_tokenizer.index_word.get(predicted_id, '')
        if word:  # skip ids with no vocabulary entry so the join has no empty slots
            decoded_words.append(word)

        # Feed the prediction back in as the next decoder input
        decoder_input[0, 0] = predicted_id

    return ' '.join(decoded_words)

# Restore the trained network from disk
model = tf.keras.models.load_model('english_to_akan_gru.h5')

# Derive the stand-alone encoder and decoder used for step-by-step decoding
encoder_model, decoder_model = load_encoder_decoder(model=model, units=128)

# Translate a sample sentence and show the source next to the result
english_sentence = "What is your name?"
akan_translation = translate_sentence(
    encoder_model=encoder_model,
    decoder_model=decoder_model,
    sentence=english_sentence,
    eng_tokenizer=eng_tokenizer,
    akan_tokenizer=akan_tokenizer,
    max_len=10,
)

print(f"English: {english_sentence}", f"Akan: {akan_translation}", sep="\n")
