# Library Imports

In [1]:
import string
import numpy as np

from keras.preprocessing.text import Tokenizer
from collections import Counter
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import LSTM, Input, TimeDistributed, Dense, Activation, RepeatVector, Embedding
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy

# Read Data

In [2]:
# Path to the tab-separated translation file (one sentence pair per line).
# NOTE(review): this is a machine-specific absolute path; point it at your own
# copy of the file before running the notebook elsewhere.
path_to_data = 'C:\\Users\\tessw\\OneDrive\\Documents\\University\\Honours\\hau.txt'

# Read the whole file; the context manager closes the handle even if read() raises
with open(path_to_data, "r", encoding='utf-8') as translation_file:
    raw_data = translation_file.read()

# Split into lines, then split every non-blank line into its tab-separated fields.
# Blank lines (for example the empty string left behind by a trailing newline) are
# skipped: they would produce a one-element "pair" and break any pair[1] lookup.
raw_data = raw_data.split('\n')
pairs = [sentence.split('\t') for sentence in raw_data if sentence.strip()]

# Text Pre-processing

In [3]:
# Translation table that deletes every ASCII punctuation character.
# string.punctuation already contains "!" and "?", so nothing has to be appended,
# and building the table once avoids recreating it for every sentence.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_sentence(sentence):
    """Lower-case a sentence and remove ASCII punctuation.

    Args:
        sentence: The raw sentence text (str).

    Returns:
        The lower-cased sentence with every character of ``string.punctuation``
        removed. Whitespace is left untouched, so a punctuation mark that stood
        between two spaces leaves both spaces behind.
    """
    return sentence.lower().translate(_PUNCTUATION_TABLE)

In [4]:
def tokenize(sentences):
    """Fit a new Tokenizer on ``sentences`` and encode them with it.

    Args:
        sentences: The texts used both to fit the tokenizer and to be encoded.

    Returns:
        A ``(sequences, tokenizer)`` tuple: the result of
        ``texts_to_sequences(sentences)`` and the fitted Tokenizer itself.
    """
    fitted_tokenizer = Tokenizer()
    fitted_tokenizer.fit_on_texts(sentences)
    encoded = fitted_tokenizer.texts_to_sequences(sentences)
    return encoded, fitted_tokenizer

In [5]:
# Normalise both columns of every pair: field 0 feeds the English list, field 1 the Hausa list
english_sentences = list(map(clean_sentence, (pair[0] for pair in pairs)))
hausa_sentences = list(map(clean_sentence, (pair[1] for pair in pairs)))

# Encode each corpus as integer sequences and keep the fitted tokenizer of each language
hau_text_tokenized, hau_text_tokenizer = tokenize(hausa_sentences)
eng_text_tokenized, eng_text_tokenizer = tokenize(english_sentences)

# Report the longest tokenized sentence on each side
print('Maximum length hausa sentence: {}'.format(max(map(len, hau_text_tokenized))))
print('Maximum length english sentence: {}'.format(max(map(len, eng_text_tokenized))))

# Vocabulary size is the number of word_index entries plus one
# (presumably the extra slot covers index 0, which the padding uses -- confirm)
hausa_vocab = 1 + len(hau_text_tokenizer.word_index)
english_vocab = 1 + len(eng_text_tokenizer.word_index)
print("Hausa vocabulary is of {} unique words".format(hausa_vocab))
print("English vocabulary is of {} unique words".format(english_vocab))

Maximum length hausa sentence: 89
Maximum length english sentence: 72
Hausa vocabulary is of 1014 unique words
English vocabulary is of 977 unique words


In [6]:
# The longest tokenized sentence of each language fixes the padded width
max_hausa_len = max(len(sequence) for sequence in hau_text_tokenized)
max_english_len = max(len(sequence) for sequence in eng_text_tokenized)

# Pad at the end ("post") so every sequence of a language has the same length
hau_pad_sentence = pad_sequences(hau_text_tokenized, maxlen=max_hausa_len, padding="post")
eng_pad_sentence = pad_sequences(eng_text_tokenized, maxlen=max_english_len, padding="post")

# Append a trailing axis of size 1 to both padded arrays
hau_pad_sentence = hau_pad_sentence.reshape(hau_pad_sentence.shape + (1,))
eng_pad_sentence = eng_pad_sentence.reshape(eng_pad_sentence.shape + (1,))

# Model Layers

In [7]:
# Encoder-decoder graph. Every layer is created exactly once so the graph holds
# no unused duplicate Input/Embedding/LSTM layers.

# Encoder input: one padded Hausa sentence of max_hausa_len token ids
input_sequence = Input(shape=(max_hausa_len,))

# Map every token id to a 128-dimensional vector
embedding = Embedding(input_dim=hausa_vocab, output_dim=128,)(input_sequence)

# Encoder: return_sequences=False keeps only the final 64-unit output for the whole sentence
encoder = LSTM(64, return_sequences=False)(embedding)

# Repeat the encoder output once per English output position
r_vec = RepeatVector(max_english_len)(encoder)

# Decoder: return_sequences=True yields a 64-unit vector at every output position
decoder = LSTM(64, return_sequences=True, dropout=0.2)(r_vec)

# Apply the same Dense layer at every position to score each English vocabulary entry
logits = TimeDistributed(Dense(english_vocab))(decoder)

# Model Summary

In [12]:
# Apply a softmax to the per-position scores and wrap input and output in a Model
probabilities = Activation('softmax')(logits)
enc_dec_model = Model(inputs=input_sequence, outputs=probabilities)

# Compile with sparse categorical cross-entropy, Adam(1e-3) and accuracy as the metric
enc_dec_model.compile(
    optimizer=Adam(1e-3),
    loss=sparse_categorical_crossentropy,
    metrics=['accuracy'],
)
enc_dec_model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_5 (InputLayer)        [(None, 89)]              0         
                                                                 
 embedding_4 (Embedding)     (None, 89, 128)           129792    
                                                                 
 lstm_4 (LSTM)               (None, 64)                49408     
                                                                 
 repeat_vector_2 (RepeatVect  (None, 72, 64)           0         
 or)                                                             
                                                                 
 lstm_5 (LSTM)               (None, 72, 64)            33024     
                                                                 
 time_distributed (TimeDistr  (None, 72, 977)          63505     
 ibuted)                                                     

# Run Model

In [55]:
# Fit on the padded Hausa sequences as input and the padded English sequences as target;
# the value returned by fit() is kept in model_results
model_results = enc_dec_model.fit(
    x=hau_pad_sentence,
    y=eng_pad_sentence,
    batch_size=70,
    epochs=30,
)

In [28]:
def logits_to_sentence(logits, tokenizer):
    """Decode per-position scores into a space-separated sentence.

    Args:
        logits: 2-D array-like; ``np.argmax(logits, axis=1)`` selects one token
            index per row.
        tokenizer: Object exposing a ``word_index`` mapping of word -> index.

    Returns:
        The words of the selected indices joined by single spaces. Index 0 has
        no entry in ``word_index`` and is skipped, so padded positions no longer
        add empty tokens (and therefore stray spaces) to the sentence.

    Raises:
        KeyError: If a selected non-zero index has no word in ``word_index``.
    """
    index_to_words = {idx: word for word, idx in tokenizer.word_index.items()}
    predicted_ids = np.argmax(logits, axis=1)

    # Skip index 0 instead of joining it as an empty string
    return ' '.join(index_to_words[int(token_id)] for token_id in predicted_ids if token_id != 0)

# Output Methods

In [46]:
# Position of the sentence pair that eng_to_hau() and hau_to_eng() look up in the corpus lists
index = 1

def eng_to_hau(sentence_index=None):
    """Print a stored sentence pair and the model output for its English side.

    NOTE(review): ``enc_dec_model`` is fitted in this notebook with the Hausa
    sequences as input and the English sequences as target, and its input layer
    is sized with ``max_hausa_len``. Passing ``eng_pad_sentence`` to it and
    decoding the result with ``hau_text_tokenizer`` therefore cannot give a
    genuine English-to-Hausa translation; that needs a model trained in the
    English-to-Hausa direction. The prediction call is left in place so the
    function's output does not change.

    Args:
        sentence_index: Non-negative position of the sentence pair to show.
            When omitted (``None``), the module-level ``index`` is used.
    """
    # Fall back to the notebook-level index at call time when no position is given
    position = index if sentence_index is None else sentence_index

    print()
    print("The hausa sentence is: {}".format(hausa_sentences[position]))
    print()
    print('The predicted sentence is :')
    # Slice (rather than index) so the model receives a batch containing one sentence
    english_batch = eng_pad_sentence[position:position + 1]
    print(logits_to_sentence(enc_dec_model.predict(english_batch)[0], hau_text_tokenizer))
    print()
    print("The english sentence is: {}".format(english_sentences[position]))
    
def hau_to_eng(sentence_index=None):
    """Print a stored Hausa sentence, the model's English prediction and the reference.

    The padded Hausa sequence at the chosen position is passed to
    ``enc_dec_model.predict`` and the first result is decoded with
    ``eng_text_tokenizer``.

    Args:
        sentence_index: Non-negative position of the sentence pair to show.
            When omitted (``None``), the module-level ``index`` is used.
    """
    # Fall back to the notebook-level index at call time when no position is given
    position = index if sentence_index is None else sentence_index

    print("The hausa sentence is: {}".format(hausa_sentences[position]))
    print()
    print('The predicted sentence is :')
    # Slice (rather than index) so the model receives a batch containing one sentence
    hausa_batch = hau_pad_sentence[position:position + 1]
    print(logits_to_sentence(enc_dec_model.predict(hausa_batch)[0], eng_text_tokenizer))
    print()
    print("The english sentence is: {}".format(english_sentences[position]))

# Translation Output

In [53]:
# Ask for the translation direction and read the answer from standard input
print("Do you want to translate Eng-Hau (enter E) or Hau-Eng? (enter H)")
choice = input()

# Accepted answers (upper or lower case) mapped to the routine they trigger
translators = {
    'E': eng_to_hau,
    'e': eng_to_hau,
    'H': hau_to_eng,
    'h': hau_to_eng,
}

selected_translator = translators.get(choice)
if selected_translator is None:
    print("Invalid choice, please enter E or H.")
else:
    selected_translator()

Do you want to translate Eng-Hau (enter E) or Hau-Eng? (enter H)
h
The hausa sentence is: new delhi ap  firayim ministan indiya narendra modi zai halarci bikin aza harsashin ginin wurin bautar hindu a watan mai zuwa a wani wurin da ake takaddama da ke arewacin indiya a inda a da wannan wurin wania masallaci ne aka rugurguza shi karni na 16 wanda wasu masu tsattsauran raayi na hindu su ka yi a 1992 kamar yadda aka ji daga masu amintattu daaka dorawa alhakkin kula da ginin

The predicted sentence is :
the delhi ap indian prime minister narendra modi greece attend the groundbreaking groundbreaking next month the and on on on a disputed the the northern india where a 16th century century was torn down to hindu hardliners 1992 1992 according the the the overseeing the construction construction                         

The english sentence is: new delhi ap  indian prime minister narendra modi will attend a groundbreaking ceremony next month for a hindu temple on a disputed site in northern 