In [1]:
import string
import numpy as np

from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import LSTM, Input, TimeDistributed, Dense, Activation, RepeatVector, Embedding
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy

In [2]:
# Path to translation file.
# NOTE(review): this is an absolute, machine-specific path; consider moving it
# to a configurable data directory so the notebook runs elsewhere.
path_to_data = 'C:\\Users\\tessw\\OneDrive\\Documents\\University\\Honours\\hau.txt'

# Read file. The context manager closes the handle even if read() raises.
with open(path_to_data, "r", encoding='utf-8') as translation_file:
    raw_data = translation_file.read()

# Parse data: one sentence pair per line, fields separated by tabs.
raw_data = raw_data.split('\n')
# Keep only lines that contain at least two tab-separated fields. Blank lines
# (for example the one produced by a trailing newline) would otherwise yield
# one-element lists and raise IndexError when pair[1] is read later.
pairs = [
    fields
    for fields in (sentence.split('\t') for sentence in raw_data)
    if len(fields) >= 2
]

In [3]:
def clean_sentence(sentence):
    """Return ``sentence`` lower-cased with ASCII punctuation removed.

    Args:
        sentence: The text to normalise.

    Returns:
        A new string in lower case from which every character of
        ``string.punctuation`` has been deleted. Whitespace is left as is.
    """
    # "!" and "?" are already part of string.punctuation, so deleting
    # string.punctuation alone removes exactly the same characters.
    removal_table = str.maketrans('', '', string.punctuation)
    return sentence.lower().translate(removal_table)

In [4]:
def tokenize(sentences):
    """Fit a new Keras ``Tokenizer`` on ``sentences`` and encode them with it.

    Args:
        sentences: The texts used both to fit the tokenizer and to encode.

    Returns:
        A ``(sequences, tokenizer)`` tuple: the result of
        ``texts_to_sequences(sentences)`` and the fitted tokenizer itself.
    """
    fitted_tokenizer = Tokenizer()
    fitted_tokenizer.fit_on_texts(sentences)
    encoded_sentences = fitted_tokenizer.texts_to_sequences(sentences)
    return encoded_sentences, fitted_tokenizer

In [5]:
# Normalise both sides of every pair (field 0 -> English, field 1 -> Hausa)
english_sentences = [clean_sentence(fields[0]) for fields in pairs]
hausa_sentences = [clean_sentence(fields[1]) for fields in pairs]

# Integer-encode each language with its own tokenizer
hau_text_tokenized, hau_text_tokenizer = tokenize(hausa_sentences)
eng_text_tokenized, eng_text_tokenizer = tokenize(english_sentences)

# Length, in tokens, of the longest encoded sentence per language
print('Maximum length hausa sentence: {}'.format(max(map(len, hau_text_tokenized))))
print('Maximum length english sentence: {}'.format(max(map(len, eng_text_tokenized))))

# Vocabulary sizes: number of distinct words plus one extra slot for id 0,
# which is not present in word_index.
hausa_vocab = 1 + len(hau_text_tokenizer.word_index)
english_vocab = 1 + len(eng_text_tokenizer.word_index)
print("Hausa vocabulary is of {} unique words".format(hausa_vocab))
print("English vocabulary is of {} unique words".format(english_vocab))

Maximum length hausa sentence: 85
Maximum length english sentence: 66
Hausa vocabulary is of 278 unique words
English vocabulary is of 212 unique words


In [6]:
# Longest encoded sentence (in tokens) for each language
max_hausa_len = max(map(len, hau_text_tokenized))
max_english_len = max(map(len, eng_text_tokenized))

# Right-pad every sentence to the longest length of its language
hau_pad_sentence = pad_sequences(hau_text_tokenized, maxlen=max_hausa_len, padding="post")
eng_pad_sentence = pad_sequences(eng_text_tokenized, maxlen=max_english_len, padding="post")

# Append a trailing axis of size 1: (sentences, length) -> (sentences, length, 1)
hau_pad_sentence = hau_pad_sentence.reshape(hau_pad_sentence.shape + (1,))
eng_pad_sentence = eng_pad_sentence.reshape(eng_pad_sentence.shape + (1,))

In [7]:
# Model input: one padded Hausa sentence of max_hausa_len token ids.
# (Later cells re-create these layers while extending the network.)
input_sequence = Input(shape=(max_hausa_len,))
# Map every token id to a 128-dimensional embedding vector.
embedding = Embedding(hausa_vocab, 128)(input_sequence)

In [8]:
# Input and embedding, rebuilt from scratch as in the previous cell.
input_sequence = Input(shape=(max_hausa_len,))
embedding = Embedding(hausa_vocab, 128)(input_sequence)
# Encoder: a 64-unit LSTM that returns only its final output
# (return_sequences=False), i.e. one vector per sentence.
encoder = LSTM(units=64, return_sequences=False)(embedding)

In [9]:
# Input, embedding and encoder, rebuilt from scratch as in the previous cells.
input_sequence = Input(shape=(max_hausa_len,))
embedding = Embedding(hausa_vocab, 128)(input_sequence)
encoder = LSTM(units=64, return_sequences=False)(embedding)
# Repeat the encoder vector max_english_len times so the decoder receives
# one copy per output position.
r_vec = RepeatVector(n=max_english_len)(encoder)

In [10]:
# Input, embedding, encoder and repeat vector, rebuilt as in the previous cells.
input_sequence = Input(shape=(max_hausa_len,))
embedding = Embedding(hausa_vocab, 128)(input_sequence)
encoder = LSTM(units=64, return_sequences=False)(embedding)
r_vec = RepeatVector(n=max_english_len)(encoder)
# Decoder: a 64-unit LSTM with input dropout 0.2 that emits an output at
# every position (return_sequences=True).
decoder = LSTM(units=64, return_sequences=True, dropout=0.2)(r_vec)

In [11]:
# Full encoder-decoder graph. These are the tensors the compiled model is
# built from in the next cell.
input_sequence = Input(shape=(max_hausa_len,))
embedding = Embedding(hausa_vocab, 128)(input_sequence)
encoder = LSTM(units=64, return_sequences=False)(embedding)
r_vec = RepeatVector(n=max_english_len)(encoder)
decoder = LSTM(units=64, return_sequences=True, dropout=0.2)(r_vec)
# Apply the same dense layer at every output position to produce one score
# per English vocabulary entry.
logits = TimeDistributed(Dense(units=english_vocab))(decoder)

In [12]:
# Wrap the graph in a model whose output is the softmax over the logits.
enc_dec_model = Model(inputs=input_sequence, outputs=Activation('softmax')(logits))

# Targets are integer word ids, hence the sparse categorical cross-entropy.
enc_dec_model.compile(
    optimizer=Adam(1e-3),
    loss=sparse_categorical_crossentropy,
    metrics=['accuracy'],
)
enc_dec_model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_5 (InputLayer)        [(None, 85)]              0         
                                                                 
 embedding_4 (Embedding)     (None, 85, 128)           35584     
                                                                 
 lstm_4 (LSTM)               (None, 64)                49408     
                                                                 
 repeat_vector_2 (RepeatVect  (None, 66, 64)           0         
 or)                                                             
                                                                 
 lstm_5 (LSTM)               (None, 66, 64)            33024     
                                                                 
 time_distributed (TimeDistr  (None, 66, 212)          13780     
 ibuted)                                                     

In [13]:
# Train Hausa -> English: padded Hausa sentences are the inputs and padded
# English sentences the targets.
model_results = enc_dec_model.fit(
    x=hau_pad_sentence,
    y=eng_pad_sentence,
    batch_size=50,
    epochs=250,
)

Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 15/250
Epoch 16/250
Epoch 17/250
Epoch 18/250
Epoch 19/250
Epoch 20/250
Epoch 21/250
Epoch 22/250
Epoch 23/250
Epoch 24/250
Epoch 25/250
Epoch 26/250
Epoch 27/250
Epoch 28/250
Epoch 29/250
Epoch 30/250
Epoch 31/250
Epoch 32/250
Epoch 33/250
Epoch 34/250
Epoch 35/250
Epoch 36/250
Epoch 37/250
Epoch 38/250
Epoch 39/250
Epoch 40/250
Epoch 41/250
Epoch 42/250
Epoch 43/250
Epoch 44/250
Epoch 45/250
Epoch 46/250
Epoch 47/250
Epoch 48/250
Epoch 49/250
Epoch 50/250
Epoch 51/250
Epoch 52/250
Epoch 53/250
Epoch 54/250
Epoch 55/250
Epoch 56/250
Epoch 57/250
Epoch 58/250
Epoch 59/250
Epoch 60/250
Epoch 61/250
Epoch 62/250
Epoch 63/250
Epoch 64/250
Epoch 65/250
Epoch 66/250
Epoch 67/250
Epoch 68/250
Epoch 69/250
Epoch 70/250
Epoch 71/250
Epoch 72/250
Epoch 73/250
Epoch 74/250
Epoch 75/250
Epoch 76/250
Epoch 77/250
Epoch 78

In [14]:
def logits_to_sentence(logits, tokenizer):
    """Decode a 2-D score array into a space-separated sentence.

    Args:
        logits: Array with one row per output position and one column per
            vocabulary id; the highest-scoring column of each row is used.
        tokenizer: Object exposing a ``word_index`` mapping of word -> id.

    Returns:
        The words for the best id of every row joined by single spaces.
        Id 0 is rendered as ``'<empty>'``.
    """
    word_index = tokenizer.word_index
    # Invert word -> id into id -> word, then name the reserved id 0.
    id_to_word = dict(zip(word_index.values(), word_index.keys()))
    id_to_word[0] = '<empty>'

    best_ids = np.argmax(logits, axis=1)
    return ' '.join(id_to_word[token_id] for token_id in best_ids)

In [15]:
# Position of the corpus sentence pair used by the demo functions.
index = 1

def eng_to_hau(model=None):
    """Print the English -> Hausa prediction for the corpus sentence at ``index``.

    Args:
        model: A model trained with English inputs and Hausa targets. When
            omitted, ``enc_dec_model`` is used for backward compatibility.
            That model is fitted with Hausa inputs and English targets, so
            its output classes are English word ids and decoding them with
            the Hausa tokenizer does not yield a real translation; a warning
            is emitted in that case.
    """
    if model is None:
        import warnings

        warnings.warn(
            "eng_to_hau() was called without an English-to-Hausa model; "
            "falling back to enc_dec_model, which is trained Hausa-to-English, "
            "so the predicted sentence is not a real translation.",
            stacklevel=2,
        )
        model = enc_dec_model

    print()
    print("The hausa sentence is: {}".format(hausa_sentences[index]))
    print()
    print('The predicted sentence is :')
    # Slice (rather than index) to keep the batch axis expected by predict().
    print(logits_to_sentence(model.predict(eng_pad_sentence[index:index+1])[0], hau_text_tokenizer))
    print()
    print("The english sentence is: {}".format(english_sentences[index]))
    
def hau_to_eng():
    """Print the Hausa -> English prediction for the corpus sentence at ``index``.

    Shows the cleaned Hausa source sentence, the sentence decoded from
    ``enc_dec_model``'s prediction, and the cleaned English reference.
    """
    print("The hausa sentence is: {}".format(hausa_sentences[index]))
    print()
    print('The predicted sentence is :')
    # A one-element slice keeps the batch axis that predict() expects.
    single_sentence_batch = hau_pad_sentence[index:index+1]
    predicted_scores = enc_dec_model.predict(single_sentence_batch)[0]
    print(logits_to_sentence(predicted_scores, eng_text_tokenizer))
    print()
    print("The english sentence is: {}".format(english_sentences[index]))

In [None]:
# Position of the corpus sentence pair used by the corpus-based demo functions.
index = 1

def eng_to_hau_inp(model=None):
    """Prompt for an English phrase and print its predicted Hausa translation.

    Args:
        model: A model trained with English inputs and Hausa targets. When
            omitted, ``enc_dec_model`` is used for backward compatibility.
            That model is fitted with Hausa inputs and English targets, so
            the result is not a real translation; a warning is emitted in
            that case.
    """
    if model is None:
        import warnings

        warnings.warn(
            "eng_to_hau_inp() was called without an English-to-Hausa model; "
            "falling back to enc_dec_model, which is trained Hausa-to-English, "
            "so the predicted sentence is not a real translation.",
            stacklevel=2,
        )
        model = enc_dec_model

    print("Enter word or phrase to translate: ")
    user_input = input()
    input_sentence = clean_sentence(user_input)
    # Encode with the tokenizer fitted on the training corpus so word ids
    # match the training vocabulary. The sentence is wrapped in a list: a bare
    # string would be treated as a sequence of one-character texts.
    # Words missing from the tokenizer's vocabulary are dropped.
    input_text_tokenized = eng_text_tokenizer.texts_to_sequences([input_sentence])
    # Pad (or cut) to the sequence length the model was built with.
    input_pad_sentence = pad_sequences(
        input_text_tokenized, model.input_shape[1], padding="post", truncating="post"
    )

    print()
    print("The english sentence is: {}".format(user_input))
    print()
    print('The predicted hausa sentence is :')
    # The batch holds exactly one sentence, so take row 0 of the prediction.
    print(logits_to_sentence(model.predict(input_pad_sentence)[0], hau_text_tokenizer))
    
def hau_to_eng_inp():
    """Prompt for a Hausa phrase and print its predicted English translation.

    The phrase is cleaned with ``clean_sentence``, encoded with the Hausa
    tokenizer fitted on the training corpus, padded to ``max_hausa_len`` and
    passed through ``enc_dec_model``.
    """
    print("Enter word or phrase to translate: ")
    user_input = input()
    input_sentence = clean_sentence(user_input)
    # Reuse the training tokenizer so word ids match what the model learned.
    # The sentence is wrapped in a list: a bare string would be treated as a
    # sequence of one-character texts. Words missing from the tokenizer's
    # vocabulary are dropped.
    input_text_tokenized = hau_text_tokenizer.texts_to_sequences([input_sentence])
    # Pad (or cut) to the input length the model was built with.
    input_pad_sentence = pad_sequences(
        input_text_tokenized, max_hausa_len, padding="post", truncating="post"
    )

    print()
    print("The hausa sentence is: {}".format(user_input))
    print()
    print('The predicted english sentence is :')
    # The batch holds exactly one sentence, so take row 0 of the prediction.
    print(logits_to_sentence(enc_dec_model.predict(input_pad_sentence)[0], eng_text_tokenizer))

In [17]:
# Ask which translation direction to demonstrate.
print("Do you want to translate Eng-Hau (enter E) or Hau-Eng? (enter H)")
choice = input()

# The corpus-based demos are used here; the interactive variants
# eng_to_hau_inp() / hau_to_eng_inp() can be called instead.
if choice in ('E', 'e'):
    eng_to_hau()
elif choice in ('H', 'h'):
    hau_to_eng()
else:
    print("Invalid choice, please enter E or H.")

Do you want to translate Eng-Hau (enter E) or Hau-Eng? (enter H)
e

The hausa sentence is: new delhi ap  firayim ministan indiya narendra modi zai halarci bikin aza harsashin ginin wurin bautar hindu a watan mai zuwa a wani wurin da ake takaddama da ke arewacin indiya a inda a da wannan wurin wania masallaci ne aka rugurguza shi karni na 16 wanda wasu masu tsattsauran raayi na hindu su ka yi a 1992 kamar yadda aka ji daga masu amintattu daaka dorawa alhakkin kula da ginin

The predicted sentence is :
da da da da da da da da da da da da da da da da na na na na na na na na na da da da da da <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty> <empty>

The english sentence is: new delhi ap  indian prime minister narendra modi will attend a groundbreaking ceremony ne