In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense

# Load the parallel English/Akan corpus into a pandas DataFrame
df = pd.read_json('dataset/final_data.json')

# Clean both text columns BEFORE any string concatenation.
# Concatenating first leaves a missing (NaN) Akan entry as NaN, so a later
# fillna('') produces a row without <start>/<end> markers, and a non-string
# entry would fail the concatenation before astype(str) ever ran.
# fillna must come before astype(str); otherwise NaN becomes the text "nan".
df["english"] = df["english"].fillna('').astype(str)
df["akan"] = df["akan"].fillna('').astype(str)

# Wrap every target sentence in start and end tokens
# NOTE(review): rows whose Akan text is missing now become bare
# "<start>  <end>" targets; consider dropping such rows instead -- confirm
# against how the dataset was built.
df["akan"] = "<start> " + df["akan"] + " <end>"

# Word-level tokenization.
# filters='' disables character filtering, which keeps the angle brackets of
# the <start>/<end> tokens intact.
max_vocab_size = 1000000
tokenizer_english = Tokenizer(num_words=max_vocab_size, filters='')  # English is the source
tokenizer_akan = Tokenizer(num_words=max_vocab_size, filters='')     # Akan is the target

# Build each vocabulary from its own column
tokenizer_english.fit_on_texts(df["english"])
tokenizer_akan.fit_on_texts(df["akan"])

# Convert the texts to sequences of integer token ids
english_sequences = tokenizer_english.texts_to_sequences(df["english"])
akan_sequences = tokenizer_akan.texts_to_sequences(df["akan"])

# Post-pad every sequence up to the longest one in its language so each
# array has a single, fixed width
max_english_length = max(len(seq) for seq in english_sequences)
max_akan_length = max(len(seq) for seq in akan_sequences)

english_padded = pad_sequences(english_sequences, maxlen=max_english_length, padding="post")
akan_padded = pad_sequences(akan_sequences, maxlen=max_akan_length, padding="post")

# Vocabulary sizes: +1 because id 0 is used for padding and is not in word_index
english_vocab_size = len(tokenizer_english.word_index) + 1
akan_vocab_size = len(tokenizer_akan.word_index) + 1

# Layer sizes shared by the encoder and the decoder
embedding_dim = 256
lstm_units = 512

# --- Encoder: consumes the padded English token ids ---
encoder_inputs = Input(shape=(max_english_length,))
encoder_embedding_layer = Embedding(
    input_dim=english_vocab_size,
    output_dim=embedding_dim,
    mask_zero=True,  # treat id 0 as padding to be masked
)
encoder_embedding = encoder_embedding_layer(encoder_inputs)
encoder_lstm = LSTM(lstm_units, return_state=True)
# The final hidden and cell states are what gets handed to the decoder
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# --- Decoder: produces a distribution over the Akan vocabulary per time step ---
decoder_inputs = Input(shape=(max_akan_length,))
decoder_embedding_layer = Embedding(
    input_dim=akan_vocab_size,
    output_dim=embedding_dim,
    mask_zero=True,
)
decoder_embedding = decoder_embedding_layer(decoder_inputs)
decoder_lstm = LSTM(lstm_units, return_sequences=True, return_state=True)
# The decoder LSTM starts from the encoder's final states; its own returned
# states are not needed here
decoder_lstm_outputs, _, _ = decoder_lstm(
    decoder_embedding,
    initial_state=encoder_states,
)
decoder_dense = Dense(akan_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_lstm_outputs)

# Assemble, compile and print the two-input training model
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

# Decoder labels: drop the first token of every row and append one column of
# zeros, so the label at step t is the decoder input token at step t + 1
akan_padded_target = np.hstack(
    [akan_padded[:, 1:], np.zeros_like(akan_padded[:, :1])]
)

# Training settings
batch_size, epochs = 32, 3




Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 1585)]       0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 32)]         0           []                               
                                                                                                  
 embedding (Embedding)          (None, 1585, 256)    17671936    ['input_1[0][0]']                
                                                                                                  
 embedding_1 (Embedding)        (None, 32, 256)      2770176     ['input_2[0][0]']                
                                                                                              

In [None]:
# Train: the encoder is fed the English ids, the decoder is fed the Akan ids,
# and the labels are the Akan ids shifted one step to the left
model.fit(
    x=[english_padded, akan_padded],
    y=akan_padded_target,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2,  # hold out 20% of the samples for validation
)


In [3]:
### Persisting the fitted tokenizers and the trained model
import pickle

# Pickle each tokenizer into its own file (Akan first, then English)
for pkl_path, tokenizer in (
    ("tokenizer_akan.pkl", tokenizer_akan),
    ("tokenizer_english.pkl", tokenizer_english),
):
    with open(pkl_path, "wb") as f:
        pickle.dump(tokenizer, f)

# Store the model in the .keras format
model.save('model.keras')