# Project Description

This project uses transformer-based NLP models to translate text between languages, namely English, Vietnamese, and Indonesian.

In [None]:
import pandas as pd
import numpy as np
import re
from unicodedata import normalize
import gc
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Input, Dense, Dropout, Masking
from keras_nlp.layers import TokenAndPositionEmbedding, TransformerEncoder, TransformerDecoder
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import EarlyStopping
import joblib

Using TensorFlow backend


In [None]:
# Mount Google Drive (Colab only) so files under /content/drive can be read.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Data Import

The dataset comes from Kaggle: [English and Indonesian subtitles](https://www.kaggle.com/datasets/greegtitan/english-indonesia-movie-subtitles). It contains the subtitles for episode 13 of a TV series (title not given).

## Column names

1. **id** = Indonesian translation of the subtitle
2. **en** = English subtitle

In [None]:
# Load the English/Indonesian subtitle CSV from the mounted Drive into `df`.
# NOTE(review): the path is hard-coded to one Drive layout; adjust it when running elsewhere.
df = pd.read_csv("/content/drive/My Drive/Colab Notebooks/english_indonesian_subtitles.csv")

# Data cleaning

- data types
- duplicates
- missing values
- unique values
- erroneous values


In [None]:
# Drop the first row, remove duplicate rows, then renumber the index.
# The reset happens last so the index is contiguous after de-duplication
# (resetting before drop_duplicates leaves gaps where rows were removed).
# NOTE: re-running this cell drops one more row each time, because label 0
# exists again after the reset; reload the CSV before executing it again.
df = df.drop(index=[0]).drop_duplicates().reset_index(drop=True)

# Cleaning text
print("Cleaning text...\n")

def clean_text(text):
    """Lower-case a string, NFD-normalise it and keep only ASCII letters and spaces.

    NFD decomposition splits accented characters into a base letter plus
    combining marks, so the regex keeps the base letter and drops the mark.
    Non-string values are returned unchanged.
    """
    if not isinstance(text, str):
        return text
    decomposed = normalize('NFD', text.lower())
    return re.sub('[^A-Za-z ]+', '', decomposed)

def clean_and_prepare_text(text):
    """Clean ``text`` and wrap it in '[start]' / '[end]' markers.

    Returns an empty string when ``text`` is not a string (e.g. NaN from a
    missing cell). The type check must come before the concatenation:
    ``clean_text`` passes non-strings through unchanged, and adding a str to
    a float/None raises ``TypeError``.
    """
    if not isinstance(text, str):
        return ''
    return '[start] ' + clean_text(text) + ' [end]'

# Apply the cleaning functions to the DataFrame, overwriting both columns.
# The Indonesian column additionally gets the '[start]' / '[end]' markers.
# NOTE(review): re-running this cell cleans the already-cleaned text again, so the
# existing markers lose their brackets and a second pair is added; run it once per load.
df['id'] = df['id'].apply(clean_and_prepare_text)
df['en'] = df['en'].apply(clean_text)
# NOTE(review): this is not the cell's last expression, so the preview is not displayed.
df.head()

# Column handles consumed by the tokenisation cell.
en = df['en']
indonesian = df['id']
print("\nCalculating maximum phrase length")

# NOTE(review): these lengths are hard-coded, not computed from the data, despite the
# message printed above. sequence_len is the padding length used when tokenising.
en_max_len = 106
id_max_len = 112
sequence_len = 112


Cleaning text...


Calculating maximum phrase length


In [None]:
print(f'Max phrase length (English): {en_max_len}')
print(f'Max phrase length (Indonesian): {id_max_len}')
print(f'Sequence length: {sequence_len}\n')

# Filter out non-string values and convert to lowercase.
# The two columns are filtered TOGETHER: dropping non-strings from each column
# independently would shift the lists against each other, pairing English
# sentences with the wrong Indonesian targets (or yielding arrays of unequal length).
paired_texts = [
    (en_text.lower(), id_text.lower())
    for en_text, id_text in zip(en, indonesian)
    if isinstance(en_text, str) and isinstance(id_text, str)
]
en_cleaned = [pair[0] for pair in paired_texts]
id_cleaned = [pair[1] for pair in paired_texts]
del paired_texts  # free the temporary list of tuples

# Tokenize and pad sequences
en_tokenizer = Tokenizer()
en_tokenizer.fit_on_texts(en_cleaned)
en_sequences = en_tokenizer.texts_to_sequences(en_cleaned)
en_x = pad_sequences(en_sequences, maxlen=sequence_len, padding='post')

# Custom filters omit '[' and ']' so the '[start]' / '[end]' markers survive as tokens.
id_tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@\\^_`{|}~\t\n')
id_tokenizer.fit_on_texts(id_cleaned)
id_sequences = id_tokenizer.texts_to_sequences(id_cleaned)
# One extra position: the target is later split into decoder input ([:, :-1])
# and expected output ([:, 1:]), each of length sequence_len.
id_y = pad_sequences(id_sequences, maxlen=sequence_len + 1, padding='post')

# Every source sentence must still line up with exactly one target sentence.
assert len(en_x) == len(id_y), "source/target sentence counts differ"

# Calculate the vocabulary sizes from the tokenizer instances
# (+1 because index 0 is reserved for padding).
en_vocab_size = len(en_tokenizer.word_index) + 1
id_vocab_size = len(id_tokenizer.word_index) + 1

print("Calculating vocabulary size")
print(f'Vocabulary size (English): {en_vocab_size}')
print(f'Vocabulary size (Indonesian): {id_vocab_size}\n')


Max phrase length (English): 106
Max phrase length (Indonesian): 112
Sequence length: 112

Calculating vocabulary size
Vocabulary size (English): 272792
Vocabulary size (Indonesian): 359908



In [None]:
print("Quickly clearing RAM\n")
gc.collect()

# Define model parameters
num_heads = 1
embed_dim = 10

# Define the model architecture: both inputs are variable-length batches of token ids.
encoder_input = Input(shape=(None,), dtype='int64', name='encoder_input')
decoder_input = Input(shape=(None,), dtype='int64', name='decoder_input')

# Encoder: token + position embedding followed by one transformer encoder block.
# mask_zero=True marks the zero padding added by pad_sequences so that downstream
# layers can ignore padded positions.
x_enc = TokenAndPositionEmbedding(en_vocab_size, sequence_len, embed_dim, mask_zero=True)(encoder_input)
encoder_output = TransformerEncoder(embed_dim, num_heads)(x_enc)

# Decoder: padding is masked at the embedding (mask_zero=True) instead of with a
# Masking(mask_value=0) layer on the embedding output. Masking only hides timesteps
# whose whole feature vector equals 0, which an embedded padding token never is,
# so the previous layer masked nothing.
x_dec = TokenAndPositionEmbedding(id_vocab_size, sequence_len, embed_dim, mask_zero=True)(decoder_input)

x_dec = TransformerDecoder(embed_dim, num_heads)(x_dec, encoder_output)
x_dec = Dropout(0.4)(x_dec)

# Per-position probability distribution over the Indonesian vocabulary.
decoder_output = Dense(id_vocab_size, activation='softmax')(x_dec)

# Wrap the decoder stack as its own sub-model, then wire it into the full model.
decoder = Model([decoder_input, encoder_output], decoder_output)
decoder_output = decoder([decoder_input, encoder_output])

model = Model([encoder_input, decoder_input], decoder_output)

# Define the SGD optimizer.
# NOTE(review): learning_rate=0.99 is far above the Keras SGD default of 0.01 and is
# likely to make training unstable; value kept as-is here, consider lowering it.
sgd_optimizer = tf.keras.optimizers.SGD(learning_rate=0.99)
model.compile(optimizer=sgd_optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Display model summary
model.summary(line_length=120)

# Teacher forcing: the decoder sees the target without its last token and is
# trained to predict the target shifted one position to the left.
inputs = {'encoder_input': en_x, 'decoder_input': id_y[:, :-1]}
outputs = id_y[:, 1:]

batch_size = 50  # Choose an appropriate batch size
# Computed for reference only; it is not passed to fit() below.
steps_per_epoch = len(en_x) // batch_size

# Fit the model, holding out the last 20% of samples for validation.
# NOTE(review): with epochs=1 the EarlyStopping callback (patience=3) can never trigger.
callback = EarlyStopping(monitor='val_accuracy', patience=3, restore_best_weights=True)
hist = model.fit(inputs, outputs, epochs=1, validation_split=0.2, callbacks=[callback], batch_size=batch_size)

# Saving the model
model.save("english_indonesian_translate_reduced.h5")



Quickly clearing RAM

Model: "model_1"
________________________________________________________________________________________________________________________
 Layer (type)                       Output Shape                        Param #     Connected to                        
 encoder_input (InputLayer)         [(None, None)]                      0           []                                  
                                                                                                                        
 token_and_position_embedding (Tok  (None, None, 10)                    2729040     ['encoder_input[0][0]']             
 enAndPositionEmbedding)                                                                                                
                                                                                                                        
 decoder_input (InputLayer)         [(None, None)]                      0           []                            