In [None]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import re
import string
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, RepeatVector, TimeDistributed, Dense, Concatenate, AdditiveAttention
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from pickle import dump
import tensorflow as tf

# 1. Data Cleaning
def clean_text(text, is_sinhala=False):
    """Normalise one sentence before tokenisation.

    Sinhala text (``is_sinhala=True``) only has its leading/trailing
    whitespace removed.  Any other text is lower-cased, has ASCII punctuation
    and digit runs deleted, and has whitespace runs collapsed to one space.
    """
    if is_sinhala:
        return text.strip()

    # (pattern, replacement) pairs, applied in this order.
    substitutions = (
        (r"[%s]" % re.escape(string.punctuation), ""),
        (r"\d+", ""),
        (r"\s+", " "),
    )
    normalised = text.lower()
    for pattern, replacement in substitutions:
        normalised = re.sub(pattern, replacement, normalised)
    return normalised.strip()

# 2. Load and Clean Data
print("Loading dataset...")
df = pd.read_csv('/content/drive/MyDrive/Nlp/cleaned_dataset (1).csv')  # Update path as needed
# Keep only rows where all three translations are present, as strings.
df = df[['English', 'Sinhala', 'Singlish']].dropna().astype(str)

print("Cleaning data...")
# Walk the three columns in lock-step; only the Sinhala column takes the
# whitespace-only cleaning path.
cleaned = [
    [clean_text(english), clean_text(sinhala, True), clean_text(singlish)]
    for english, sinhala, singlish in zip(df['English'], df['Sinhala'], df['Singlish'])
]
text = np.array(cleaned)

# Shuffle the rows reproducibly, then hold out the last 10% as the test split.
np.random.seed(42)
np.random.shuffle(text)
split = int(0.9 * len(text))
train = text[:split]
test = text[split:]

# 3. Tokenization & Sequence Preparation
def create_tokenizer(lines):
    """Return a Keras ``Tokenizer`` fitted on ``lines``.

    The tokenizer is created with an empty ``filters`` string and with
    ``<UNK>`` as its out-of-vocabulary token.
    """
    fitted = Tokenizer(filters='', oov_token="<UNK>")
    fitted.fit_on_texts(lines)
    return fitted

def max_length(lines):
    """Return the largest whitespace-separated token count among ``lines``."""
    token_counts = [len(sentence.split()) for sentence in lines]
    return max(token_counts)

# One tokenizer per column of `text` (0 = English, 1 = Sinhala, 2 = Singlish).
eng_tokenizer, sinhala_tokenizer, singlish_tokenizer = (
    create_tokenizer(text[:, column]) for column in range(3)
)

# Vocabulary size is the number of known words plus one extra slot.
eng_vocab_size, sinhala_vocab_size, singlish_vocab_size = (
    len(tokenizer.word_index) + 1
    for tokenizer in (eng_tokenizer, sinhala_tokenizer, singlish_tokenizer)
)

# Longest sentence (in tokens) per language, used as the padded length.
eng_max_length, sinhala_max_length, singlish_max_length = (
    max_length(text[:, column]) for column in range(3)
)

def encode_sequences(tokenizer, max_length, lines):
    """Convert sentences to an integer matrix padded at the end.

    ``lines`` are mapped to id sequences by ``tokenizer`` and then passed to
    ``pad_sequences`` with ``maxlen=max_length`` and ``padding='post'``.
    """
    id_sequences = tokenizer.texts_to_sequences(lines)
    padded = pad_sequences(id_sequences, maxlen=max_length, padding='post')
    return padded

# English source sentences (model input)
trainX = encode_sequences(eng_tokenizer, eng_max_length, train[:, 0])
testX = encode_sequences(eng_tokenizer, eng_max_length, test[:, 0])

# Sinhala targets
train_sinY = encode_sequences(sinhala_tokenizer, sinhala_max_length, train[:, 1])
test_sinY = encode_sequences(sinhala_tokenizer, sinhala_max_length, test[:, 1])

# Singlish targets
train_singY = encode_sequences(singlish_tokenizer, singlish_max_length, train[:, 2])
test_singY = encode_sequences(singlish_tokenizer, singlish_max_length, test[:, 2])

# 4. Model Definition with Additive Attention
def define_multitask_model(src_vocab, sin_vocab, sing_vocab, src_len, sin_len, sing_len, n_units):
    """Build and compile a one-encoder / two-decoder translation model.

    A single encoder reads the source sentence; two structurally identical
    decoders (Sinhala and Singlish) each produce a per-timestep softmax over
    their own vocabulary.

    Args:
        src_vocab: size of the source embedding table.
        sin_vocab: number of classes of the Sinhala softmax.
        sing_vocab: number of classes of the Singlish softmax.
        src_len: length of the (padded) source sequence.
        sin_len: number of timesteps the Sinhala decoder emits.
        sing_len: number of timesteps the Singlish decoder emits.
        n_units: width used for the embedding, every LSTM and every
            projection layer.

    Returns:
        A compiled ``Model`` with one input and two outputs named
        ``"sinhala_output"`` and ``"singlish_output"`` (in that order).
    """
    # Encoder: embed the source ids, run an LSTM that returns both the full
    # output sequence (for attention) and its final states (to seed decoders).
    encoder_inputs = Input(shape=(src_len,))
    enc_emb = Embedding(src_vocab, n_units)(encoder_inputs)
    encoder_lstm = LSTM(n_units, return_sequences=True, return_state=True)
    encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)
    # Per-timestep projection of the encoder outputs, shared by both decoders.
    encoder_proj = TimeDistributed(Dense(n_units))(encoder_outputs)

    # Sinhala Decoder
    # The decoder input is the encoder's final hidden state repeated once per
    # target timestep; its LSTM also starts from the encoder's final states.
    sin_repeat = RepeatVector(sin_len)(state_h)
    sin_decoder_lstm = LSTM(n_units, return_sequences=True)
    sin_decoder_outputs = sin_decoder_lstm(sin_repeat, initial_state=[state_h, state_c])
    sin_decoder_proj = TimeDistributed(Dense(n_units))(sin_decoder_outputs)
    sin_attention = AdditiveAttention(use_scale=True)([sin_decoder_proj, encoder_proj])  # inputs are [query, value]
    # The classifier sees the raw decoder output alongside the attention result.
    sin_concat = Concatenate()([sin_decoder_outputs, sin_attention])
    sin_out = TimeDistributed(Dense(sin_vocab, activation='softmax'), name="sinhala_output")(sin_concat)

    # Singlish Decoder
    # Same structure as the Sinhala decoder, with its own layers and length.
    sing_repeat = RepeatVector(sing_len)(state_h)
    sing_decoder_lstm = LSTM(n_units, return_sequences=True)
    sing_decoder_outputs = sing_decoder_lstm(sing_repeat, initial_state=[state_h, state_c])
    sing_decoder_proj = TimeDistributed(Dense(n_units))(sing_decoder_outputs)
    sing_attention = AdditiveAttention(use_scale=True)([sing_decoder_proj, encoder_proj])  # inputs are [query, value]
    sing_concat = Concatenate()([sing_decoder_outputs, sing_attention])
    sing_out = TimeDistributed(Dense(sing_vocab, activation='softmax'), name="singlish_output")(sing_concat)

    model = Model(inputs=encoder_inputs, outputs=[sin_out, sing_out])
    # Loss and metric dictionaries are keyed by the output layer names above.
    # Sparse cross-entropy is used, so targets are integer ids, not one-hot.
    model.compile(
        optimizer='adam',
        loss={'sinhala_output': 'sparse_categorical_crossentropy',
              'singlish_output': 'sparse_categorical_crossentropy'},
        metrics={'sinhala_output': 'accuracy', 'singlish_output': 'accuracy'}
    )

    return model


print("Building model...")
# Keyword arguments make the vocabulary/length pairing explicit.
model = define_multitask_model(
    src_vocab=eng_vocab_size,
    sin_vocab=sinhala_vocab_size,
    sing_vocab=singlish_vocab_size,
    src_len=eng_max_length,
    sin_len=sinhala_max_length,
    sing_len=singlish_max_length,
    n_units=256,
)

# 5. Training
# All three callbacks watch the combined validation loss.
# NOTE(review): the validation data below is the held-out test split, so
# checkpoint selection and early stopping are tuned on the test data.
callbacks = [
    ModelCheckpoint(
        'multitask_nmt_best.h5',
        monitor='val_loss',
        save_best_only=True,
        verbose=1,
    ),
    EarlyStopping(monitor='val_loss', patience=4, restore_best_weights=True, verbose=1),
    ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, verbose=1),
]

print("Training model...")
history = model.fit(
    x=trainX,
    y=dict(sinhala_output=train_sinY, singlish_output=train_singY),
    validation_data=(
        testX,
        dict(sinhala_output=test_sinY, singlish_output=test_singY),
    ),
    batch_size=64,
    epochs=30,
    callbacks=callbacks,
)

# 6. Save Model and Tokenizers
print("Saving model and tokenizers...")
model.save('multitask_nmt_final.h5')
# Pickle each tokenizer inside a `with` block so the file handle is flushed
# and closed deterministically instead of being left for garbage collection.
with open('eng_tokenizer.pkl', 'wb') as f:
    dump(eng_tokenizer, f)
with open('sinhala_tokenizer.pkl', 'wb') as f:
    dump(sinhala_tokenizer, f)
with open('singlish_tokenizer.pkl', 'wb') as f:
    dump(singlish_tokenizer, f)

# 7. Prediction with Confidence Score
def word_for_id(integer, tokenizer):
    """Return the word whose vocabulary id is ``integer``, or ``None``.

    Args:
        integer: vocabulary id to look up (a Python or NumPy integer).
        tokenizer: object exposing ``word_index`` (word -> id) and,
            optionally, ``index_word`` (id -> word).

    Returns:
        The matching word, or ``None`` when no word has that id.
    """
    # Prefer the tokenizer's own id -> word map: a single dict lookup instead
    # of scanning the whole vocabulary on every call.
    reverse_index = getattr(tokenizer, "index_word", None)
    if reverse_index:
        return reverse_index.get(integer)
    # Fallback for tokenizer-like objects that only expose `word_index`.
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None

def predict_sequence_with_confidence(model, source, target_tokenizer, output_name):
    """Greedy-decode the first sentence of ``source`` and score the result.

    Args:
        model: two-output model; ``model.predict`` must return the Sinhala
            output first and the Singlish output second.
        source: encoded source batch; only the first row's prediction is used.
        target_tokenizer: tokenizer of the target language; needs
            ``word_index`` and may provide ``index_word``.
        output_name: ``'sinhala_output'`` selects the first model output, any
            other value selects the second.

    Returns:
        ``(sentence, confidence)``.  ``sentence`` joins, with single spaces,
        the words of the per-timestep argmax ids; ids that map to no word are
        dropped.  ``confidence`` is the mean of the per-timestep maximum
        probabilities over *every* timestep, including those whose id was
        dropped from ``sentence`` (presumably mostly padding positions).
    """
    output_index = 0 if output_name == 'sinhala_output' else 1
    prediction = np.asarray(model.predict(source, verbose=0)[output_index][0])

    # Resolve ids through one id -> word table instead of scanning the whole
    # vocabulary twice for every timestep.
    id_to_word = getattr(target_tokenizer, "index_word", None)
    if not id_to_word:
        id_to_word = {}
        for word, index in target_tokenizer.word_index.items():
            # setdefault keeps the first word seen for an id.
            id_to_word.setdefault(index, word)

    integers = prediction.argmax(axis=-1)
    probs = prediction.max(axis=-1)
    candidates = (id_to_word.get(int(i)) for i in integers)
    words = [word for word in candidates if word]
    confidence = np.mean(probs)
    return ' '.join(words), confidence

# Example
example_eng = ['how are you']
# Clean the input the same way the training sentences were cleaned, so that
# casing, punctuation or digits cannot produce out-of-vocabulary tokens.
example_eng_clean = [clean_text(s) for s in example_eng]
source = encode_sequences(eng_tokenizer, eng_max_length, example_eng_clean)
sin_pred, sin_conf = predict_sequence_with_confidence(model, source, sinhala_tokenizer, 'sinhala_output')
sing_pred, sing_conf = predict_sequence_with_confidence(model, source, singlish_tokenizer, 'singlish_output')

print("English:", example_eng[0])
print("Sinhala Prediction:", sin_pred)
print("Sinhala Confidence:", sin_conf)
print("Singlish Prediction:", sing_pred)
print("Singlish Confidence:", sing_conf)


Loading dataset...
Cleaning data...
Building model...
Training model...
Epoch 1/30
[1m484/484[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 436ms/step - loss: 4.6171 - singlish_output_accuracy: 0.7948 - singlish_output_loss: 2.2069 - sinhala_output_accuracy: 0.7879 - sinhala_output_loss: 2.4102
Epoch 1: val_loss improved from inf to 3.12991, saving model to multitask_nmt_best.h5
[1m484/484[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m253s[0m 470ms/step - loss: 4.6148 - singlish_output_accuracy: 0.7948 - singlish_output_loss: 2.2058 - sinhala_output_accuracy: 0.7879 - sinhala_output_loss: 2.4090 - val_loss: 3.1299 - val_singlish_output_accuracy: 0.8061 - val_singlish_output_loss: 1.4996 - val_sinhala_output_accuracy: 0.7993 - val_sinhala_output_loss: 1.6294 - learning_rate: 0.0010
Epoch 2/30
[1m484/484[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 436ms/step - loss: 3.0730 - singlish_output_accuracy: 0.8071 - singlish_output_loss: 1.4768 - sinhala_output_accuracy: 

In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive. The dataset, model and tokenizer
# paths read elsewhere in this notebook live under /content/drive/MyDrive,
# so this cell has to run before any cell that reads them.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
example_eng = ['i love you']
# Clean the input the same way the training sentences were cleaned, so that
# casing, punctuation or digits cannot produce out-of-vocabulary tokens.
example_eng_clean = [clean_text(s) for s in example_eng]
source = encode_sequences(eng_tokenizer, eng_max_length, example_eng_clean)
sin_pred, sin_conf = predict_sequence_with_confidence(model, source, sinhala_tokenizer, 'sinhala_output')
sing_pred, sing_conf = predict_sequence_with_confidence(model, source, singlish_tokenizer, 'singlish_output')

print("English:", example_eng[0])
print("Sinhala Prediction:", sin_pred)
print("Sinhala Confidence:", sin_conf)
print("Singlish Prediction:", sing_pred)
print("Singlish Confidence:", sing_conf)

English: i love you
Sinhala Prediction: මම ඔයාට ආදරෙයි
Sinhala Confidence: 0.98447174
Singlish Prediction: mama oyata adareyi
Singlish Confidence: 0.9857285


In [3]:
from tensorflow.keras.models import load_model
from pickle import load

# Load the trained model
# NOTE(review): the training cell writes 'multitask_nmt_final.h5' and the
# tokenizer pickles using relative paths, while this cell reads them from
# Google Drive - presumably they were copied there by hand; confirm.
model = load_model('/content/drive/MyDrive/Nlp/multitask_nmt_final.h5')

# Load the tokenizers
# pickle.load can execute arbitrary code, so only point these paths at
# pickle files you produced yourself.
with open('/content/drive/MyDrive/Nlp/eng_tokenizer.pkl', 'rb') as f:
    eng_tokenizer = load(f)
with open('/content/drive/MyDrive/Nlp/sinhala_tokenizer.pkl', 'rb') as f:
    sinhala_tokenizer = load(f)
with open('/content/drive/MyDrive/Nlp/singlish_tokenizer.pkl', 'rb') as f:
    singlish_tokenizer = load(f)



In [4]:
import re
import string
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

def clean_text(text, is_sinhala=False):
    """Normalise one sentence before tokenisation.

    Sinhala text (``is_sinhala=True``) only has its leading/trailing
    whitespace removed.  Any other text is lower-cased, has ASCII punctuation
    and digit runs deleted, and has whitespace runs collapsed to one space.
    """
    if is_sinhala:
        return text.strip()

    # (pattern, replacement) pairs, applied in this order.
    substitutions = (
        (r"[%s]" % re.escape(string.punctuation), ""),
        (r"\d+", ""),
        (r"\s+", " "),
    )
    normalised = text.lower()
    for pattern, replacement in substitutions:
        normalised = re.sub(pattern, replacement, normalised)
    return normalised.strip()

def encode_sequences(tokenizer, max_length, lines):
    """Convert sentences to an integer matrix padded at the end.

    ``lines`` are mapped to id sequences by ``tokenizer`` and then passed to
    ``pad_sequences`` with ``maxlen=max_length`` and ``padding='post'``.
    """
    id_sequences = tokenizer.texts_to_sequences(lines)
    padded = pad_sequences(id_sequences, maxlen=max_length, padding='post')
    return padded

In [7]:
def word_for_id(integer, tokenizer):
    """Return the word whose vocabulary id is ``integer``, or ``None``.

    Args:
        integer: vocabulary id to look up (a Python or NumPy integer).
        tokenizer: object exposing ``word_index`` (word -> id) and,
            optionally, ``index_word`` (id -> word).

    Returns:
        The matching word, or ``None`` when no word has that id.
    """
    # Prefer the tokenizer's own id -> word map: a single dict lookup instead
    # of scanning the whole vocabulary on every call.
    reverse_index = getattr(tokenizer, "index_word", None)
    if reverse_index:
        return reverse_index.get(integer)
    # Fallback for tokenizer-like objects that only expose `word_index`.
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None

def predict_sequence_with_confidence(model, source, target_tokenizer, output_name):
    """Greedy-decode the first sentence of ``source`` and score the result.

    Args:
        model: two-output model; ``model.predict`` must return the Sinhala
            output first and the Singlish output second.
        source: encoded source batch; only the first row's prediction is used.
        target_tokenizer: tokenizer of the target language; needs
            ``word_index`` and may provide ``index_word``.
        output_name: ``'sinhala_output'`` selects the first model output, any
            other value selects the second.

    Returns:
        ``(sentence, confidence)``.  ``sentence`` joins, with single spaces,
        the words of the per-timestep argmax ids; ids that map to no word are
        dropped.  ``confidence`` is the mean of the per-timestep maximum
        probabilities over *every* timestep, including those whose id was
        dropped from ``sentence`` (presumably mostly padding positions).
    """
    output_index = 0 if output_name == 'sinhala_output' else 1
    prediction = np.asarray(model.predict(source, verbose=0)[output_index][0])

    # Resolve ids through one id -> word table instead of scanning the whole
    # vocabulary twice for every timestep.
    id_to_word = getattr(target_tokenizer, "index_word", None)
    if not id_to_word:
        id_to_word = {}
        for word, index in target_tokenizer.word_index.items():
            # setdefault keeps the first word seen for an id.
            id_to_word.setdefault(index, word)

    integers = prediction.argmax(axis=-1)
    probs = prediction.max(axis=-1)
    candidates = (id_to_word.get(int(i)) for i in integers)
    words = [word for word in candidates if word]
    confidence = np.mean(probs)
    return ' '.join(words), confidence

# Example usage:
example_eng = ['let s see']
# Run the input through the same cleaning function before encoding it.
example_eng_clean = [clean_text(s) for s in example_eng]
# The model was loaded from disk in this session, so the padded source length
# is read back from the model's input shape rather than recomputed.
eng_max_length = model.input_shape[1]  # Or use the value you used during training
source = encode_sequences(eng_tokenizer, eng_max_length, example_eng_clean)

# One call per output head; each call runs model.predict on the same source.
sin_pred, sin_conf = predict_sequence_with_confidence(model, source, sinhala_tokenizer, 'sinhala_output')
sing_pred, sing_conf = predict_sequence_with_confidence(model, source, singlish_tokenizer, 'singlish_output')

# Print the original (uncleaned) input next to both predictions.
print("English:", example_eng[0])
print("Sinhala Prediction:", sin_pred)
print("Sinhala Confidence:", sin_conf)
print("Singlish Prediction:", sing_pred)
print("Singlish Confidence:", sing_conf)


English: let s see
Sinhala Prediction: අපි බලමු
Sinhala Confidence: 0.9850395
Singlish Prediction: api balamu
Singlish Confidence: 0.9888362


In [8]:
example_eng = ['she already know']
# Clean the input the same way the training sentences were cleaned, so that
# casing, punctuation or digits cannot produce out-of-vocabulary tokens.
example_eng_clean = [clean_text(s) for s in example_eng]
source = encode_sequences(eng_tokenizer, eng_max_length, example_eng_clean)
sin_pred, sin_conf = predict_sequence_with_confidence(model, source, sinhala_tokenizer, 'sinhala_output')
sing_pred, sing_conf = predict_sequence_with_confidence(model, source, singlish_tokenizer, 'singlish_output')

print("English:", example_eng[0])
print("Sinhala Prediction:", sin_pred)
print("Sinhala Confidence:", sin_conf)
print("Singlish Prediction:", sing_pred)
print("Singlish Confidence:", sing_conf)

English: she already know
Sinhala Prediction: ඇය දැනටමත් දන්නවා
Sinhala Confidence: 0.9875726
Singlish Prediction: eya denatamath dannawa
Singlish Confidence: 0.9853925


In [None]:
example_eng = ['now this']
# Clean the input the same way the training sentences were cleaned, so that
# casing, punctuation or digits cannot produce out-of-vocabulary tokens.
example_eng_clean = [clean_text(s) for s in example_eng]
source = encode_sequences(eng_tokenizer, eng_max_length, example_eng_clean)
sin_pred, sin_conf = predict_sequence_with_confidence(model, source, sinhala_tokenizer, 'sinhala_output')
sing_pred, sing_conf = predict_sequence_with_confidence(model, source, singlish_tokenizer, 'singlish_output')

print("English:", example_eng[0])
print("Sinhala Prediction:", sin_pred)
print("Sinhala Confidence:", sin_conf)
print("Singlish Prediction:", sing_pred)
print("Singlish Confidence:", sing_conf)

English: now this
Sinhala Prediction: දැන් මේක
Sinhala Confidence: 0.9944297
Singlish Prediction: den meka
Singlish Confidence: 0.99373496


In [None]:
example_eng = ['i know']
# Clean the input the same way the training sentences were cleaned, so that
# casing, punctuation or digits cannot produce out-of-vocabulary tokens.
example_eng_clean = [clean_text(s) for s in example_eng]
source = encode_sequences(eng_tokenizer, eng_max_length, example_eng_clean)
sin_pred, sin_conf = predict_sequence_with_confidence(model, source, sinhala_tokenizer, 'sinhala_output')
sing_pred, sing_conf = predict_sequence_with_confidence(model, source, singlish_tokenizer, 'singlish_output')

print("English:", example_eng[0])
print("Sinhala Prediction:", sin_pred)
print("Sinhala Confidence:", sin_conf)
print("Singlish Prediction:", sing_pred)
print("Singlish Confidence:", sing_conf)

English: i know
Sinhala Prediction: මම දන්නවා
Sinhala Confidence: 0.99402505
Singlish Prediction: mama dannawa
Singlish Confidence: 0.995863


In [9]:
example_eng = ['my home']
# Clean the input the same way the training sentences were cleaned, so that
# casing, punctuation or digits cannot produce out-of-vocabulary tokens.
example_eng_clean = [clean_text(s) for s in example_eng]
source = encode_sequences(eng_tokenizer, eng_max_length, example_eng_clean)
sin_pred, sin_conf = predict_sequence_with_confidence(model, source, sinhala_tokenizer, 'sinhala_output')
sing_pred, sing_conf = predict_sequence_with_confidence(model, source, singlish_tokenizer, 'singlish_output')

print("English:", example_eng[0])
print("Sinhala Prediction:", sin_pred)
print("Sinhala Confidence:", sin_conf)
print("Singlish Prediction:", sing_pred)
print("Singlish Confidence:", sing_conf)

English: my home
Sinhala Prediction: මගේ ගෙදර
Sinhala Confidence: 0.9987499
Singlish Prediction: mage gedara
Singlish Confidence: 0.9980909


In [10]:
example_eng = ['let s see ']
# Clean the input the same way the training sentences were cleaned, so that
# casing, punctuation, digits or stray whitespace cannot produce
# out-of-vocabulary tokens.
example_eng_clean = [clean_text(s) for s in example_eng]
source = encode_sequences(eng_tokenizer, eng_max_length, example_eng_clean)
sin_pred, sin_conf = predict_sequence_with_confidence(model, source, sinhala_tokenizer, 'sinhala_output')
sing_pred, sing_conf = predict_sequence_with_confidence(model, source, singlish_tokenizer, 'singlish_output')

print("English:", example_eng[0])
print("Sinhala Prediction:", sin_pred)
print("Sinhala Confidence:", sin_conf)
print("Singlish Prediction:", sing_pred)
print("Singlish Confidence:", sing_conf)

English: let s see 
Sinhala Prediction: අපි බලමු
Sinhala Confidence: 0.9850395
Singlish Prediction: api balamu
Singlish Confidence: 0.9888362


In [None]:
example_eng = ['i know']
# Clean the input the same way the training sentences were cleaned, so that
# casing, punctuation or digits cannot produce out-of-vocabulary tokens.
example_eng_clean = [clean_text(s) for s in example_eng]
source = encode_sequences(eng_tokenizer, eng_max_length, example_eng_clean)
sin_pred, sin_conf = predict_sequence_with_confidence(model, source, sinhala_tokenizer, 'sinhala_output')
sing_pred, sing_conf = predict_sequence_with_confidence(model, source, singlish_tokenizer, 'singlish_output')

print("English:", example_eng[0])
print("Sinhala Prediction:", sin_pred)
print("Sinhala Confidence:", sin_conf)
print("Singlish Prediction:", sing_pred)
print("Singlish Confidence:", sing_conf)

English: i know
Sinhala Prediction: මම දන්නවා
Sinhala Confidence: 0.99402505
Singlish Prediction: mama dannawa
Singlish Confidence: 0.995863


In [11]:
example_eng = ['her dog']
# Clean the input the same way the training sentences were cleaned, so that
# casing, punctuation or digits cannot produce out-of-vocabulary tokens.
example_eng_clean = [clean_text(s) for s in example_eng]
source = encode_sequences(eng_tokenizer, eng_max_length, example_eng_clean)
sin_pred, sin_conf = predict_sequence_with_confidence(model, source, sinhala_tokenizer, 'sinhala_output')
sing_pred, sing_conf = predict_sequence_with_confidence(model, source, singlish_tokenizer, 'singlish_output')

print("English:", example_eng[0])
print("Sinhala Prediction:", sin_pred)
print("Sinhala Confidence:", sin_conf)
print("Singlish Prediction:", sing_pred)
print("Singlish Confidence:", sing_conf)

English: her dog
Sinhala Prediction: ඇගේ බල්ලා
Sinhala Confidence: 0.99123794
Singlish Prediction: ege balla
Singlish Confidence: 0.99605316
