In [1]:
from google.colab import drive #connecting google drive with this file
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pathlib
import random
import string
import re
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import TextVectorization

In [8]:
filePath = "/content/drive/MyDrive/Text to Text/bangla-chakma.csv" # path of the bangla-chakma csv file

dataset = pd.read_csv(filePath)

print(dataset['bangla'].head())
print(dataset['chakma'].head())
print(len(dataset['bangla']),len(dataset['chakma']))

text_pairs = []

for i in range(len(dataset['bangla'])):
    text_pairs.append((dataset['bangla'][i],"[start] " + dataset['chakma'][i] + " [end]")) # adding bangla and chakma sentences on text_pairs list
    #[start] abd [end] token added to target sentences to identify sentence's start and end position

text_pairs[:5]


0    তোমার মা তোমার স্কুলের ব্যাপারে আসলেই অনেক চিন...
1                                    স্যার গেট খুলুন !
2                  এটা ভীষণ ঠাণ্ডা , সাঁতার কাটো রোস !
3                      আর কোটটা আমি তাকে দিয়ে দিয়েছি ।
4                           আমারা অনেক দেরি হয়ে গেছে ।
Name: bangla, dtype: object
0     ত মামা ত ইস্কুুল অ  ব্যাপারে বজমান চিন্তিত এল...
1                                  স্যার,দরজা আন হুল ।
2                                       ইয়ান যদবদে ঈন।
3                      আর সিলুম্মু মুই তারে দি এচ্চোং।
4                                 মিরে বজমান দিরি উয়ে।
Name: chakma, dtype: object
8862 8862


[('তোমার মা তোমার স্কুলের ব্যাপারে আসলেই অনেক চিন্তিত ছিল, বাবা।',
  '[start]  ত মামা ত ইস্কুুল অ  ব্যাপারে বজমান চিন্তিত এলহ,  পুত। [end]'),
 ('স্যার গেট খুলুন !', '[start] স্যার,দরজা আন হুল । [end]'),
 ('এটা ভীষণ ঠাণ্ডা , সাঁতার কাটো রোস !', '[start] ইয়ান যদবদে ঈন। [end]'),
 ('আর কোটটা আমি তাকে দিয়ে দিয়েছি ।',
  '[start] আর সিলুম্মু মুই তারে দি এচ্চোং। [end]'),
 ('আমারা অনেক দেরি হয়ে গেছে ।', '[start] মিরে বজমান দিরি উয়ে। [end]')]

In [9]:
# spliting train,test and validation pairs
random.shuffle(text_pairs)
num_val_samples = int(0.05 * len(text_pairs)) # 0.05 means the 5% of data
num_train_samples = len(text_pairs) - 2 * num_val_samples
train_pairs = text_pairs[:num_train_samples]
val_pairs = text_pairs[num_train_samples : num_train_samples + num_val_samples]
test_pairs = text_pairs[num_train_samples + num_val_samples :]

print(f"{len(text_pairs)} total pairs")
print(f"{len(train_pairs)} training pairs")
print(f"{len(val_pairs)} validation pairs")
print(f"{len(test_pairs)} test pairs")

8862 total pairs
7976 training pairs
443 validation pairs
443 test pairs


In [10]:
# this cell is for text vectorization

strip_chars = string.punctuation + "¿"
strip_chars = strip_chars.replace("[", "")
strip_chars = strip_chars.replace("]", "")

vocab_size = 15000
sequence_length = 20
batch_size = 64


def custom_standardization(input_string):
    lowercase = tf.strings.lower(input_string)
    return tf.strings.regex_replace(lowercase, "[%s]" % re.escape(strip_chars), "")


source_vectorization = TextVectorization(
    max_tokens=vocab_size, output_mode="int", output_sequence_length=sequence_length,
)
target_vectorization = TextVectorization(
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=sequence_length + 1,
    standardize=custom_standardization,
)

train_source_texts = [pair[0] for pair in train_pairs]
train_target_texts = [pair[1] for pair in train_pairs]

source_vectorization.adapt(train_source_texts)
target_vectorization.adapt(train_target_texts)

print(train_source_texts[:3],train_target_texts[:3])

['সেই সময়গুলোতে তোমাকে অতিরিক্ত সতর্ক থাকতে হবে । বাছা।', 'আমি পরে আসব !', 'এখানেই থেকো।'] ['[start] সে সময় আনিত তরে অতিরিক্ত সতর্ক তা পুরিবু, চিজি। [end]', '[start] মুই পরে এম । [end]', '[start] ইয়োত থেজ। [end]']


In [11]:

def format_dataset(source, target):
    source = source_vectorization(source)
    target = target_vectorization(target)
    return ({"encoder_inputs": source, "decoder_inputs": target[:, :-1],}, target[:, 1:])


def make_dataset(pairs):
    source_texts, target_texts = zip(*pairs)
    source_texts = list(source_texts)
    target_texts = list(target_texts)
    dataset = tf.data.Dataset.from_tensor_slices((source_texts, target_texts))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(format_dataset)
    return dataset.shuffle(2048).prefetch(16).cache()


train_ds = make_dataset(train_pairs)
val_ds = make_dataset(val_pairs)

In [12]:
for inputs, targets in train_ds.take(1):
    print(f'inputs["encoder_inputs"].shape: {inputs["encoder_inputs"].shape}')
    print(f'inputs["decoder_inputs"].shape: {inputs["decoder_inputs"].shape}')
    print(f"targets.shape: {targets.shape}")

inputs["encoder_inputs"].shape: (64, 20)
inputs["decoder_inputs"].shape: (64, 20)
targets.shape: (64, 20)


In [13]:
class TransformerEncoder(layers.Layer):
    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.dense_proj = keras.Sequential(
            [layers.Dense(dense_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.supports_masking = True

    def call(self, inputs, mask=None):
        if mask is not None:
            padding_mask = tf.cast(mask[:, tf.newaxis, :], dtype="int32")
        attention_output = self.attention(
            query=inputs, value=inputs, key=inputs, attention_mask=padding_mask
        )
        proj_input = self.layernorm_1(inputs + attention_output)
        proj_output = self.dense_proj(proj_input)
        return self.layernorm_2(proj_input + proj_output)
    def get_config(self):
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "dense_dim": self.dense_dim,
            "num_heads": self.num_heads,
        })
        return config


class PositionalEmbedding(layers.Layer):
    def __init__(self, sequence_length, vocab_size, embed_dim, **kwargs):
        super().__init__(**kwargs)
        self.token_embeddings = layers.Embedding(
            input_dim=vocab_size, output_dim=embed_dim
        )
        self.position_embeddings = layers.Embedding(
            input_dim=sequence_length, output_dim=embed_dim
        )
        self.sequence_length = sequence_length
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

    def call(self, inputs):
        length = tf.shape(inputs)[-1]
        positions = tf.range(start=0, limit=length, delta=1)
        embedded_tokens = self.token_embeddings(inputs)
        embedded_positions = self.position_embeddings(positions)
        return embedded_tokens + embedded_positions

    def compute_mask(self, inputs, mask=None):
        return tf.math.not_equal(inputs, 0)
    def get_config(self):
        config = super().get_config()
        config.update({
            "sequence_length": self.sequence_length,
            "vocab_size": self.vocab_size,
            "embed_dim": self.embed_dim,
        })
        return config


class TransformerDecoder(layers.Layer):
    def __init__(self, embed_dim, latent_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.latent_dim = latent_dim
        self.num_heads = num_heads
        self.attention_1 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.attention_2 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.dense_proj = keras.Sequential(
            [layers.Dense(latent_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.layernorm_3 = layers.LayerNormalization()
        self.supports_masking = True

    def call(self, inputs, encoder_outputs, mask=None):
        causal_mask = self.get_causal_attention_mask(inputs)
        if mask is not None:
            padding_mask = tf.cast(mask[:, tf.newaxis, :], dtype="int32")
            padding_mask = tf.minimum(padding_mask, causal_mask)

        attention_output_1 = self.attention_1(
            query=inputs, value=inputs, key=inputs, attention_mask=causal_mask
        )
        out_1 = self.layernorm_1(inputs + attention_output_1)

        attention_output_2 = self.attention_2(
            query=out_1,
            value=encoder_outputs,
            key=encoder_outputs,
            attention_mask=padding_mask,
        )
        out_2 = self.layernorm_2(out_1 + attention_output_2)

        proj_output = self.dense_proj(out_2)
        return self.layernorm_3(out_2 + proj_output)

    def get_causal_attention_mask(self, inputs):
        input_shape = tf.shape(inputs)
        batch_size, sequence_length = input_shape[0], input_shape[1]
        i = tf.range(sequence_length)[:, tf.newaxis]
        j = tf.range(sequence_length)
        mask = tf.cast(i >= j, dtype="int32")
        mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))
        mult = tf.concat(
            [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)],
            axis=0,
        )
        return tf.tile(mask, mult)
    def get_config(self):
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "latent_dim": self.latent_dim,
            "num_heads": self.num_heads,
        })
        return config


In [14]:
embed_dim = 256
latent_dim = 2048
num_heads = 8

encoder_inputs = keras.Input(shape=(None,), dtype="int64", name="encoder_inputs")
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(encoder_inputs)
encoder_outputs = TransformerEncoder(embed_dim, latent_dim, num_heads)(x)
encoder = keras.Model(encoder_inputs, encoder_outputs)

decoder_inputs = keras.Input(shape=(None,), dtype="int64", name="decoder_inputs")
encoded_seq_inputs = keras.Input(shape=(None, embed_dim), name="decoder_state_inputs")
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(decoder_inputs)
x = TransformerDecoder(embed_dim, latent_dim, num_heads)(x, encoded_seq_inputs)
x = layers.Dropout(0.5)(x)
decoder_outputs = layers.Dense(vocab_size, activation="softmax")(x)
decoder = keras.Model([decoder_inputs, encoded_seq_inputs], decoder_outputs)

decoder_outputs = decoder([decoder_inputs, encoder_outputs])
transformer = keras.Model(
    [encoder_inputs, decoder_inputs], decoder_outputs, name="transformer"
)

In [15]:
epochs = 30

transformer.compile(
    "rmsprop", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
)
transformer.fit(train_ds, epochs=epochs, validation_data=val_ds)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7a4803b00490>

In [16]:
target_vocab = target_vectorization.get_vocabulary()
target_index_lookup = dict(zip(range(len(target_vocab)), target_vocab))
max_decoded_sentence_length = 20


def decode_sequence(input_sentence):
    tokenized_input_sentence = source_vectorization([input_sentence])
    decoded_sentence = "[start]"
    for i in range(max_decoded_sentence_length):
        tokenized_target_sentence = target_vectorization([decoded_sentence])[:, :-1]
        predictions = transformer([tokenized_input_sentence, tokenized_target_sentence])

        sampled_token_index = np.argmax(predictions[0, i, :])
        sampled_token = target_index_lookup[sampled_token_index]
        decoded_sentence += " " + sampled_token

        if sampled_token == "[end]":
            break
    return decoded_sentence


predictions = []
references = [[]]
test_source_texts = [pair[0] for pair in test_pairs]
test_target_texts = [pair[1] for pair in test_pairs]

for _ in range(len(test_target_texts)):
    i = np.random.randint(0,len(test_source_texts)-1)
    input_sentence = test_source_texts[i]
    refs = test_target_texts[i]
    translated = decode_sequence(input_sentence)

    translated = translated.replace("[start]","")
    translated = translated.replace("[end]","")

    refs = refs.replace("[start]","")
    refs = refs.replace("[end]","")
    refs = [refs]

    references.append(refs)
    predictions.append(translated)

In [17]:
references.pop(0)
print(len(references) , len(predictions))

443 443


In [18]:
for i in range(10):
    input = test_source_texts[i]
    output = decode_sequence(input).replace("[start]","")
    output = output.replace("[end]","")
    print("input:" + input,"\toutput:" + output)

input:গাড়ি ঠিক মতো ধর ! টল! 	output: গাড়ি ঠিক দক্কেন ধর অনা ঠিক ধর ধর ধর অনা ধর অনা ঠিক ধর ঠিক ধর অয়। 
input:কিন্তু শত্রুপক্ষ আগেই তাকে পেয়ে গেছে। জানিনা । কতক্ষণ ওকে নির্যাতন করেছে ওরা। 	output: হিন্তু তাত্তুনও আগে তারে নিনেই বেত নপাঙ তারা কাবোরচুগোরয়ানি তারে থোগেনেই দিবে। 
input:তো  ? আপনার কোনটাতে আগ্রহ হল  ? Mr ওয়েন? 	output: মুই তরে সারাজীবন হঙ অত্তে mr ওয়েন ওয়েন ওয়েন ওয়েন গোথামের শাহজাদা। 
input:আমি এরকম এক উষ্ণ কেন্দ্র ছিলাম যার চারপাশে এই পার্থিব জীবন শুরু হয়েছে। 	output: মুই কিজু নকনা উচিৎ। 
input:এটা হচ্ছে তোমার চেনা বিশ্ব। 	output: ইয়ান ন অর তুই জমে 
input:ভুল । লেফটেনেন্ট । আপনার লোকেরা সবাই শেষ ! 	output: ভুল মুই লেফটেনেন্ট তরে বানা । 
input:যারা এখানে বসবাস করত মনে হয় । তারা অনেক অনেক আগে মারা গিয়েছে। 	output: যারা ইয়ুত মর গত্তাক মনত ন অয় তুই বজমান বজমান আগে মরঙর যেয়ে। 
input:কিন্তু দ্বিতীয় নাস্তা? 	output: হিন্তু সিয়ান নাস্তা 
input:থামো! 	output: থাম 
input:কোথাকার "আবর্জনা"র সাথে লড়াই! 	output: আর হাজর লগে লড়াই লগে লড়াই 


In [20]:
input = "কোথায়?"
output = decode_sequence(input).replace("[start]","")
output = output.replace("[end]","")
print("input:" + input,"\toutput:" + output)

input:কোথায়? 	output: হুদু 


In [21]:
from itertools import chain
f = open("/content/result_chakma_data.txt","w")
cnt = 1

one_dim_ref = list(chain.from_iterable(references))
for i in range(len(predictions)):
    f.write("ref{}:".format(cnt) + one_dim_ref[i] + "\n")
    f.write("pred{}:".format(cnt) + predictions[i] + "\n")
    cnt += 1

len(predictions)

443

In [None]:
!pip install evaluate bert_score

In [23]:
import evaluate

bleu = evaluate.load('bleu')

print(bleu.compute(predictions=predictions , references=references))

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

{'bleu': 0.20710572694499058, 'precisions': [0.528812230497844, 0.29411764705882354, 0.17237308146399055, 0.11036036036036036], 'brevity_penalty': 0.8880058481868938, 'length_ratio': 0.893833216538192, 'translation_length': 2551, 'reference_length': 2854}


In [24]:
flatten_list = [j for references in references for j in references]
bertscore = evaluate.load("bertscore")
results = bertscore.compute(predictions=predictions, references=flatten_list, model_type="distilbert-base-uncased")

average_f1 = sum(results['f1']) / len(results['f1'])
print("Average F1-score:", average_f1)

Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Average F1-score: 0.9343958132423074


In [None]:
transformer.save_weights("/content/bn_to_chk.h5")

In [None]:
f1 = open("/content/train_bangla_data.txt","w")
f2 = open("/content/train_chakma_data.txt","w")

for i in range(len(train_source_texts)):
    f1.write(train_source_texts[i] + "\n")
    f2.write(train_target_texts[i] + "\n")

print(len(train_source_texts),len(train_target_texts))

7976 7976


In [None]:
print(len(flatten_list),len(references))

443 443
