In [None]:
# Statistical Machine Translation System
# Myanmar to Thai

# IBM Model 1 for Word Translation Task
# Word Alignment based on Relative Positions
# Bi-gram Language Modelling with Laplace Smoothing and Backoff

In [None]:
import pickle

In [None]:
# One list of tokenized sentences per (language, split) pair; keys look like
# 'my_train' or 'th_test' ("my" = Myanmar, "th" = Thai).
tokenized_stores = {
    lang + "_" + split: []
    for lang in ("my", "th")
    for split in ("train", "dev", "test")
}

In [None]:
# Each corpus file is expected at "/content/<split>.<lang>", e.g. the key
# 'my_train' is read from "/content/train.my". Adjust the "/content/" prefix
# below if the data files live somewhere else.

for key in tokenized_stores:
    file_name = "/content/" + str(key)[3:] + "." + str(key)[0:2]
    # Read as UTF-8 explicitly (the corpora are Myanmar/Thai text) and let the
    # context manager close the handle even if reading fails.
    with open(file_name, encoding="utf-8") as load:
        sentences = load.read().split('\n')

    # One sentence per line, tokens separated by single spaces. If the file
    # ends with a newline, the last entry is [''] (kept so sentence counts and
    # indices stay aligned across the two languages).
    # Assigning (instead of appending) keeps the cell idempotent on re-run.
    tokenized_stores[key] = [sentence.split(' ') for sentence in sentences]

In [None]:
# Spot-check: token list of the third sentence (index 2) of the Thai training split
print(tokenized_stores['th_train'][2])

['มัน', 'ยาก', 'สำหรับ', 'เรา']


In [None]:
# Spot-check: token list of the third sentence (index 2) of the Myanmar training split
print(tokenized_stores['my_train'][2])

['အဲ့ဒါ', 'ကျွန်တော်တို့', 'အတွက်', 'ခက်ခဲတယ်', '။']


In [None]:
# Number of sentences in each split, measured on the Myanmar side
train_size, dev_size, test_size = (
    len(tokenized_stores['my_' + split]) for split in ('train', 'dev', 'test')
)

In [None]:
# Build per-language word frequency tables over every split (train, dev, test)

my_words = {}
th_words = {}

for key, sentences in tokenized_stores.items():
    # keys starting with 'm' hold Myanmar text; all other keys hold Thai text
    frequency = my_words if key[0] == 'm' else th_words
    for sentence in sentences:
        for word in sentence:
            frequency[word] = frequency.get(word, 0) + 1

# vocabulary size = number of distinct tokens per language
my_vocab = len(my_words)
th_vocab = len(th_words)
print("Number of Unique Words:")
print("> Myanmar :", str(my_vocab))
print("> Thai :", str(th_vocab))

Number of Unique Words:
> Myanmar : 16586
> Thai : 6824


In [None]:
# Translation table.
# usage: t[('my_word', 'th_word')] = probability of my_word given th_word
t = dict()
# Starting probability assigned to a pair the first time it is encountered
# during training: one over the number of possible (Myanmar, Thai) word pairs.
pair_space = my_vocab * th_vocab
uniform = 1 / pair_space

In [None]:
# IBM Model 1 training (expectation-maximisation) of the translation table t.
n_iters = 0
max_iters = 25

# fine_tune == 0: re-estimate every (my_word, th_word) pair seen together in training
# fine_tune == 1: re-estimate only pairs whose two words are both among the
#                 `max_words` most frequent words of their language
fine_tune = 1
max_words = 1000
# per-mode convergence threshold on the absolute change of a single t entry
tolerance = {0: 0.01, 1: 0.005}
has_converged = False


def _most_frequent(word_counts, limit):
    """Return the set of the `limit` highest-count words (ties: larger word first)."""
    ranked = sorted(word_counts.items(), key=lambda k: (k[1], k[0]), reverse=True)
    return {word for word, _ in ranked[:limit]}


# The frequency ranking never changes between iterations, so compute it once
# instead of re-sorting the vocabularies inside the training loop.
frequent_my = _most_frequent(my_words, max_words)
frequent_th = _most_frequent(th_words, max_words)

while n_iters < max_iters and not has_converged:
    has_converged = True
    # largest absolute change of any re-estimated entry in this iteration
    # (tracked for every updated pair, so a converged iteration reports the
    # real value rather than a sentinel)
    max_change = 0.0
    n_iters += 1

    # ---- E-step: collect fractional counts over the training sentence pairs
    count = {}  # (my_word, th_word) -> expected co-translation count
    total = {}  # th_word -> sum of expected counts over all my_words
    for index in range(train_size):
        my_sentence = tokenized_stores['my_train'][index]
        th_sentence = tokenized_stores['th_train'][index]

        # per-sentence normaliser for each Myanmar word
        s_total = {}
        for my_word in my_sentence:
            s_total[my_word] = 0
            for th_word in th_sentence:
                # unseen pairs start from the uniform probability
                s_total[my_word] += t.setdefault((my_word, th_word), uniform)

        for my_word in my_sentence:
            for th_word in th_sentence:
                pair = (my_word, th_word)
                fraction = t[pair] / s_total[my_word]
                count[pair] = count.get(pair, 0) + fraction
                total[th_word] = total.get(th_word, 0) + fraction

    # ---- M-step: estimating the probabilities
    # Every key of `count` is a pair that co-occurred in training, and its
    # th_word is guaranteed to be in `total` (both are filled together above).
    if fine_tune in tolerance:
        threshold = tolerance[fine_tune]
        for (my_word, th_word), pair_count in count.items():
            if fine_tune == 1 and (my_word not in frequent_my or th_word not in frequent_th):
                # outside the frequent-word window: keep the current value
                continue
            new_value = pair_count / total[th_word]
            change = abs(t[(my_word, th_word)] - new_value)
            if change > threshold:
                has_converged = False
            max_change = max(max_change, change)
            t[(my_word, th_word)] = new_value

    print("Iteration " + str(n_iters) + " Completed, Maximum Change: " + str(max_change))


Iteration 1 Completed, Maximum Change: 0.16918382870330934
Iteration 2 Completed, Maximum Change: 0.0692329591471128
Iteration 3 Completed, Maximum Change: 0.032508388875689076
Iteration 4 Completed, Maximum Change: 0.022811301847502463
Iteration 5 Completed, Maximum Change: 0.018696638366326457
Iteration 6 Completed, Maximum Change: 0.016098406826624834
Iteration 7 Completed, Maximum Change: 0.013927027679601522
Iteration 8 Completed, Maximum Change: 0.012092786373564701
Iteration 9 Completed, Maximum Change: 0.010531428832179723
Iteration 10 Completed, Maximum Change: 0.009195383603660234
Iteration 11 Completed, Maximum Change: 0.008047366160447433
Iteration 12 Completed, Maximum Change: 0.007057065545159058
Iteration 13 Completed, Maximum Change: 0.006199503081309565
Iteration 14 Completed, Maximum Change: 0.005716312529534598
Iteration 15 Completed, Maximum Change: 0.00541558747879034
Iteration 16 Completed, Maximum Change: 0.005124525308382602
Iteration 17 Completed, Maximum Chang

In [None]:
# Print the `limit` translation pairs with the highest probability
# (ties are ordered by the pair itself, descending).
limit = 40
ranked_pairs = sorted(t.items(), key=lambda item: (item[1], item[0]), reverse=True)
for shown, element in enumerate(ranked_pairs, start=1):
    print(element)
    if shown >= limit:
        break

(('ကို', 'ကို'), 0.8551800022867546)
(('ဘယ်လောက်', 'ဘယ်လောက်'), 0.7457159069049341)
(('မ', 'မ'), 0.7444588707539402)
(('သလဲ', 'သလဲ'), 0.7330023235457133)
(('မင်း', 'မင်း'), 0.730701209305113)
(('ကျွန်တော်', 'ကျွန်တော်'), 0.7206821276466951)
(('ခင်ဗျား', 'ခင်ဗျား'), 0.7055033315268444)
(('လဲ', 'လဲ'), 0.7022385908737969)
(('အဲ့ဒါ', 'အဲ့ဒါ'), 0.6956404606617393)
(('သူ', 'သူ'), 0.6922926133678208)
(('မှာ', 'မှာ'), 0.6912667719269686)
(('ဘူးလား', 'ဘူးလား'), 0.6839729792710393)
(('သူမ', 'သူမ'), 0.6570930877951112)
(('ရေဒီယို', 'วิทยุ'), 0.6422682076536538)
(('၊', ','), 0.6397392884133926)
(('သူတို့', 'သူတို့'), 0.6388409381711743)
(('တွေ', 'တွေ'), 0.6384768649720629)
(('။', ''), 0.6355915451937)
(('ဖို့', 'ဖို့'), 0.6346974727218071)
(('လုပ်', 'လုပ်'), 0.627662274838754)
(('ဘာ', 'ဘာ'), 0.6247311526075122)
(('အတွက်', 'สำหรับ'), 0.620955276597172)
(('ဘူး', 'ဘူး'), 0.6191945447191133)
(('ပါဘူး', 'ပါဘူး'), 0.6167823236497844)
(('နဲ့', 'နဲ့'), 0.6158618268303384)
(('ဘယ်သူ့', 'ဘယ်သူ့'), 0.61476389

In [None]:
# saving the translation model
# The context manager guarantees the file is flushed and closed even if
# pickling raises part-way through.
with open("translation_model.pkl", "wb") as file:
    pickle.dump(t, file)

In [None]:
# using the model trained until convergence
# to use a saved model
model_name = "translation_model.pkl"
# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# model files that you produced yourself or otherwise trust.
with open(model_name, "rb") as pickle_in:
    t = pickle.load(pickle_in)

In [None]:
# I[length][jump] = [sum of t over word pairs at that jump, number of such pairs]
# where `length` is the Myanmar sentence length and `jump` is the Thai token
# position minus the Myanmar token position.
I = {}
for index in range(train_size):
    my_sentence = tokenized_stores['my_train'][index]
    length = len(my_sentence)
    for my_pos, my_word in enumerate(my_sentence):
        by_jump = I.setdefault(length, {})
        for th_pos, th_word in enumerate(tokenized_stores['th_train'][index]):
            jump = th_pos - my_pos
            pair_prob = t[(my_word, th_word)]
            if jump in by_jump:
                by_jump[jump][0] += pair_prob
                by_jump[jump][1] += 1
            else:
                by_jump[jump] = [pair_prob, 1]

In [None]:
# List, in ascending order, the sentence lengths that have positional statistics
# (dictionary keys are already unique, so sorting them is all that is needed).
sentence_lengths = sorted(I.keys())
print(sentence_lengths)

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 29, 30, 33, 39]


In [None]:
# Alignment probabilities derived from the positional statistics:
# p[length][jump] = mean t value at that jump, normalised so that the values
# for one sentence length sum to one.

p = {}
for length, by_jump in I.items():
    averages = {}
    normaliser = 0
    for jump, (t_sum, n_pairs) in by_jump.items():
        averages[jump] = t_sum / n_pairs
        normaliser += averages[jump]
    p[length] = {jump: average / normaliser for jump, average in averages.items()}

In [None]:
# Report training pairs where a one-token Myanmar sentence is paired with a
# Thai sentence that is more than 10 tokens longer.
for index in range(train_size):
    length_my = len(tokenized_stores['my_train'][index])
    length_th = len(tokenized_stores['th_train'][index])
    if length_my != 1 or length_th - length_my <= 10:
        continue
    print("Length of Myanmar Sentence:", str(length_my))
    print("Length of Thai Sentence:", str(length_th))

# there exists a Myanmar sentence with one token s.t. the Thai translation contains 19 tokens

In [None]:
# For every sentence length, remember the jump with the highest alignment
# probability (first one wins on ties; 0 if no jump is recorded).
init = {
    length: max(jumps, key=jumps.get, default=0)
    for length, jumps in p.items()
}

In [None]:
!pip install nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Thai language-model counts
bigrams = {}   # (previous_token, token) -> occurrences
unigrams = {}  # token -> occurrences


def model(dataset_size, dataset_name):
    """Add unigram and bigram counts from one tokenized split to the global tables.

    Reads the first `dataset_size` sentences of tokenized_stores[dataset_name]
    and updates `unigrams` and `bigrams` in place. The first token of every
    sentence is paired with the empty string '' as its predecessor.
    """
    for index in range(dataset_size):
        previous = ''
        for token in tokenized_stores[dataset_name][index]:
            unigrams[token] = unigrams.get(token, 0) + 1
            pair = (previous, token)
            bigrams[pair] = bigrams.get(pair, 0) + 1
            previous = token

# Accumulate counts from the Thai training split, then the Thai dev split
for split_size, split_name in ((train_size, 'th_train'), (dev_size, 'th_dev')):
    model(split_size, split_name)

# number of distinct bigrams / unigrams observed
bigram_count = len(bigrams)
unigram_count = len(unigrams)
print("Number of Unique Bigrams:", bigram_count)
print("Number of Unique Unigrams:", unigram_count)

Number of Unique Bigrams: 33708
Number of Unique Unigrams: 6633


In [None]:
from itertools import permutations
import nltk

# Per-sentence records: [source tokens, reference tokens, best sequence, BLEU scores]
computed_sentences = []
# Running BLEU totals, keyed by the NLTK smoothing method number
total_BLEU = dict.fromkeys((1, 2, 3, 4, 5, 7), 0)
# Number of evaluated sentences whose method-1 BLEU score is exactly zero
null_BLEU_count = 0

# Translation pairs ordered from most to least probable (ties: larger pair first)
sorted_t = sorted(t.items(), key=lambda item: (item[1], item[0]), reverse=True)

# Lookup built lazily from sorted_t on the first default-table call, so each
# translation is a dictionary access instead of a scan over the whole table.
_best_translation = None


def _build_translation_lookup(table):
    """Map each lower-cased source word to the target of its first entry in `table`."""
    lookup = {}
    for (source_word, target_word), _prob in table:
        # setdefault keeps the first (i.e. highest-ranked) target per source word
        lookup.setdefault(source_word.lower(), target_word)
    return lookup


def find_translation(en_token, table=None):
    """Return the top-ranked translation of a source token, or "" if there is none.

    Parameters:
        en_token: source-language token to translate. It is compared against
            the lower-cased source words of the table, exactly as given.
        table: optional iterable of ((source_word, target_word), probability)
            entries ordered from best to worst. Defaults to the module-level
            `sorted_t`, whose lookup is built once and reused across calls.

    Returns:
        The target word of the first table entry whose lower-cased source word
        equals `en_token`, or the empty string when no entry matches.
    """
    global _best_translation
    if table is not None:
        return _build_translation_lookup(table).get(en_token, "")
    if _best_translation is None:
        _best_translation = _build_translation_lookup(sorted_t)
    return _best_translation.get(en_token, "")

def get_prob(seq):
    """Score a token sequence with the bigram counts, backing off to unigrams.

    Sequences with fewer than two tokens score 1. Otherwise the score is the
    SUM of one term per token:
      * seen bigram (previous, token): add-one smoothed count divided by
        (count of previous + number of distinct unigrams);
      * unseen bigram but known token: unigram count divided by the number of
        distinct unigrams;
      * unknown token: contributes nothing and is not used as the predecessor
        of the next token.

    NOTE(review): the terms are added rather than multiplied, and the unigram
    term divides by the number of distinct tokens rather than the total token
    count, so the result is a ranking score, not a probability — confirm this
    is intended before reusing it elsewhere.
    """
    if len(seq) < 2:
        return 1

    score = 0
    previous = ''
    for current in seq:
        pair = (previous, current)
        if pair in bigrams:
            score += (bigrams[pair] + 1) / (unigrams.get(previous, 0) + unigram_count)
        elif current in unigrams:
            score += unigrams[current] / unigram_count
        else:
            # unknown token: skip it without advancing the predecessor
            continue
        previous = current
    return score

# Translate every test sentence of 2 to 8 tokens, pick the word order with the
# best language-model score, and score it with BLEU under several smoothings.
smoothing_methods = (1, 2, 3, 4, 5, 7)  # same order as the keys of total_BLEU

count = 0
for index in range(test_size):
    source_tokens = tokenized_stores['my_test'][index]
    if not 2 <= len(source_tokens) <= 8:
        continue
    reference = tokenized_stores['th_test'][index]

    # word-by-word translation; tokens without a translation are dropped
    translated_words = [
        word for word in map(find_translation, source_tokens) if word != ""
    ]

    # exhaustive search over all orderings of the translated words
    best_seq = translated_words
    best_prob = -1
    for seq in permutations(translated_words):
        prob = get_prob(seq)
        if prob > best_prob:
            best_prob, best_seq = prob, seq

    # Collecting BLEU_scores with various kinds of Smoothing
    BLEU_scores = [
        nltk.translate.bleu_score.sentence_bleu(
            [reference],
            best_seq,
            smoothing_function=getattr(
                nltk.translate.bleu_score.SmoothingFunction(), "method" + str(method)
            ),
        )
        for method in smoothing_methods
    ]

    for position, method in enumerate(smoothing_methods):
        total_BLEU[method] += BLEU_scores[position]

    if BLEU_scores[0] == 0:
        null_BLEU_count += 1

    count += 1
    print("Sentence Index: ", str(count))
    print("Myanmar Sentence:", str(source_tokens))
    print("Reference Thai Sentence:", str(reference))
    print("Translated Sentence:", str(best_seq))
    print("Translation BLEU Scores", str(BLEU_scores))
    print()

    computed_sentences.append([source_tokens, reference, best_seq, BLEU_scores])

# number of test sentences actually evaluated
tested = count

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Translation BLEU Scores [0, 0, 0, 0, 0, 0]

Sentence Index:  632
Myanmar Sentence: ['သူတို့ရဲ့', 'နမော်နမဲ့နိုင်မှု', 'ကြောင့်', 'သူတို့', 'စာမေးပွဲ', 'ကျ', 'ခဲ့ကြတယ်', '။']
Reference Thai Sentence: ['เนื่องจาก', 'ความ', 'ฟุ้งซ่าน', 'ของ', 'พวกเขา', 'พวกเขา', 'ทำ', 'ให้การ', 'สอบ', 'ของ', 'พวกเขา']
Translated Sentence: ('สอบ', 'ผัก', 'สถานการณ์', 'သူတို့', 'น้ำตา', 'แรก')
Translation BLEU Scores [0.01774239756616722, 0.08389861810900508, 0.03527502360630137, 0.016338026308907974, 0.037172650766057885, 0.04838439061032186]

Sentence Index:  633
Myanmar Sentence: ['ပါးစပ်', 'ပလုတ်ပလောင်း', 'နဲ့', 'စကား', 'မ', 'ပြော', 'နဲ့', '။']
Reference Thai Sentence: ['อย่า', 'พูด', 'ด้วย', 'ปาก']
Translated Sentence: ('ไหม', 'နဲ့', 'คำพูด', 'ပြော', 'မ', 'နဲ့')
Translation BLEU Scores [0, 0, 0, 0, 0, 0]

Sentence Index:  634
Myanmar Sentence: ['သူတို့', 'မင်း', 'ကို', 'မေး', 'ချင်', 'ကြမှာ', '။']
Reference Thai Sentence: ['พวกเขา', 'จะ',

In [None]:
# Results:
import statistics


def _average(total, n_samples):
    """Return total / n_samples, or NaN when there are no samples to average over."""
    return total / n_samples if n_samples else float("nan")


print("Number of Samples Tested Upon: " + str(tested))
print()

print("Average BLEU Score using Various Smoothing Functions (considering all test samples)")
for key in total_BLEU:
    print("Method " + str(key) + ": " + str(_average(total_BLEU[key], tested)))
print()
print("Average BLEU Score using Various Smoothing Functions (considering test samples with at-least one word overlap)")
# samples whose method-1 BLEU is zero are excluded from this second average;
# the denominator can be zero if no sample was tested or none scored above zero
overlapping = tested - null_BLEU_count
for key in total_BLEU:
    print("Method " + str(key) + ": " + str(_average(total_BLEU[key], overlapping)))

Number of Samples Tested Upon: 1464

Average BLEU Score using Various Smoothing Functions (considering all test samples)
Method 1: 0.03006521281592783
Method 2: 0.12545505795636563
Method 3: 0.059744913918472316
Method 4: 0.024065898832130665
Method 5: 0.04594641597317043
Method 7: 0.061441180334108496

Average BLEU Score using Various Smoothing Functions (considering test samples with at-least one word overlap)
Method 1: 0.05322306113968361
Method 2: 0.22208730936894716
Method 3: 0.1057636686537406
Method 4: 0.04260275198336069
Method 5: 0.08133682343980836
Method 7: 0.10876649094212193
