In [1]:
# Using NLTK package as a toolkit for working with NLP in Python. 
# It provides various text processing libraries with a lot of test datasets.
import nltk
#nltk.download()

# Using the nltk.translate classes AlignedSent, Alignment and IBMModel1 for testing the translation between two language corpora using IBM Model 1.
from nltk.translate import AlignedSent, Alignment, IBMModel1

# Defaultdict is a sub-class of the dictionary class that returns a dictionary-like object.
# Using it for making higher-dimensional dictionaries.
from collections import defaultdict

import json # Extracting data from the .json files

source_language = 'ar' # Tag of the source language (Arabic); used as the key of each sentence pair.
translation_language = 'tr' # Tag of the translation language (Turkish); used as the key of each sentence pair.
FILE = 'ar_tr_pair_small.json' # Data file with the sentence pairs; parsed with json.load.
iterations_num = 2 # Number of iterations to run the EM algorithm.

# Function which takes a list of sentence pairs (a source-language sentence and its translation) as a parameter.

### Returns a defaultdict mapping each language tag (e.g. 'ar' or 'tr') to the set of words seen in that language.

In [2]:
def word_tag(corpus):
    """Collect the vocabulary of every language found in a parallel corpus.

    Parameters
    ----------
    corpus : iterable of dict
        Each entry maps a language tag to a whitespace-separated sentence.

    Returns
    -------
    defaultdict(set)
        Maps each language tag to the set of distinct words seen under that
        tag. A tag whose sentences contain no words gets no entry.
    """
    vocabulary = defaultdict(set)
    for pair in corpus:
        for language, sentence in pair.items():
            tokens = sentence.split()
            # Only touch the entry when there is something to add, so that
            # empty sentences do not create an empty vocabulary for the tag.
            if tokens:
                vocabulary[language].update(tokens)
    return vocabulary

# Function for training the model with EM algorithm with a specified number of iterations

In [3]:
def ibm1_EM(word_tags, iterations=None, verbose=True):
    """Estimate IBM Model 1 translation probabilities t(e|f) with EM.

    ``e`` ranges over the words of the source language (``source_language``)
    and ``f`` over the words of the translation language
    (``translation_language``); both tags are module-level settings.

    Parameters
    ----------
    word_tags : list of dict
        Sentence pairs. Each entry must map ``source_language`` and
        ``translation_language`` to a whitespace-separated sentence.
    iterations : int, optional
        Number of EM iterations. Defaults to the module-level
        ``iterations_num``.
    verbose : bool, optional
        When True (the default) print the per-sentence and per-word trace.

    Returns
    -------
    defaultdict(dict)
        ``table[e][f]`` holds the estimated t(e|f) for every source word
        ``e`` and translation word ``f`` of the corpus vocabulary.

    Notes
    -----
    Sentence pairs in which either side has no words carry no alignment
    evidence and are skipped. A translation word that is only seen in such
    pairs keeps its uniform initial probability instead of triggering a
    division by zero.
    """
    vocabulary = word_tag(word_tags)  # language tag -> set of words
    source_vocab = vocabulary[source_language]
    translation_vocab = vocabulary[translation_language]
    if iterations is None:
        iterations = iterations_num

    # Tokenize once; the sentences do not change between iterations.
    tokenized_pairs = [
        (pair[source_language].split(), pair[translation_language].split())
        for pair in word_tags
    ]

    # Initialize t(e|f) uniformly to 1 / (size of the source vocabulary).
    uniform = 1 / len(source_vocab) if source_vocab else 0.0
    current_prob = defaultdict(dict)
    for word_source in source_vocab:
        for word_translation in translation_vocab:
            current_prob[word_source][word_translation] = uniform

    # Iterations are numbered from 1 in the printed trace.
    for iteration in range(1, iterations + 1):
        count = defaultdict(lambda: defaultdict(float))  # count(e|f)
        total = defaultdict(float)                       # total(f)
        s_total = {}                                     # per-sentence normalizer for e

        if verbose:
            print("Iteration ", "(", iteration, ")")

        for es, fs in tokenized_pairs:
            if not es or not fs:
                # Nothing to align; also avoids reading undefined or stale
                # loop variables in the trace below.
                continue

            # Compute normalization: s_total(e) = sum over f of t(e|f).
            for e in es:
                s_total[e] = sum(current_prob[e][f] for f in fs)
            if verbose:
                # The trace reports the value for the last source word only.
                print("s-total(e):: ", "ar:", es, "||", "tr:", fs, "==> ", s_total[es[-1]])

            # Collect fractional counts.
            for e in es:
                for f in fs:
                    fraction = current_prob[e][f] / s_total[e]
                    count[e][f] += fraction
                    total[f] += fraction
            if verbose:
                # The trace reports the values for the last word pair only.
                print("count(e|f):: ", "ar:", es, "||", "tr:", fs, "==> ", count[es[-1]][fs[-1]])
                print("total(f):: ", "ar:", es, "||", "tr:", fs, "==> ", total[fs[-1]])

        # Estimate probabilities: t(e|f) = count(e|f) / total(f).
        for f in translation_vocab:
            f_total = total[f]
            for e in source_vocab:
                if f_total:
                    current_prob[e][f] = count[e][f] / f_total
                # else: f was never observed in a usable pair; keep the prior.
                if verbose:
                    print("t(e|f):: ", "ar:", e, "||", "tr:", f, "==> ", current_prob[e][f])

        if verbose:
            print("--------------------------------------------------------------------------")

    return current_prob

# Function for acquiring the maximum translation probabilities as a dictionary

In [4]:
def maximum_prob(current_prob):
    """Pick the most probable translation word for every source word.

    Parameters
    ----------
    current_prob : dict of dict
        ``current_prob[source_word][translation_word]`` is a probability.
        Every row is expected to contain the translation words of the first
        row, because the first row's keys are used for all rows.

    Returns
    -------
    dict
        Maps each source word to the translation word with the highest
        probability. Ties go to the word that comes last in the first row's
        key order (the comparison is ``>=``). An empty table yields ``{}``.
    """
    # An empty table has no first row to take the translation words from.
    if not current_prob:
        return {}

    # Translation vocabulary, in the insertion order of the first row.
    translation_key = list(next(iter(current_prob.values())))

    prob_result = {}
    for word_source, row in current_prob.items():
        max_prob = 0.0
        max_translation_word = ""
        for word_translation in translation_key:
            if row[word_translation] >= max_prob:
                max_prob = row[word_translation]
                max_translation_word = word_translation
        prob_result[word_source] = max_translation_word

    return prob_result

# Function for printing the alignment similarly to the IBM nltk model

In [5]:
def alignment(prob_result, word_tags):
    """Print the word alignment of every sentence pair as ``i-j`` index pairs.

    For each source word (position ``i``) the first position ``j`` of its
    chosen translation word in the translation sentence is printed. Source
    words whose chosen translation does not occur in the sentence are
    omitted.

    Parameters
    ----------
    prob_result : dict
        Maps each source word to its chosen translation word.
    word_tags : iterable of dict
        Sentence pairs keyed by ``source_language`` and
        ``translation_language``.
    """
    for number, pair in enumerate(word_tags, start=1):
        print("Source sentence", number, ":")
        source_tokens = pair[source_language].split()
        print(source_tokens)
        print("Translation sentence", number, ":")
        translation_tokens = pair[translation_language].split()
        print(translation_tokens)
        print("Alignment:")

        for i, word_source in enumerate(source_tokens):
            # Position of the first translation token equal to the chosen
            # translation of this source word, or None when it is absent.
            j = next(
                (position
                 for position, word_translation in enumerate(translation_tokens)
                 if word_translation == prob_result[word_source]),
                None,
            )
            if j is not None:
                print("{}-{} ".format(i, j), end="")
        print("\n")

# Function running the Arabic-to-Turkish word alignment with the hand-written model

In [6]:
def arabic_turkish():
    """Run the hand-written IBM Model 1 pipeline on the corpus in ``FILE``.

    The JSON file is loaded as the list of sentence pairs, translation
    probabilities are trained with ``ibm1_EM``, the most probable translation
    of each source word is selected with ``maximum_prob`` and the resulting
    alignments are printed with ``alignment``.
    """
    # Parse the whole parallel corpus from the JSON data file.
    with open(FILE, encoding='utf-8') as corpus_file:
        sentence_pairs = json.load(corpus_file)

    # Train t(e|f) on the sentence pairs.
    translation_table = ibm1_EM(sentence_pairs)

    # Keep only the best translation word for every source word.
    best_translations = maximum_prob(translation_table)

    # Print the alignment of every sentence pair.
    alignment(best_translations, sentence_pairs)

# Main function

In [7]:
def main():
    """Entry point: run the hand-written Arabic-Turkish alignment pipeline."""
    arabic_turkish() # Word alignment between Arabic and Turkish. (My Model)
    #nltk_ibm1() # Same task with NLTK IBM Model 1; nltk_ibm1 is defined in a later cell, so run that cell before enabling this call.

main()

Iteration  ( 1 )
s-total(e)::  ar: ['بالنسبة', 'لي،', 'تبدأ', 'هذه', 'القصة', 'قبل', 'حوالي', '15', 'عاماً', 'مضت'] || tr: ['Benim', 'için', 'bu', 'hikaye', '15', 'sene', 'önce'] ==>  0.060344827586206906
count(e|f)::  ar: ['بالنسبة', 'لي،', 'تبدأ', 'هذه', 'القصة', 'قبل', 'حوالي', '15', 'عاماً', 'مضت'] || tr: ['Benim', 'için', 'bu', 'hikaye', '15', 'sene', 'önce'] ==>  0.14285714285714282
total(f)::  ar: ['بالنسبة', 'لي،', 'تبدأ', 'هذه', 'القصة', 'قبل', 'حوالي', '15', 'عاماً', 'مضت'] || tr: ['Benim', 'için', 'bu', 'hikaye', '15', 'sene', 'önce'] ==>  1.4285714285714282
s-total(e)::  ar: ['عندما', 'كنت', 'طبيباً', 'نزيلاً', 'في', 'جامعة', 'شيكاغو'] || tr: ['Chicago', "Üniversitesi'nde", 'bir', 'darülacaze', 'doktoruyken', 'başladı'] ==>  0.05172413793103449
count(e|f)::  ar: ['عندما', 'كنت', 'طبيباً', 'نزيلاً', 'في', 'جامعة', 'شيكاغو'] || tr: ['Chicago', "Üniversitesi'nde", 'bir', 'darülacaze', 'doktoruyken', 'başladı'] ==>  0.16666666666666666
total(f)::  ar: ['عندما', 'كنت', 'طبيباً',

# Function running NLTK's IBM Model 1 on a small example corpus for comparison

In [8]:
def nltk_ibm1():
    """Train NLTK's IBMModel1 on a tiny hard-coded corpus and print the result.

    Prints the translation-table entry for every (words, mots) word
    combination of each sentence pair, followed by each pair's ``words``,
    ``mots`` and ``alignment`` attributes (the alignment is presumably filled
    in by NLTK during training — see the NLTK documentation).
    """
    print("############### Translation from Turkish to English Using NLTK IBM model 1 ###############")

    # Sentence-aligned parallel corpus: AlignedSent(words, mots).
    bitext = [
        AlignedSent(['yol', 'uzun'], ['الطريق', 'طويل']),
        # AlignedSent(['yol', 'büyük'], ['الطريق', 'كبير']),
        AlignedSent(['dağa', 'yol', 'uzun'], ['الطريق', 'إلى', 'الجبل', 'طويل']),
        # AlignedSent(['dağa', 'geldim'], ['أتيت', 'إلى', 'الجبل']),
        # AlignedSent(['dağa', 'tırmandım'], ['تسلقت', 'الجبل']),
    ]

    # Second argument is the number of training iterations.
    model = IBMModel1(bitext, 5)

    # Print the table entry for every word combination of each sentence pair,
    # iterating over ``mots`` in the outer loop and ``words`` in the inner one.
    for sentence in bitext:
        for mot in sentence.mots:
            for word in sentence.words:
                print("t(e|f):: ", "f:", word, "||", "e:", mot, "==> ", model.translation_table[word][mot])
    print("\n")

    # Print the alignment stored on every sentence pair.
    for number, sentence in enumerate(bitext, start=1):
        print("Source sentence", number, ":")
        print(sentence.words)

        print("Translation sentence", number, ":")
        print(sentence.mots)

        print("Alignment:")
        print(sentence.alignment, '\n')