# IBM model 1 by SUBRAMANYAM SAHOO


In [56]:
import numpy as np
import pandas as pd
import math
from nltk.tokenize import word_tokenize
from collections import defaultdict
from nltk.translate.metrics import alignment_error_rate
from nltk.metrics.scores import (precision, recall)
from operator import itemgetter
import matplotlib.pyplot as plt
%matplotlib inline

In [26]:
#DATA
# Read every corpus split in full and cut it into raw sentences, one per line.
# Each file is opened in a context manager so its handle is closed as soon as
# the text has been read (the handles were previously left open).
# Note: splitting on "\n" yields a trailing empty sentence whenever a file
# ends with a newline.
with open('./train.en', 'r', encoding="utf8") as file:
    train_en = file.read()
raw_sentences_train_en = train_en.split("\n")

with open('./train.hi', 'r', encoding="utf8") as file:
    train_hi = file.read()
raw_sentences_train_hi = train_hi.split("\n")

with open('./test.en', 'r', encoding="utf8") as file:
    test_en = file.read()
raw_sentences_test_en = test_en.split("\n")

with open('./test.hi', 'r', encoding="utf8") as file:
    test_hi = file.read()
raw_sentences_test_hi = test_hi.split("\n")

with open('./dev.en.txt', 'r', encoding="utf8") as file:
    dev_en = file.read()
raw_sentences_dev_en = dev_en.split("\n")

with open('./dev.hi', 'r', encoding="utf8") as file:
    dev_hi = file.read()
raw_sentences_dev_hi = dev_hi.split("\n")

In [27]:
def add_null(data):
    """Return a new list with ``"NULL "`` prepended to every sentence.

    Args:
        data: iterable of sentence strings.

    Returns:
        A list of the same length and order as ``data``; the input is not
        modified.
    """
    return ["NULL " + sentence for sentence in data]

In [28]:
# Prefix every raw Hindi sentence of each split with the "NULL " token.
sentences_dev_hi = add_null(raw_sentences_dev_hi)
sentences_test_hi = add_null(raw_sentences_test_hi)
sentences_train_hi = add_null(raw_sentences_train_hi)

In [29]:
def make_lower_case(data):
    """Lower-case every sentence and prepend ``"NULL "`` to it.

    Despite its name, this helper does both steps: each output string is
    ``"NULL "`` followed by the lower-cased input sentence.

    Args:
        data: iterable of sentence strings.

    Returns:
        A new list with one transformed string per input sentence, in order.
    """
    return ["NULL " + text.lower() for text in data]

In [30]:
# Lower-case the English side of each split; make_lower_case also prepends
# the "NULL " token to every sentence.
sentences_dev_en = make_lower_case(raw_sentences_dev_en)
sentences_test_en = make_lower_case(raw_sentences_test_en)
sentences_train_en = make_lower_case(raw_sentences_train_en)

In [31]:
def is_converged(new, old, epoch):
    """Report whether training should stop, based on the epoch count alone.

    No probability comparison is performed: ``new`` and ``old`` are accepted
    but never inspected, so this is a fixed epoch cap rather than a true
    convergence test.

    Args:
        new: current translation table (unused).
        old: previous translation table (unused).
        epoch: number of the epoch that has just finished.

    Returns:
        ``False`` while ``epoch`` is below 15, ``True`` otherwise.
    """
    max_epochs = 15
    return not epoch < max_epochs

In [32]:
def perform_EM(en_sentences, hi_sentences):
    """Estimate word translation probabilities with EM over a parallel corpus.

    Sentences are tokenised by splitting on a single space. The table is
    keyed by ``(hin_word, eng_word)``; after every pass each entry is the
    expected count of that pair divided by the total expected count of
    ``eng_word``. Passes repeat until ``is_converged`` returns true, and the
    epoch number is printed at the start of each pass.

    Args:
        en_sentences: indexable sequence of English sentence strings.
        hi_sentences: iterable of Hindi sentence strings; sentence ``i`` is
            paired with ``en_sentences[i]``.

    Returns:
        A ``defaultdict`` mapping ``(hin_word, eng_word)`` to its estimated
        probability. Unseen pairs default to the small initial value.

    Raises:
        IndexError: if ``en_sentences`` is shorter than ``hi_sentences``.
    """
    uni_ini = 0.00001

    # Every pair starts from the same small value; only the ratios matter
    # because each pass normalises them.
    translation_prob = defaultdict(lambda: float(uni_ini))
    translation_prob_prev = defaultdict(float)

    epoch = 0

    while True:

        epoch += 1
        print("epoch num:", epoch,"\n")
        count = defaultdict(float)
        total = defaultdict(float)
        s_total = defaultdict(float)

        for index_sen, hin_sen in enumerate(hi_sentences):
            # Tokenise both sides once per sentence pair instead of once per
            # Hindi word.
            hin_sen_words = hin_sen.split(" ")
            eng_sen_words = en_sentences[index_sen].split(" ")

            # compute normalization
            for hin_word in hin_sen_words:
                s_total[hin_word] = 0
                for eng_word in eng_sen_words:
                    s_total[hin_word] += translation_prob[(hin_word, eng_word)]

            # collect counts
            for hin_word in hin_sen_words:
                normalizer = s_total[hin_word]
                for eng_word in eng_sen_words:
                    fraction = translation_prob[(hin_word, eng_word)] / normalizer
                    count[(hin_word, eng_word)] += fraction
                    total[eng_word] += fraction

        # estimate probabilities (values of existing keys are replaced in
        # place; no key is added while iterating)
        for pair in translation_prob:
            translation_prob[pair] = count[pair] / total[pair[1]]

        if is_converged(translation_prob, translation_prob_prev, epoch):
            break

        # Keep a snapshot rather than a second reference: the table is
        # updated in place, so an alias would always equal the current table.
        translation_prob_prev = translation_prob.copy()

    return translation_prob


In [33]:
def train_model(sentences_train_en, sentences_train_hi):
    """Train the translation table on a parallel corpus.

    Thin wrapper that forwards both sentence lists to ``perform_EM`` and
    returns its result unchanged.
    """
    return perform_EM(sentences_train_en, sentences_train_hi)


In [34]:
# Train the (hin_word, eng_word) translation table on the training split;
# perform_EM prints one "epoch num" line per pass.
tef = train_model(sentences_train_en, sentences_train_hi)

epoch num: 1 

epoch num: 2 

epoch num: 3 

epoch num: 4 

epoch num: 5 

epoch num: 6 

epoch num: 7 

epoch num: 8 

epoch num: 9 

epoch num: 10 

epoch num: 11 

epoch num: 12 

epoch num: 13 

epoch num: 14 

epoch num: 15 



In [35]:
# Print the 200 highest-valued entries of the learned table, best first.
# Keys of `tef` are (hin_word, eng_word) pairs, so each label reads
# P(hin_word|eng_word).
ranked_entries = sorted(tef.items(), key=itemgetter(1), reverse=True)
for (hin_word, eng_word), prob in ranked_entries[:200]:
    print("{:<20}{:>20.2}".format("P(%s|%s)" %(hin_word, eng_word), prob))

P(तक|till)                           1.0
P(शरीर|body)                        0.99
P(तनाव|tension)                     0.99
P(,|,)                              0.99
P(सेरीब्रम|cerebrum)                0.99
P(पेट|stomach)                      0.99
P(15|15)                            0.99
P(या|or)                            0.98
P(कैंसर|cancer)                     0.98
P(कम|less)                          0.98
P(स्तर|level)                       0.98
P(मस्तिष्क|brain)                   0.98
P(मानसिक|mental)                    0.97
P(ग्राम|grams)                      0.97
P(भारत|india)                       0.97
P(आधार|basis)                       0.97
P(संक्रमित|infected)                0.97
P(दो|two)                           0.97
P(पानी|water)                       0.96
P(भी|also)                          0.96
P(उपलब्ध|available)                 0.96
P(नमक|salt)                         0.96
P(नाम|name)                         0.96
P(शारीरिक|physical)                 0.95
P(3|3)          

In [36]:
def translate_sentence(sentence, tef, file):
    """Translate a Hindi sentence word by word and record the result.

    The sentence is split on single spaces. Each token is replaced by the
    English word with the highest value among the ``tef`` entries whose key
    starts with that token; ties keep the entry met first in ``tef``'s
    iteration order. A token with no entry becomes an empty string. Both the
    input and the translation are printed, and the translation plus a
    newline is written to ``file``.

    Args:
        sentence: Hindi sentence string.
        tef: mapping from ``(hin_word, eng_word)`` to a probability.
        file: object with a ``write`` method that receives the translation.

    Returns:
        The English words joined by single spaces.
    """
    print("hin:",sentence)

    tokens = sentence.split(" ")

    # One pass over the table per sentence instead of one per token.
    # The -1 starting score means only values above -1 can be selected.
    best = {token: (-1, "") for token in tokens}
    for key, prob in tef.items():
        current = best.get(key[0])
        # Strict ">" keeps the first entry seen when values tie.
        if current is not None and prob > current[0]:
            best[key[0]] = (prob, key[1])

    eng_sentence = " ".join(best[token][1] for token in tokens)
    print("eng:", eng_sentence)
    file.write(eng_sentence)
    file.write("\n")

    return eng_sentence


In [37]:
def test_model(dataset, tef):
    """Translate every sentence in ``dataset`` and save the translations.

    Each sentence is passed to ``translate_sentence``, which writes one line
    per sentence to ``dev_translations.txt``. That file is truncated first
    and written as UTF-8, matching how the corpus files are read. The
    context manager closes it even if a translation raises.

    Args:
        dataset: iterable of Hindi sentence strings.
        tef: mapping from ``(hin_word, eng_word)`` to a probability.

    Returns:
        List of translated sentences, in input order.
    """
    with open("dev_translations.txt", 'w+', encoding="utf8") as file:
        return [translate_sentence(sentence, tef, file) for sentence in dataset]

In [38]:
# Translate the test-split Hindi sentences with the learned table. The raw
# sentences are passed, i.e. without the "NULL " prefix used for training.
predicted_translations = test_model(raw_sentences_test_hi, tef)

hin: ताजा साँसें और चमचमाते दाँत आपके व्यक्तित्व को निखारते हैं ।
eng: freshly breathing and  teeth your personality responsible  are .
hin: दाँतों से आपका आत्मविश्‍वास भी बढ़ता है ।
eng: teeth from ease  also malfunction is .
hin: हमारे मसूढ़ों और दाँतों के बीच बैक्टीरिया मौजूद होते हैं ।
eng: our  and teeth near between bacteria present dried are .
hin: ये दाँतों को गंदा और साँसों को बदबूदार बना देते हैं ।
eng: 2-3 teeth responsible dirty and  responsible  makes advises are .
hin: यहाँ दिए कुछ आसान नुस्खों की मदद से आप अपने दाँतों को स्वच्छ और साँसों को ताजा रख सकते हैं ।
eng: here glaxosmithkline some easy  comprehensive helps from you 's teeth responsible clean and  responsible freshly pillow broken are .
hin: दाँतों को ठीक से साफ करें ।
eng: teeth responsible cured from clean nets .
hin: दाँतों को ठीक से साफ करने में दो से तीन मिनट का समय लगता है ।
eng: teeth responsible cured from clean doing in two from three minutes original time starts is .
hin: लेकिन ज्यादातर लोग इसके लिए एक 

eng: fever comprehensive tested nets and other aforesaid tested near post only  proved dry can is .
hin: मेटासिन की गोली देने से बुखार उतर जाता है और फिर 103 तक बुखार चढ़ जाता है ।
eng:  comprehensive natrum giving from fever bhang capsular is and else  till fever  capsular is .
hin: एक्सरे , टीसी-डीसी नार्मल है ।
eng: x-ray ,   is .
hin: चार - चार घंटे पर फीवर को मापें ।
eng: four - four hours on  responsible  .
hin: अगर 100 से ज्यादा फीवर हो तो पैरासीटामोल की गोली दें और किसी औषधि विशेषज्ञ चिकित्सक से परामर्श लें ।
eng: cautious 100 from more  dry then  comprehensive natrum switch and any stores specialist doctors from my take .
hin: एच.आई.वी. क्या है ?
eng: hiv what is what
hin: यह कैसे होता है ?RD_PUNC
eng: this how happens is 
hin: बचाव के उपाय बताएँ ?
eng: prevention near solution  what
hin: यह एक प्रकार का वायरस होता है ।
eng: this one type original virus happens is .
hin: इसका फैलाव मुख्यत: असुरक्षित यौन संबंध , संक्रमित सूई , संक्रमित रक्त व माँ से बच्चों में होता है ।
eng: re

eng: central comprehensive  in some material only unmarried is .
hin: उस छोटे से देश में मानो इतनी खूबसूरती समाती नहीं ।
eng: ovary small from country in  omniavailable   no .
hin: बाजार की उस सैर से वापस अपने क्रूज पर जाने के लिए जेट्‍टी की तरफ लौटते हुए , जब निगाहें बरबस सड़क के साथ-साथ चलते समंदर के पानी में गई तो अहसास हुआ कि यहाँ के लोग इस खूबसूरती की कीमत किस हद तक समझते हैं ।
eng: market comprehensive ovary routine from burps 's  on consequence near order  comprehensive cheeks  running , when   road near beautiful congenital  near water in hypertension then  happened that here near people this  comprehensive  dilemma extent till  are .
hin: पानी इतना साफ था कि लैंपपोस्ट की रोशनी में नीचे कई मीटर गहराई में तैरती मछलियाँ अपने सारे रंगों के साथ दिखाई दे रही थीं ।
eng: water endurance clean had that  comprehensive sight in below many metre depths in   's entire  near along visible pass proving seventeenth .
hin: यह समंदर का वो हिस्सा था जो शहर से एकदम सटा हुआ था ।
eng: this  original

eng: many times then estimate that now  .
hin: किसी तरह सुबह 8 बजे गजरौला पहुँचे ।
eng: any sticky morning 8 secure   .
hin: खैर हम 12 बजे रामनगर पहुँच गए ।
eng:  we 12 secure  harmed went .
hin: कार्बेट पार्क रामनगर से 20 किलोमीटर आगे है ।
eng:    from 20  front is .
hin: हमारा रिजॉर्ट पार्क के नजदीक ढिकुली में था , जिसकी बुकिंग हमने पहले ही करवा दी थी ।
eng: crore   near nearby  in had , decrement  origin before only heard given 2000 .
hin: उस दिन कुछ खास कार्यक्रम था नहीं हम ठंड होने के बावजूद पहले कोसी के किनारे घूमने गए ।
eng: ovary day some unmarried program had no we sunlight occurring near  before  near sides  went .
hin: दो - तीन घंटे वहाँ बिताने के बाद हम कार्बेट म्यूजियम निकल गए , जो बड़ी मुश्‍किल 2 किमी दूर धांगड़ी गेट के नजदीक था ।
eng: two - three hours central spend near after we   unidirectional went , whatever big  2  away   near nearby had .
hin: वहाँ से हम नीचे रामनगर घूमने चले गए ।
eng: central from we below   links went .
hin: लौटते देर शाम हो गई ।
eng:  late evening

In [41]:
from nltk.translate.ibm1 import IBMModel1
from nltk.translate import AlignedSent


def get_text(filename):
    """Read a UTF-8 text file into a list of token lists.

    Args:
        filename: path of the file to read.

    Returns:
        One list per line of the file, holding that line's
        whitespace-separated tokens (an empty list for a blank line).
    """
    with open(filename, 'r', encoding="utf8") as handle:
        return [line.split() for line in handle]

# Load the tokenised English and Hindi test files and pair them up line by
# line: the i-th English sentence goes with the i-th Hindi sentence.
src_sentences = get_text('english test.txt')
trg_sentences = get_text('hindi test.txt')

bitext = [
    AlignedSent(src_words, trg_sentences[position])
    for position, src_words in enumerate(src_sentences)
]

# Build NLTK's IBM Model 1 from the sentence pairs.
ibm1 = IBMModel1(bitext, 5)

In [42]:
# NOTE(review): the IBMModel1 constructor presumably already runs its own
# training passes, so this call would add one more pass on the same data.
# Confirm against the NLTK documentation that this is intended.
ibm1.train(bitext)

In [43]:
# Print each aligned sentence pair; its printed form shows the truncated
# English and Hindi text.
for i in bitext:
    print(i)

<AlignedSent: 'Fresh breath and shi...' -> 'ताजा साँसें और चमचमा...'>
<AlignedSent: 'Your self-confidence...' -> 'दाँतों से आपका आत्मव...'>
<AlignedSent: 'Bacteria stay betwee...' -> 'हमारे मसूढ़ों और दाँ...'>
<AlignedSent: 'They make teeth dirt...' -> 'ये दाँतों को गंदा और...'>
<AlignedSent: 'You may keep your te...' -> 'यहाँ दिए कुछ आसान नु...'>
<AlignedSent: 'Clean your teeth pro...' -> 'दाँतों को ठीक से साफ...'>
<AlignedSent: 'It takes two to thre...' -> 'दाँतों को ठीक से साफ...'>
<AlignedSent: 'But most of the peop...' -> 'लेकिन ज्यादातर लोग इ...'>
<AlignedSent: 'Drink plenty of wate...' -> 'खूब पानी पीएँ ।...'>
<AlignedSent: 'Bacteria attack fast...' -> 'मुँह सूखने पर बैक्टी...'>
<AlignedSent: 'With this stink come...' -> 'इससे साँसों से बदबू ...'>
<AlignedSent: 'By drinking plenty o...' -> 'खूब पानी पीने से न क...'>
<AlignedSent: 'Saliva has important...' -> 'मुँह साफ रखने में ला...'>
<AlignedSent: 'Saliva destroys thos...' -> 'लार उन बैक्टीरिया को...'>
<AlignedSent: 'Chew the s

In [44]:
# Inspect a single sentence pair: its English tokens (.words), its Hindi
# tokens (.mots) and the alignment, which prints as "i-j" index pairs.
# NOTE(review): presumably i indexes .words and j indexes .mots — confirm.
test_sentence = bitext[112]
print(test_sentence.words)
print(test_sentence.mots)
print(test_sentence.alignment)

['All', 'salesmen', 'were', 'listening', 'carefully', 'the', 'speech', 'given', 'by', 'president', 'Mohammad', 'Nasheed', 'in', 'World', 'climate', 'conference', 'a', 'day', 'before', 'in', 'Copenhagen', '.']
['सारे', 'सेल्समैन', 'मालदीव', 'के', 'राष्\u200dट्रपति', 'मोहम्मद', 'नाशीद', 'द्वारा', 'एक', 'दिन', 'पहले', 'कोपेनहेगन', 'में', 'विश्\u200dव', 'जलवायु', 'सम्मेलन', 'में', 'दिए', 'गए', 'भाषण', 'को', 'बड़े', 'ध्यान', 'से', 'सुन', 'रहे', 'थे', '।']
0-24 1-24 2-26 3-19 4-22 5-3 6-19 7-17 8-7 9-19 10-24 11-19 12-16 13-24 14-24 15-24 16-25 17-9 18-10 19-16 20-24 21-27


In [45]:
# bitext.append(AlignedSent(['what is Education'], ['शिक्षा क्या है']))

In [46]:
# Rebuild the model from scratch with 15 as the second constructor argument
# (presumably the number of training iterations — confirm in the NLTK docs).
ibm1 = IBMModel1(bitext, 15)

In [47]:
# Look up the table entry for English 'what' and Hindi 'क्या'.
# NOTE(review): presumably P('what' | 'क्या') — confirm the direction in the
# NLTK documentation.
print(ibm1.translation_table['what']['क्या'])

0.10344207667425154


In [48]:
# Dump the English (.words) and Hindi (.mots) token lists of every pair.
for i in bitext:
  print(i.words)
  print(i.mots)

['Fresh', 'breath', 'and', 'shining', 'teeth', 'enhance', 'your', 'personality', '.']
['ताजा', 'साँसें', 'और', 'चमचमाते', 'दाँत', 'आपके', 'व्यक्तित्व', 'को', 'निखारते', 'हैं', '।']
['Your', 'self-confidence', 'also', 'increases', 'with', 'teeth', '.']
['दाँतों', 'से', 'आपका', 'आत्मविश्\u200dवास', 'भी', 'बढ़ता', 'है', '।']
['Bacteria', 'stay', 'between', 'our', 'gums', 'and', 'teeth', '.']
['हमारे', 'मसूढ़ों', 'और', 'दाँतों', 'के', 'बीच', 'बैक्टीरिया', 'मौजूद', 'होते', 'हैं', '।']
['They', 'make', 'teeth', 'dirty', 'and', 'breath', 'stinky', '.']
['ये', 'दाँतों', 'को', 'गंदा', 'और', 'साँसों', 'को', 'बदबूदार', 'बना', 'देते', 'हैं', '।']
['You', 'may', 'keep', 'your', 'teeth', 'clean', 'and', 'breath', 'fresh', 'by', 'the', 'help', 'of', 'some', 'easy', 'tips', 'given', 'here', '.']
['यहाँ', 'दिए', 'कुछ', 'आसान', 'नुस्खों', 'की', 'मदद', 'से', 'आप', 'अपने', 'दाँतों', 'को', 'स्वच्छ', 'और', 'साँसों', 'को', 'ताजा', 'रख', 'सकते', 'हैं', '।']
['Clean', 'your', 'teeth', 'properly', '.']
['दाँतों