## Word Probability by Subramanyam Sahoo

In [1]:
import numpy as np
import math
from nltk.tokenize import word_tokenize
from collections import defaultdict
from nltk.translate.metrics import alignment_error_rate
from nltk.metrics.scores import (precision, recall)
from operator import itemgetter

In [2]:
#DATA
# Each corpus file is read whole and split on "\n" into one sentence per list
# entry. The `with` blocks close every handle as soon as its text is read.
# NOTE(review): a file ending in a newline yields a trailing "" entry; this is
# kept as-is so the English and Hindi lists keep the same indexing.
with open('./train.en', 'r', encoding="utf8") as file:
    train_en = file.read()
raw_sentences_train_en = train_en.split("\n")

with open('./train.hi', 'r', encoding="utf8") as file:
    train_hi = file.read()
sentences_train_hi = train_hi.split("\n")

with open('./test.en', 'r', encoding="utf8") as file:
    test_en = file.read()
raw_sentences_test_en = test_en.split("\n")

with open('./test.hi', 'r', encoding="utf8") as file:
    test_hi = file.read()
sentences_test_hi = test_hi.split("\n")

with open('./dev.en.txt', 'r', encoding="utf8") as file:
    dev_en = file.read()
raw_sentences_dev_en = dev_en.split("\n")

with open('./dev.hi', 'r', encoding="utf8") as file:
    dev_hi = file.read()
sentences_dev_hi = dev_hi.split("\n")

In [3]:
def make_lower_case(data):
    """Return a new list holding the lower-cased form of every sentence in `data`.

    Args:
        data: iterable of strings.

    Returns:
        list of str, in the same order as `data`.
    """
    return [sentence.lower() for sentence in data]

In [4]:
# Lower-case the English side of every split in a single unpacking assignment.
sentences_dev_en, sentences_test_en, sentences_train_en = (
    make_lower_case(raw_split)
    for raw_split in (raw_sentences_dev_en, raw_sentences_test_en, raw_sentences_train_en)
)

In [5]:
def is_converged(new, old, epoch, max_epochs=15):
    """Decide whether EM training should stop.

    The stopping rule is purely an epoch budget: training is reported as
    converged once `epoch` reaches `max_epochs`.

    Args:
        new: current translation table. Accepted for interface compatibility
            but not inspected.
        old: previous translation table. Accepted for interface compatibility
            but not inspected.
        epoch: 1-based number of the epoch that has just finished.
        max_epochs: epoch at which training stops (default 15).

    Returns:
        bool: True when `epoch >= max_epochs`, otherwise False.
    """
    return epoch >= max_epochs

In [6]:
def perform_EM(en_sentences, hi_sentences):
    """Estimate word-translation probabilities with expectation-maximisation.

    Sentence `i` of `hi_sentences` is paired with `en_sentences[i]`; both are
    tokenised by splitting on a single space. Every (Hindi word, English word)
    pair that co-occurs in a sentence pair starts from the same small uniform
    value and is re-estimated each epoch until `is_converged` says to stop.

    Args:
        en_sentences: indexable sequence of English sentences (str).
        hi_sentences: iterable of Hindi sentences (str), aligned by position
            with `en_sentences`.

    Returns:
        Nested defaultdict `table[hindi_word][english_word] -> float`. Looking
        up an unseen pair yields (and stores) the uniform initial value.

    Raises:
        IndexError: if `en_sentences` has fewer entries than `hi_sentences`.
    """
    uni_ini = 0.00001

    translation_prob = defaultdict(lambda: defaultdict(lambda: float(uni_ini)))

    # Tokenise every sentence pair once up front; the tokens never change
    # between epochs, so re-splitting inside the loops is wasted work.
    sentence_pairs = [
        (hin_sen.split(" "), en_sentences[index_sen].split(" "))
        for index_sen, hin_sen in enumerate(hi_sentences)
    ]

    epoch = 0

    while True:

        epoch += 1
        print("epoch num:", epoch,"\n")
        count = defaultdict(float)
        total = defaultdict(float)

        for hin_sen_words, eng_sen_words in sentence_pairs:
            # compute normalization: for each Hindi word, the mass it currently
            # assigns to the English words of this sentence
            s_total = {}
            for hin_word in hin_sen_words:
                row = translation_prob[hin_word]
                s_total[hin_word] = sum(row[eng_word] for eng_word in eng_sen_words)

            # collect counts: each pair contributes its normalised share
            for hin_word in hin_sen_words:
                row = translation_prob[hin_word]
                norm = s_total[hin_word]
                for eng_word in eng_sen_words:
                    share = row[eng_word] / norm
                    count[(hin_word, eng_word)] += share
                    total[eng_word] += share

        # Snapshot the table BEFORE the M-step overwrites it. Binding the
        # previous table to the same object as the live one would make
        # "previous" and "current" always identical.
        translation_prob_prev = {
            hin_word: dict(row) for hin_word, row in translation_prob.items()
        }

        # estimate probabilities
        for (hin_word, eng_word), pair_count in count.items():
            translation_prob[hin_word][eng_word] = pair_count / total[eng_word]

        if is_converged(translation_prob, translation_prob_prev, epoch):
            break

    return translation_prob


In [7]:
def train_model(sentences_train_en, sentences_train_hi):
    """Train the translation table by running EM over the parallel training data.

    Args:
        sentences_train_en: English training sentences.
        sentences_train_hi: Hindi training sentences, aligned by position.

    Returns:
        Whatever `perform_EM` returns for the given corpora.
    """
    return perform_EM(sentences_train_en, sentences_train_hi)


In [8]:
def test_model(dataset, tef, output_path="dev_version2.txt"):
    """Translate every sentence in `dataset` and log the results to a file.

    Args:
        dataset: iterable of source sentences (str).
        tef: translation table passed through to `translate_sentence`.
        output_path: file that receives the "hin:/eng:" log. It is truncated
            on open. Defaults to "dev_version2.txt".

    Returns:
        list of str: one translated sentence per input sentence, in order.
    """
    # UTF-8 is set explicitly because the log contains Devanagari text, which
    # not every platform-default codec can encode. The `with` block closes the
    # file even if a translation raises.
    with open(output_path, 'w+', encoding="utf8") as file:
        return [translate_sentence(sentence, tef, file) for sentence in dataset]

In [9]:
def translate_sentence(sentence, tef, file):
    '''
    Translate one sentence word by word.

    The sentence is split on single spaces. Each token is replaced by the
    highest-scoring entry in `tef[token]`; on ties the first entry wins.
    A token with no entry in `tef` is translated as the empty string.
    The source and translated sentences are printed and written to `file`.

    Args:
        sentence: source sentence (str).
        tef: mapping `source_word -> {english_word: probability}`. It is only
            read, never modified.
        file: writable text stream that receives the "hin:/eng:" log lines.

    Returns:
        str: the translated words joined by single spaces.
    '''
    print("hin:",sentence)

    file.write("hin: ")
    file.write(sentence)
    file.write("\n")

    eng_sentence = []
    for token in sentence.split(" "):
        # `.get` rather than `tef[token]`: indexing a defaultdict would insert
        # an empty row for every unseen token and so grow the trained model.
        candidates = tef.get(token) or {}
        eng_sentence.append(max(candidates, key=candidates.get, default=""))

    eng_sentence = " ".join(eng_sentence)
    print("eng:", eng_sentence)
    file.write("eng: ")
    file.write(eng_sentence)
    file.write("\n\n")
    return eng_sentence


In [10]:
# Train on the lower-cased English and the Hindi training sentences; `tef` is
# the table returned by train_model and is indexed as tef[hindi_word][english_word].
tef = train_model(sentences_train_en, sentences_train_hi)

epoch num: 1 

epoch num: 2 

epoch num: 3 

epoch num: 4 

epoch num: 5 

epoch num: 6 

epoch num: 7 

epoch num: 8 

epoch num: 9 

epoch num: 10 

epoch num: 11 

epoch num: 12 

epoch num: 13 

epoch num: 14 

epoch num: 15 



In [11]:
# np.save("./models/IBMmodel1tef_3", tef)

In [12]:
# Dump the table: each source word, then its candidate translations from most
# to least probable, skipping candidates whose probability is below 0.0001.
for fword, twtable in tef.items():
    print('{0}'.format(fword))
    ranked = sorted(twtable.items(), key=itemgetter(1), reverse=True)
    for eword, prob in ranked:
        if not prob < 0.0001:
            print('{0}:{1:.4f}'.format(eword, prob))

मोतियाबिंद
cataract:0.8275
paternal:0.0003
father:0.0003
uncle:0.0003
somebody:0.0001
का
original:0.4864
estimation:0.4844
cranial:0.4748
turpentine:0.4197
flour:0.3776
use:0.3640
solution:0.3495
weather:0.3462
belongs:0.3205
shakiya:0.3205
real:0.3169
prevent:0.3082
clue:0.3079
kerosene:0.2931
coriander:0.2775
protect:0.2734
boiled:0.2674
certainly:0.2604
says:0.2573
pan:0.2519
gland:0.2511
cause:0.2508
organisation:0.2503
finding:0.2477
over:0.2461
production:0.2453
fifth:0.2443
respects:0.2428
definite:0.2416
shoo:0.2312
tai:0.2284
chi:0.2284
of:0.2189
try:0.2182
meaning:0.2178
administering:0.2169
authority:0.2149
dr:0.2056
garden:0.2048
sicily:0.2048
closest:0.2035
insecticides:0.2016
hygiene:0.1999
spells:0.1999
rapidly:0.1983
used:0.1979
changes:0.1836
realize:0.1834
mind:0.1825
vanishes:0.1809
circulation:0.1799
cabbage:0.1793
inverted:0.1769
flow:0.1759
etc.:0.1738
bone:0.1737
contact:0.1734
stew:0.1709
alone:0.1678
consumed:0.1653
eat:0.1644
paralysing:0.1643
vision:0.1628
tr

immediately:0.0520
lie:0.0460
complain:0.0234
finger:0.0147
burn:0.0039
protect:0.0001
लेटे
lie:0.0770
complain:0.0391
burn:0.0065
लेटने
lying:0.3712
हजम
digest:0.1468
digested:0.1128
anything:0.0118
intestines:0.0089
go:0.0025
घंटों
hours:0.2523
context:0.0002
few:0.0002
transfer:0.0001
झुके
bend:0.1115
downwards:0.1115
खायी
bend:0.1115
downwards:0.1115
सोने
antacid:0.2322
meal:0.0773
going:0.0502
hour:0.0012
before:0.0002
एंटासिट
antacid:0.2325
लिक्विड
liquid:0.2004
antacid:0.0419
एंटासिड
antacid:0.2325
फंसी
bottle:0.0910
lightly:0.0909
warming:0.0164
cotton:0.0010
gas:0.0009
हल्की
stroll:0.2892
bottle:0.0910
lightly:0.0909
piercing:0.0870
slight:0.0221
warming:0.0164
cotton:0.0010
gas:0.0009
slow:0.0002
मसाज
bottle:0.0910
lightly:0.0909
warming:0.0164
cotton:0.0010
gas:0.0009
हॉट
brandy:0.2631
hot:0.1753
bottle:0.0376
lightly:0.0376
warming:0.0068
containing:0.0014
soup:0.0010
enough:0.0007
cotton:0.0004
gas:0.0004
lots:0.0002
वॉटर
bottle:0.0910
lightly:0.0909
warming:0.0164
cotton:

In [13]:
# print(tef['तथा']['and'])