In [47]:
import sys
from copy import deepcopy
from itertools import islice
from operator import itemgetter


def get_corpus(filename, max_lines=1000):
    """Load the leading lines of the corpus stored at ``filename``.

    Args:
        filename: Path of a text file; each line is treated as one sentence.
        max_lines: Maximum number of lines to read. Defaults to 1000, the
            limit that was previously hard-coded.

    Returns:
        A list with at most ``max_lines`` lines, in file order. Each line
        keeps its trailing newline. A file with fewer lines is returned in
        full instead of raising ``StopIteration``.
    """
    # The encoding is pinned so that reading does not depend on the
    # platform's locale default (assumes the corpus files are UTF-8).
    with open(filename, 'r', encoding='utf-8') as myfile:
        corpus = list(islice(myfile, max_lines))

    # VERBOSE is an optional module-level debug switch. It is not defined in
    # this file, so a missing flag is treated as "off" rather than a NameError.
    if globals().get('VERBOSE', False):
        print(corpus, file=sys.stderr)
    return corpus

def get_words(en_corpus, hi_corpus):
    """Collect the vocabulary of both sides of the corpus.

    Args:
        en_corpus: Iterable of English sentences (strings).
        hi_corpus: Iterable of Hindi sentences (strings).

    Returns:
        A dict with the keys ``'en'`` and ``'hi'``; each value is the set of
        whitespace-separated tokens found in the corresponding corpus.
    """
    corpora = {'en': en_corpus, 'hi': hi_corpus}
    vocabulary = {}
    for lang, sentences in corpora.items():
        tokens = set()
        for sentence in sentences:
            tokens.update(sentence.split())
        vocabulary[lang] = tokens
    return vocabulary

def summarize_results(trans_prob):
    """Return, for every outer key, the inner key with the highest score.

    Args:
        trans_prob: Mapping ``{key: {candidate: score}}``. In this file it is
            the table ``{word_en: {word_hi: probability}}``.

    Returns:
        A dict ``{key: best_candidate}``. When several candidates share the
        top score, the one that comes first in the inner dict wins.

    Raises:
        ValueError: If an inner mapping is empty.
    """
    # max() finds the winner in a single pass; sorting the whole candidate
    # list only to read its first element was wasted work.
    return {
        key: max(scores.items(), key=itemgetter(1))[0]
        for key, scores in trans_prob.items()
    }

def train_iteration(en_corpus, hi_corpus, words, s_total, prev_trans_prob):
    """Run one expectation-maximisation pass over the sentence pairs.

    Sentences are paired by position; every pair available in both corpora is
    used (the shorter corpus decides how many), rather than exactly the first
    1000.

    Args:
        en_corpus: Sequence of English sentences (strings).
        hi_corpus: Sequence of Hindi sentences, aligned index by index with
            ``en_corpus``.
        words: Dict with the vocabularies ``words['en']`` and ``words['hi']``.
        s_total: Scratch dict keyed by English word. It is overwritten while
            the pass runs; the caller only has to supply the dict.
        prev_trans_prob: Table ``{word_en: {word_hi: probability}}`` from the
            previous pass. It is not modified.

    Returns:
        A new table with the same layout, where for every Hindi word that
        received any count the values are ``count[en][hi] / total[hi]``.
        Hindi words that received no count keep their previous values.
    """
    # NOTE(review): the update has the shape of IBM Model 1 EM training -
    # confirm before relying on that.

    # The values are floats (immutable), so copying each inner dict is enough
    # to keep prev_trans_prob untouched; a deepcopy of the full table is not
    # needed.
    trans_prob = {en: dict(row) for en, row in prev_trans_prob.items()}

    # count(en|hi) = 0
    count = {en: dict.fromkeys(words['hi'], 0) for en in words['en']}
    # total(hi) = 0
    total = dict.fromkeys(words['hi'], 0)

    for en_sentence, hi_sentence in zip(en_corpus, hi_corpus):
        ens = en_sentence.split()
        his = hi_sentence.split()

        # Normalisation term per English word of this sentence pair.
        for en in ens:
            s_total[en] = sum(trans_prob[en][hi] for hi in his)

        # Distribute each English word's unit mass over the Hindi words.
        for en in ens:
            if not s_total[en]:
                # Empty Hindi sentence or no probability mass: nothing to
                # distribute, and dividing would raise ZeroDivisionError.
                continue
            for hi in his:
                share = trans_prob[en][hi] / s_total[en]
                count[en][hi] += share
                total[hi] += share

    for hi in words['hi']:
        if not total[hi]:
            # No evidence for this Hindi word in any pair; keep the previous
            # estimate instead of dividing by zero.
            continue
        for en in words['en']:
            trans_prob[en][hi] = count[en][hi] / total[hi]

    return trans_prob


if __name__ == '__main__':

    # Load the parallel corpus and build the vocabulary of each side.
    en_corpus = get_corpus('./train.en')
    hi_corpus = get_corpus('./train.hi')

    words = get_words(en_corpus, hi_corpus)

    print(words)

    # Scratch space reused by every training pass.
    s_total = {word_en: 0 for word_en in words['en']}

    # init trans prob (uniform distribution); the value is the same for every
    # pair, so it is computed once.
    uniform = 1 / len(words['hi']) if words['hi'] else 0.0
    prev_trans_prob = {
        word_en: {word_hi: uniform for word_hi in words['hi']}
        for word_en in words['en']
    }

    # NOTE(review): the previous loop counted from 0 while `iterations <= 100`,
    # i.e. it ran 101 passes. That count is kept so results do not change;
    # lower it to 100 if the extra pass was unintended.
    num_passes = 101

    # The unused `converged` flag and the per-pass summary table were dropped:
    # neither influenced the loop, and the summary walked the whole table on
    # every pass.
    trans_prob = prev_trans_prob
    for iterations in range(1, num_passes + 1):
        trans_prob = train_iteration(en_corpus, hi_corpus, words, s_total, prev_trans_prob)
        prev_trans_prob = trans_prob

    print(summarize_results(trans_prob))








{'en': {'last', 'appointed', 'while', 'differences', 'number', 'Nyctalopia', 'paramedical', 'incidence', 'floor', 'mouths', 'else', 'knot', 'sanitation', 'Centre', 'ECCE-IOL', 'or', 'attention', 'National', 'bronchitis', 'above', 'institutes', 'tunnel', 'filtering', 'wound', 'eat', 'extra', 'disease', 'complication', 'highest', 'Population', 'stitch', 'cold', 'Tetanus', 'diagnostic', 'adding', 'precautions', 'giving', 'coughing', '21', 'Absorption', 'related', 'services', 'medical-philosophy', 'microbacterium', 'discolored', 'sent', 'issues', 'reforms', 'distribution', 'service', 'thought', 'measures', 'emotions', 'Oily', 'experienced', 'complains', 'quality', 'power', 'average', 'windpipe', 'Even', 'partners', 'adults', 'works', 'fluids', 'scale', 'vessels', 'nerves', 'secure', 'therefore', 'freckle', 'earliest', 'Mental', 'soda', 'Entamoeba', 'fitted', 'typhoid', 'Satvik', 'Dehradun', 'hysterectomy', 'Special', 'wealth', 'unexpected', 'basically', 'red', 'anemia', 'Fingers', 'waist',

{'last': 'अन्धापन', 'appointed': 'तैनात', 'while': 'जबकि', 'differences': 'विभिन्नताएँ', 'number': 'संख्या', 'Nyctalopia': 'रतौंधी', 'paramedical': 'पंचायती', 'incidence': 'प्रिवलेस', 'floor': 'खास', 'mouths': 'संकेतों', 'else': 'ब्लडप्रेशर', 'knot': 'गाँठ', 'sanitation': 'Environmental', 'Centre': 'माईक्रोस्कोपिक', 'ECCE-IOL': 'टनल', 'or': 'या', 'attention': 'ध्यान', 'National': 'योजना', 'bronchitis': 'जिनसे', 'above': 'उपरोक्त', 'institutes': 'संस्थानों', 'tunnel': 'टनल', 'filtering': 'प्रयत्\u200dन', 'wound': 'घाव', 'eat': 'सेवन', 'extra': 'दबाव', 'disease': 'बीमारी', 'complication': 'विकृत', 'highest': 'उच्चतम', 'Population': 'पंचायती', 'stitch': 'एन.एस.वी.', 'cold': 'सर्दी', 'Tetanus': 'धूल', 'diagnostic': 'आर.एफ.टी.', 'adding': 'सा', 'precautions': 'क्या-क्या', 'giving': 'देकर', 'coughing': 'खाँसने', '21': '21', 'Absorption': 'Absorption', 'related': 'सम्बन्धित', 'services': 'सेवाओं', 'medical-philosophy': 'चिकित्सा-दर्शन', 'microbacterium': 'माइक्रोबैक्टीरियम', 'discolored': 'चु