In [1]:
from __future__ import division
from collections import defaultdict
from nltk.translate import AlignedSent
from nltk.translate import Alignment
from nltk.translate import IBMModel
from nltk.translate.ibm_model import Counts
import warnings

In [2]:
class IBMModel1(IBMModel):
    """
    Lexical translation model trained with EM (IBM Model 1).

    The only probability table this class reads and writes is
    ``translation_table``, indexed as
    ``translation_table[target_word][source_word]``. ``None`` is used
    as the NULL source token that a target word may align to.
    """

    def __init__(self, sentence_aligned_corpus, iterations, probability_tables=None):
        """
        Train on ``sentence_aligned_corpus`` and create a lexical
        translation model.

        Translation direction is from ``AlignedSent.mots`` to
        ``AlignedSent.words``.

        :param sentence_aligned_corpus: Sentence-aligned parallel corpus
        :type sentence_aligned_corpus: list(AlignedSent)

        :param iterations: Number of iterations to run training algorithm
        :type iterations: int

        :param probability_tables: Optional. Use this to pass in custom
            probability values. If not specified, probabilities will be
            set to a uniform distribution, or some other sensible value.
            If specified, the following entry must be present:
            ``translation_table``.
            See ``IBMModel`` for the type and purpose of this table.
        :type probability_tables: dict[str]: object

        :raises KeyError: if ``probability_tables`` is given but has no
            ``translation_table`` entry
        """
        super(IBMModel1, self).__init__(sentence_aligned_corpus)

        if probability_tables is None:
            self.set_uniform_probabilities(sentence_aligned_corpus)
        else:
            # Set user-defined probabilities
            self.translation_table = probability_tables['translation_table']

        # Each pass is one full EM iteration over the corpus.
        for _ in range(iterations):
            self.train(sentence_aligned_corpus)

        # Store the best alignment on every sentence pair of the corpus.
        self.align_all(sentence_aligned_corpus)

    def set_uniform_probabilities(self, sentence_aligned_corpus):
        """
        Initialise every ``translation_table[t]`` row to a uniform
        default of ``1 / len(self.trg_vocab)``.

        ``self.trg_vocab`` is presumably filled by ``IBMModel.__init__``
        (not visible in this file); confirm against the base class.

        If the target vocabulary is empty there is nothing to
        initialise; a warning is issued and the table is left as is
        instead of failing with ``ZeroDivisionError``.

        :param sentence_aligned_corpus: Unused here; kept so the
            signature stays compatible with existing callers.
        """
        vocab_size = len(self.trg_vocab)
        if vocab_size == 0:
            warnings.warn(
                "Target language vocabulary is empty. "
                "Translation probabilities were not initialised."
            )
            return

        # True division: the notebook imports ``division`` from
        # ``__future__`` and otherwise targets Python 3.
        initial_prob = 1 / vocab_size
        if initial_prob < IBMModel.MIN_PROB:
            warnings.warn(
                "Target language vocabulary is too large ("
                + str(vocab_size)
                + " words). "
                "Results may be less accurate."
            )

        for t in self.trg_vocab:
            # Any source word not seen yet for ``t`` gets the uniform value.
            self.translation_table[t] = defaultdict(lambda: initial_prob)

    def train(self, parallel_corpus):
        """
        Run one EM iteration over ``parallel_corpus``.

        Fractional counts are collected for every (target, source) word
        pair of every sentence pair, then handed to the inherited
        ``maximize_lexical_translation_probabilities`` (defined outside
        this file) to update the model.

        :param parallel_corpus: Sentence pairs to collect counts from
        :type parallel_corpus: list(AlignedSent)
        """
        counts = Counts()
        for aligned_sentence in parallel_corpus:
            trg_sentence = aligned_sentence.words
            # Prepend the NULL token so target words may align to nothing.
            src_sentence = [None] + aligned_sentence.mots

            # E step (a): Compute normalization factors to weigh counts
            total_count = self.prob_all_alignments(src_sentence, trg_sentence)

            # E step (b): Collect counts
            # NOTE(review): ``total_count`` is keyed by word, not by
            # position, so a target word occurring k times in one
            # sentence is normalised by k times its per-occurrence sum.
            # Left unchanged on purpose; confirm this is intended.
            for t in trg_sentence:
                for s in src_sentence:
                    count = self.prob_alignment_point(s, t)
                    normalized_count = count / total_count[t]
                    counts.t_given_s[t][s] += normalized_count
                    counts.any_t_given_s[s] += normalized_count

        # M step: Update probabilities with maximum likelihood estimate
        self.maximize_lexical_translation_probabilities(counts)

    def prob_all_alignments(self, src_sentence, trg_sentence):
        """
        Computes, for every target word t, the sum of
        ``prob_alignment_point(s, t)`` over all words s in
        ``src_sentence``.

        ``train`` uses these sums as normalization factors for the
        fractional counts it collects.

        :param src_sentence: Source words, including the leading
            ``None`` (NULL) token when called from ``train``
        :param trg_sentence: Target words

        :return: Probability of t for all s in ``src_sentence``
        :rtype: dict(str): float
        """
        alignment_prob_for_t = defaultdict(float)
        for t in trg_sentence:
            for s in src_sentence:
                alignment_prob_for_t[t] += self.prob_alignment_point(s, t)
        return alignment_prob_for_t

    def prob_alignment_point(self, s, t):
        """
        Probability that word ``t`` in the target sentence is aligned to
        word ``s`` in the source sentence
        """
        return self.translation_table[t][s]

    def prob_t_a_given_s(self, alignment_info):
        """
        Probability of target sentence and an alignment given the
        source sentence

        :param alignment_info: Object exposing ``alignment``,
            ``trg_sentence`` and ``src_sentence``; ``alignment[j]`` is
            the source index aligned to target position ``j``
        :return: Product of the lexical translation probabilities of
            all alignment points, never less than ``IBMModel.MIN_PROB``
        :rtype: float
        """
        prob = 1.0

        for j, i in enumerate(alignment_info.alignment):
            if j == 0:
                continue  # skip the dummy zeroeth element
            trg_word = alignment_info.trg_sentence[j]
            src_word = alignment_info.src_sentence[i]
            prob *= self.translation_table[trg_word][src_word]

        return max(prob, IBMModel.MIN_PROB)

    def align_all(self, parallel_corpus):
        """
        Call ``align`` on every sentence pair in ``parallel_corpus``,
        storing the best alignment on each pair.
        """
        for sentence_pair in parallel_corpus:
            self.align(sentence_pair)

    def align(self, sentence_pair):
        """
        Determines the best word alignment for one sentence pair from
        the corpus that the model was trained on.

        The best alignment will be set in ``sentence_pair`` when the
        method returns. In contrast with the internal implementation of
        IBM models, the word indices in the ``Alignment`` are zero-
        indexed, not one-indexed.

        :param sentence_pair: A sentence in the source language and its
            counterpart sentence in the target language
        :type sentence_pair: AlignedSent
        """
        best_alignment = []

        for j, trg_word in enumerate(sentence_pair.words):
            # Initialize trg_word to align with the NULL token
            best_prob = max(self.translation_table[trg_word][None], IBMModel.MIN_PROB)
            best_alignment_point = None
            for i, src_word in enumerate(sentence_pair.mots):
                align_prob = self.translation_table[trg_word][src_word]
                if align_prob >= best_prob:  # prefer newer word in case of tie
                    best_prob = align_prob
                    best_alignment_point = i

            # A ``None`` alignment point means the word stayed on NULL.
            best_alignment.append((j, best_alignment_point))

        sentence_pair.alignment = Alignment(best_alignment)


In [3]:
limit = 10000  # default cap on the number of lines ReadData loads from a file


def ReadData(fileName, maxLines=None):
    """
    Read a UTF-8 text file and whitespace-tokenize it line by line.

    Blank lines are kept as empty token lists, so the index of an entry
    always equals the index of its line in the file.

    :param fileName: Path of the text file to read
    :param maxLines: Maximum number of lines to read. ``None`` (the
        default) falls back to the module-level ``limit``; if that is
        ``None`` as well, the whole file is read. ``0`` reads nothing.
    :return: One list of tokens per line read
    :rtype: list(list(str))
    :raises ValueError: if the effective cap is negative
    """
    cap = limit if maxLines is None else maxLines
    if cap is not None and cap < 0:
        raise ValueError("maxLines must be non-negative, got " + str(cap))

    tokenizeDataList = []
    with open(fileName, "r", encoding="utf-8") as f:
        if cap == 0:
            # A cap of zero means "read no lines", not "read everything".
            return tokenizeDataList
        for lineCount, line in enumerate(f, start=1):
            tokenizeDataList.append(line.split())
            # Stop right after the cap-th line; never true when cap is None.
            if lineCount == cap:
                break
    return tokenizeDataList

# Load the German side of the parallel corpus; ReadData stops after `limit` lines.
germanDataList = ReadData("europarl-v7.de-en.de")
# Sanity check: number of tokenized lines that were actually read.
print(len(germanDataList))


10000


In [11]:
# Peek at the first tokenized line of the German file.
# NOTE(review): the entry displayed below spans several sentences. ReadData
# splits per file line, so confirm that each line of the input file is
# meant to be one aligned unit.
germanDataList[0]

['Die',
 'Stadt,',
 'in',
 'der',
 'ich',
 'lebe,',
 'ist',
 'sehr',
 'klein.',
 'Es',
 'hat',
 'nur',
 '4500',
 'Einwohner',
 'und',
 'ist',
 'von',
 'Ackerland',
 'und',
 'Wäldern',
 'umgeben.',
 'Wir',
 'haben',
 'auch',
 'einen',
 'schönen',
 'See,',
 'der',
 'eine',
 'große',
 'Attraktion',
 'des',
 'Sommers',
 'ist,',
 'wenn',
 'viele',
 'Touristen',
 'in',
 'den',
 'Urlaub',
 'kommen.',
 'Ich',
 'bin',
 'im',
 'Winter',
 'auf',
 'dem',
 'See',
 'Schlittschuh',
 'gefahren,',
 'als',
 'ich',
 'jünger',
 'war',
 'und',
 'die',
 'Winter',
 'kälter',
 'waren.',
 'Sie',
 'können',
 'dort',
 'schwimmen,',
 'baden',
 'oder',
 'windsurfen',
 'und',
 'sogar',
 'Wakeboarden',
 'lernen.',
 'Meine',
 'Stadt',
 'ist',
 'nicht',
 'weit',
 'von',
 'der',
 'Großstadt',
 'entfernt,',
 'so',
 'dass',
 'die',
 'Leute',
 'dort',
 'auch',
 'einkaufen',
 'können.',
 'Mit',
 'dem',
 'Zug',
 'sind',
 'es',
 'nur',
 '15',
 'Minuten.']

In [6]:
# Load the English side of the parallel corpus with the same line cap as the German side.
englishDataList = ReadData("europarl-v7.de-en.en")
# Should print the same count as the German file so the two lists pair up one-to-one.
print(len(englishDataList))

10000


In [7]:
#englishDataList[0]

In [8]:
# Pair the i-th German line with the i-th English line. German tokens are
# passed first (AlignedSent.words) and English tokens second (AlignedSent.mots).
# zip stops at the shorter list.
trainData = [
    AlignedSent(germanTokens, englishTokens)
    for germanTokens, englishTokens in zip(germanDataList, englishDataList)
]

In [9]:
# Train IBM Model 1 with 5 EM iterations; the constructor also stores the
# best alignment on every sentence pair it is given.
# The [:10000] slice repeats the value of `limit` used when reading the
# files, so with the current settings it selects every loaded pair.
ibm1 = IBMModel1(trainData[:10000], 5)

In [10]:
# Inspect the first training pair. `.mots` holds the second argument passed
# to AlignedSent when trainData was built, i.e. the English tokens here.
test_sentence = trainData[0]
test_sentence.mots

['the',
 'town',
 'I',
 'live',
 'in',
 'is',
 'quiet',
 'small',
 '.',
 'It',
 'only',
 'has',
 '4500',
 'inhabitants',
 'and',
 'is',
 'surrounded',
 'by',
 'farmland',
 'and',
 'forests.',
 'we',
 'also',
 'have',
 'a',
 'nice',
 'lake',
 'which',
 'is',
 'great',
 'attraction',
 'of',
 'summer',
 'when',
 'many',
 'tourists',
 'come',
 'for',
 'holidays.I',
 'used',
 'to',
 'ice',
 'skate',
 'on',
 'the',
 'lake',
 'in',
 'winter',
 'when',
 'I',
 'was',
 'younger',
 'and',
 'winters',
 'were',
 'colder.',
 'You',
 'can',
 'go',
 'swimming,',
 'salling',
 'or',
 'windsurfing',
 'there',
 'and',
 'you',
 'even',
 'can',
 'learn',
 'how',
 'to',
 'wakeboard.',
 'My',
 'town',
 'is',
 'not',
 'far',
 'from',
 'big',
 'city',
 'so',
 'people',
 'can',
 'go',
 'shopping',
 'there',
 'as',
 'well',
 'as',
 'it',
 'only',
 'takes',
 '15',
 'minutes',
 'by',
 'train.Resumption',
 'of',
 'the',
 'session']

In [14]:
# Look up one learned entry. The table is indexed [target word][source word]
# (see IBMModel1.prob_alignment_point), so this is the value stored for the
# target word 'Stadt' paired with the source word 'town'.
print(ibm1.translation_table['Stadt']['town'])

0.018794825765294416
