In [1]:
import numpy as np
import time
import pickle
import operator
from __future__ import print_function
from IPython.display import display
import matplotlib.pyplot as plt
import plotly.plotly as py
%matplotlib inline  

In [2]:
def load_file(filepath):
    """Reads a file into a list of phrases. Each phrase in the file must be separated with a new line character '\n' 
    Args:
        filepath (str): the relevant filepath of the file
    Returns:
        phrases (list(str)): A list with all the phrases 
    """
    with open(filepath, 'r') as f:
        phrases = f.readlines()
    return phrases

def is_number(s):
    try:
        float(s)
        return True
    except ValueError:
        return False


In [3]:
SMALL_VALUE = 0.000001

class Decoder:
    def __init__(self, trg_lm_file, tm_file, rm_file):
        
        self.distortion_limit = 3
        
        
        print('Reading target language model...', end = '')
        self.trg_LM = self.parse_lm_file(trg_lm_file)
        print('Done!')
        
        print('Reading translation model...', end = '')
        self.TM = self.parse_tm_file(tm_file)
        print('Done!')
       
        print('Reading reorderings model...', end = '')
        self.RM = self.parse_rm_file(rm_file)
        print('Done!')
        
        print('Processing...', end = '')
        
        
        #--------------Calculate the avg probability of each orientation -------------------
        
        self.RM_avg = [0] * 6
        
        #accumulate probabilities
        for phrase, value in self.RM.items():
            for i in range(6):
                self.RM_avg[i] += value[i]
        #average them
        self.RM_avg = [1.0 * c / len(self.RM) for c in self.RM_avg]
        
        #-----------------------------------------------------------------------------------
        
        
        
        #-------------calculate the avg probability of each orientation based on the source phrase length--------------------
        
        self.RM_LEN_THRES = 5
        length_counters = [0] * self.RM_LEN_THRES
        reordering_probabilities = [[0] * self.RM_LEN_THRES for i in range(6)]
        
        #accumulate probabilities
        for phrase, value in self.RM.items():
    
            src_len = len(phrase[0].split())
            src_len = min(src_len, self.RM_LEN_THRES)

            length_counters[src_len - 1] += 1

            for reordering_type in range(6):
                reordering_probabilities[reordering_type][src_len - 1] += value[reordering_type]
                
        #average them        
        for reordering_type in range(6):
            for src_len in range(self.RM_LEN_THRES):
                if length_counters[src_len] != 0:
                    reordering_probabilities[reordering_type][src_len] /= length_counters[src_len]
                    
        #transpose so that you index [length][reordering] instead of [reordering][length]
        self.RM_length =  map(list, zip(*reordering_probabilities))
        
        #---------------------------------------------------------------------------------------------------------------
         
        print('Done!')
        
        
    
    def parse_lm_file(self, filepath):
        lm_file = load_file(filepath)

        LM = {}

        for line in lm_file:
            tokens = line.split('\t')
            if len(tokens) > 0:
                
                if is_number(tokens[0]):

                    if(len(tokens) == 3):
                        LM[tokens[1]] = [float(tokens[0]), float(tokens[2])]
                    else:
                        LM[tokens[1]] = [float(tokens[0])]

        return LM


    def parse_tm_file(self, filepath):
        tm_file = load_file(filepath)

        TM = {}

        for line in tm_file:
            tokens = line.split("|||")

            f = tokens[0].strip()
            e = tokens[1].strip()

            tokens = tokens[2].split()


            p_fe = float(tokens[0])
            l_fe = float(tokens[1])
            p_ef = float(tokens[2])
            l_ef = float(tokens[3])
            wp = float(tokens[4])

            TM[(f,e)] = [p_fe, l_fe, p_ef, l_ef, wp]

        return TM


    def parse_rm_file(self, filepath):
        re_file = load_file(filepath)

        reorderings = {}

        for line in re_file:
            tokens = line.split("|||")

            f = tokens[0].strip()
            e = tokens[1].strip()

            tokens = tokens[2].split()

            m_rl = float(tokens[0])
            s_rl = float(tokens[1])
            d_rl = float(tokens[2])

            m_lr = float(tokens[3])
            s_lr = float(tokens[4])
            d_lr = float(tokens[5])

            reorderings[(f,e)] = [m_rl, s_rl, d_rl, m_lr, s_lr, d_lr]

        return reorderings
    
    
    def parse_trace(self, trace_line):
        
        tokens = trace_line.split("|||")
        
        trace = []
        for token in tokens: 
            [src_positions, translation] = token.split(':',1) 
            
            src_positions = [int(pos) for pos in src_positions.split("-")]
            
            trace.append([src_positions[0], src_positions[1], translation.strip()])
        
        return trace
    
    
    
    def ngram_prob(self, ngram_words, LM):
        
        #the empty ngram, called when the 1-gram is not in the vocab
        if not ngram_words:
            return np.log10(SMALL_VALUE)

        ngram_str = " ".join(ngram_words)

        if ngram_str in LM:
            return LM[ngram_str][0]
        else:
            backoff = 0
            backoff_str = " ".join(ngram_words[:-1])
            if backoff_str in LM:
                if len(LM[backoff_str]) == 2:
                    backoff = LM[backoff_str][1]

            return self.ngram_prob(ngram_words[1:],LM) + backoff 
    

    def get_lm_cost(self, phrase, hist, LM):
        lm_cost = 0

        hist_words = hist.split()
        phrase_words = phrase.split()

        for word in phrase_words:

            lm_cost += self.ngram_prob(hist_words + [word], LM)

            if len(hist_words) == 4:
                del hist_words[0]

            hist_words.append(word)

        return lm_cost



    def get_translation_cost(self, src_phrase, trace):

        PHRASE_PENALTY = -1
        LM_SIZE = 4

        W_PFE = 1.0 # TM weight for p(f|e) + p(e|f) 
        W_LFE = 1.0 # TM weight for l(f|e) + l(e|f)
        W_WP = 1.0 # Word Penalty weight
        W_LM = 1.0 # Language Model weight
        W_LD = 1.0 # Linear Distortion weight 

        # Reordering Model weights, individual weights for each direction and orientation => 6 weight parameters
        W_RM_C_1 = 1.0
        W_RM_C_2 = 1.0
        W_RM_S_1 = 1.0
        W_RM_S_2 = 1.0
        W_RM_D_1 = 1.0
        W_RM_D_2 = 1.0

        total_cost = 0

        #maybe add (n,n,</s>) in the trace?
        trace = self.parse_trace(trace) 
        src_words = src_phrase.split()

        src_phrases = [] 
        for p in trace:
            src_phrase = " ".join(src_words[p[0]:p[1] + 1])
            src_phrases.append(src_phrase) 


        trace.append([len(src_words), len(src_words), '</s>'])
        src_words.append('</s>')
        src_phrases.append('</s>')



        for i in range(len(trace)):

            tm_cost = 0   #Translation Model cost
            lm_cost = 0   #Language Model cost
            ld_cost = 0   #Linear Distorion cost
            rm_cost = 0   #Reordering Model cost
            pp_cost = 0   #Phrase penalty cost

            pair = (src_phrases[i], trace[i][2])

            #-----------------------------TM COST-------------------------------------
            if i != len(trace) - 1: #do not compute the cost for </s> </s>

                if pair in self.TM:   
                    [p_fe, lex_fe, p_ef, lex_ef, wp] = self.TM[pair]

    #                 h_TM = p(f|e) + p(e|f) (phrase translation)
                    tm_cost += W_PFE * (np.log10(p_fe) + np.log10(p_ef))

                    #h_TM = l(f|e) + l(e|f) (lexical phrase translation)
                    tm_cost += W_LFE * (np.log10(lex_fe) + np.log10(lex_ef))

                    #h_wp (word penalty)
                    tm_cost += W_WP * wp
                else: 
                    tm_cost = np.log10(SMALL_VALUE)

            #---------------------------------------------------------------------

            #-----------------------------LM COST---------------------------------

            #Build the history of the target phrase, up to (n-1) words ('n' for maximum n-gram used by the LM)
            trg_hist = []
            hist_count = 0
            j = i-1
            while j >= 0:
                prev_trg_words = trace[j][2].split()
                for trg_word in reversed(prev_trg_words):
                    trg_hist.insert(0, trg_word)
                    hist_count += 1

                    if hist_count == LM_SIZE:
                        break;

                if hist_count == LM_SIZE:
                    break;
                j -= 1

            #Add the start symbol '</s>' if the history is not big enough
            if hist_count < LM_SIZE:
                trg_hist.insert(0, '<s>')

            trg_hist = " ".join(trg_hist)

            lm_cost = W_LM * self.get_lm_cost(trace[i][2], trg_hist, self.trg_LM)

            #------------------------------------------------------------------------------


            #--------------------Distortion cost ----------------------------

            #h_LD (Distortion)
            if i == 0:
                ld_cost = -1 * trace[i][0] 
            else:
                ld_cost = -1 * (trace[i][0] - trace[i-1][1] - 1)
            ld_cost = W_LD * ld_cost

            #---------------------------------------------------------------------------

            #---------------------------Reordering cost--------------------------------

            #h_RM = p_lr + p_rl(Reordering penalty)

            if i == 0:
                prev_pair = ('<s>', '<s>')
                prev_start = -1
                prev_end = -1
            else:
                prev_pair = (src_phrases[i-1], trace[i-1][2])
                prev_start = trace[i-1][0]
                prev_end = trace[i-1][1]


            #get the orientation probabilites of the current pair for r->l
            if pair in self.RM:
                [c_rl, s_rl, d_rl] =  self.RM[pair][:3]
            else:
                #fixed big cost
                [c_rl, s_rl, d_rl] = [SMALL_VALUE] * 3

    #             uniform prob
                [c_rl, s_rl, d_rl] = [1/3.0] * 3

                #avg prob cost
                [c_rl, s_rl, d_rl] = self.RM_avg[:3]

                #avg prob | length cost
                src_len = len(pair[0].split())
                src_len = min(src_len, self.RM_LEN_THRES)       
                [c_rl, s_rl, d_rl] = self.RM_length[src_len][:3]


            #get the orientation probabilites of the previous pair for l->r
            if prev_pair in self.RM:
                [c_lr, s_lr, d_lr] = self.RM[prev_pair][3:]
            else:
                #fixed big cost
                [c_lr, s_lr, d_lr] = [SMALL_VALUE] * 3

                #uniform cost
                [c_lr, s_lr, d_lr] = [1/3.0] * 3

                #avg prob cost
                [c_lr, s_lr, d_lr] = self.RM_avg[3:]

                #avg prob | length cost
                src_len = len(prev_pair[0].split())
                src_len = min(src_len, self.RM_LEN_THRES)
                [c_lr, s_lr, d_lr] = self.RM_length[src_len][3:]


            # replace 0 probabilities and convert to log10 prob    
            probs_all = [c_rl, s_rl, d_rl, c_lr, s_lr, d_lr]
            for k in range(len(probs_all)):
                if probs_all[k] == 0:
                    probs_all[k] = SMALL_VALUE

            [c_rl, s_rl, d_rl, c_lr, s_lr, d_lr] = [np.log10(v) for v in probs_all]



            #calculate RM cost
            #continuous     
            if trace[i][0] == prev_end + 1:
                rm_cost += W_RM_C_1 * c_rl + W_RM_C_2 * c_lr
            #swap
            elif trace[i][1] == prev_start - 1:
                rm_cost += W_RM_S_1 * s_rl + W_RM_S_2 * s_lr
            #discontinuous
            else:
                rm_cost += W_RM_D_1 * d_rl + W_RM_D_2 * d_lr

            #------------------------------------------------------------------------------

            if i < len(trace) - 1:
                pp_cost = PHRASE_PENALTY
            else:
                pp_cost = 0


            total_cost += tm_cost + lm_cost + rm_cost + pp_cost + ld_cost


        return total_cost




In [4]:
src_file = load_file('data/file.test.de')
trg_file = load_file('data/file.test.en') 
traces =  load_file('data/testresults.trans.txt.trace')

decoder = Decoder('data/file.en.lm', 'data/phrase-table', 'data/dm_fe_0.75')

Reading target language model...Done!
Reading translation model...Done!
Reading reorderings model...Done!
Processing...Done!


In [7]:
output_file = open('./data/results.txt', 'w')

for i in range(len(src_file)):
    cost = decoder.get_translation_cost(src_file[i], traces[i])
    print(i)
    print('SOURCE:')
    print(src_file[i].strip())
    print()
    print('REFERENCE:')
    print(trg_file[i].strip())
    print()
    print('TRANSLATION:')
    print(" ".join([p[2] for p in decoder.parse_trace(traces[i])]))
    print()
    print('COST:', cost)
    print()
    print('-----------------------------------------------------------------------------------------------')
    
    output_file.write(str(cost) + '\n')
    
output_file.close()

0
SOURCE:
die arbeitsbedingungen für ärzte in der ausbildung gleichen einer horrorgeschichte aus einem dickens - roman des neunzehnten jahrhunderts .

REFERENCE:
the working arrangements for junior doctors are like a horror story from a dickensian novel of the nineteenth century .

TRANSLATION:
the working conditions for junior doctors same a horrorgeschichte from a dickens - a novel of neunzehnten century .

COST: -100.263728491

-----------------------------------------------------------------------------------------------
1
SOURCE:
es ist schon recht merkwürdig , daß wir an der schwelle des 21 . jahrhunderts noch immer für vernünftige regelungen in bezug auf den gesundheits - und arbeitsschutz kämpfen müssen .

REFERENCE:
i find it bizarre that at the beginning of the 21st century we are still arguing for common sense with regard to health and safety .

TRANSLATION:
it is right curious that we entering the 21st century still for reasonable arrangements relating to the health and arb

-----------------------------------------------------------------------------------------------
68
SOURCE:
dies geschah durch ereignisse und politischen druck ; eine anerkennung der verantwortlichkeiten , die wir in mittel - und osteuropa wahrnehmen müssen - eine anerkennung , die auf die art und weise zurückgeht , in der wir unsere aufgaben im mittelmeerraum erfüllt haben .

REFERENCE:
that has been driven by events and by political pressures ; a recognition of the responsibilities we should discharge in central and eastern europe - the recognition which that triggered of our responsibilities in the mediterranean .

TRANSLATION:
this was done by events and political pressure ; a recognition of responsibilities which we in central and eastern europe must - a recognition of the way part , in the we our tasks in mediterranean comply .

COST: -157.918056898

-----------------------------------------------------------------------------------------------
69
SOURCE:
es lohnt festzuhalten , d


-----------------------------------------------------------------------------------------------
152
SOURCE:
wenn herr patten in seiner einführung sagt , entweder wird das programm der kommission angenommen , oder wir werden zu kürzungen kommen müssen , dann finde ich das schon sehr befremdlich !

REFERENCE:
when mr patten says in his presentation that either the commission programme is accepted or we must make cuts , i find that highly displeasing .

TRANSLATION:
if mr patten in his introduction says , either that the commission programme approved or we will come to cuts must , then i think already very involving !

COST: -122.166191812

-----------------------------------------------------------------------------------------------
153
SOURCE:
wir sind nämlich hier die haushaltsbehörde , zumindest ein teil davon , und ich denke , da kann die kommission nicht einfach sagen , wir werden kürzen oder wir werden kürzen müssen !

REFERENCE:
it is we who are the budgetary authority , or at l

COST: -112.07633579

-----------------------------------------------------------------------------------------------
278
SOURCE:
die santa mafalda traf am 10 . mai 2000 im hafen von aveiro ein , wo an der gründlichen inspektion des schiffes zwei inspektoren der kommission teilnahmen .

REFERENCE:
the santa mafalda arrived in aveiro on 10 may 2000 and two commission inspectors participated in a thorough inspection of the vessel .

TRANSLATION:
the santa mafalda took on 10 may 2000 . in port of aveiro a where the substantial inspection carried out of schiffes two inspectors the commission was .

COST: -161.478336068

-----------------------------------------------------------------------------------------------
279
SOURCE:
die kommission hat die kanadischen behörden auf die vorgehensweise der kanadischen inspektoren bei der beschlagnahme der unterlagen und während ihres fortgesetzten aufenthalts an bord der santa mafalda aufmerksam gemacht .

REFERENCE:
the commission has drawn the atten

399
SOURCE:
so hat es der französische präsident dargelegt , und das wird natürlich ein diskussionspunkt für die nächste vorsitzperiode sein .

REFERENCE:
the french president has highlighted this and it will obviously be a point for debate during the next presidency .

TRANSLATION:
it has the french president , and the will of course a ongoing for the next presidency .

COST: -76.7707698184

-----------------------------------------------------------------------------------------------
400
SOURCE:
ich stimme ihrer these nicht zu , daß die entwicklung des euro einen rückgang der beschäftigung in europa und einen zuwachs an arbeitsplätzen in den vereinigten staaten bedeutet .

REFERENCE:
i do not agree with the theory that the euro 's development is leading to a fall in employment in europe and an increase in employment in the usa .

TRANSLATION:
i agree their that not that the development of the euro a reduction of employment in europe and a increase in jobs in the united states .

COS