In [22]:
import numpy as np
import time
import pickle
import operator
from __future__ import print_function
from IPython.display import display
import matplotlib.pyplot as plt
import plotly.plotly as py
%matplotlib inline  

In [16]:
def load_file(filepath):
    """Read a text file and return its lines as a list.

    Each returned entry is one line of the file exactly as read, i.e. the
    trailing newline character (when present) is kept.

    Args:
        filepath (str): path of the file to read.
    Returns:
        list(str): the lines of the file, in file order.
    """
    with open(filepath, 'r') as handle:
        # Iterating a file object yields the same lines as readlines().
        return list(handle)

def is_number(s):
    """Tell whether `s` can be converted with float().

    Args:
        s: the value to probe (strings are what callers in this file pass).
    Returns:
        bool: True when float(s) succeeds, False when it raises ValueError.
    """
    try:
        float(s)
    except ValueError:
        return False
    else:
        return True


In [17]:
class Decoder(object):
    """Container for the model tables used to score phrase-based translations.

    Attributes:
        trg_LM (dict): n-gram string -> [score] or [score, trailing_number],
            as produced by parse_lm_file.
        TM (dict): (f, e) phrase pair -> [p_fe, l_fe, p_ef, l_ef, wp].
        RM (dict): (f, e) phrase pair -> [m_rl, s_rl, d_rl, m_lr, s_lr, d_lr].
        distortion_limit (int): stored for callers; not read inside this class.
    """

    def __init__(self, trg_lm_file, tm_file, rm_file, distortion_limit=3):
        """Load the three model files.

        Args:
            trg_lm_file (str): path of the target language-model file.
            tm_file (str): path of the translation-model (phrase table) file.
            rm_file (str): path of the reordering-model file.
            distortion_limit (int): value stored on the instance (default 3,
                the value that used to be hard-coded).
        """
        self.trg_LM = self.parse_lm_file(trg_lm_file)
        self.TM = self.parse_tm_file(tm_file)
        self.RM = self.parse_rm_file(rm_file)

        self.distortion_limit = distortion_limit

    def parse_lm_file(self, filepath):
        """Parse a language-model file into a dict keyed by n-gram string.

        Only lines whose first whitespace-separated token is a number are
        used; every other line (blank lines, headers, ...) is ignored.

        Args:
            filepath (str): path of the language-model file.
        Returns:
            dict: "w1 w2 ..." -> [p1] or [p1, p2], where p1 is the leading
            number and p2 a trailing number following the words (presumably
            the ARPA back-off weight -- confirm against the file format).
        """
        LM = {}

        for line in load_file(filepath):
            tokens = line.split()

            # A usable entry needs a leading score AND at least one word;
            # a lone number used to raise IndexError on s[-1].
            if len(tokens) < 2 or not is_number(tokens[0]):
                continue

            p1 = float(tokens[0])
            words = tokens[1:]

            # Only treat the last token as a second score when a word remains
            # in front of it; otherwise a numeric unigram would be stored
            # under the empty-string key.
            # NOTE(review): an n-gram of 2+ words that ends in a numeric word
            # and has no trailing score is still indistinguishable here.
            if len(words) > 1 and is_number(words[-1]):
                p2 = float(words.pop())
                LM[" ".join(words)] = [p1, p2]
            else:
                LM[" ".join(words)] = [p1]

        return LM

    def _parse_pair_table(self, filepath, n_scores):
        """Parse a 'f ||| e ||| s1 s2 ...' file into {(f, e): [s1..sN]}.

        Blank lines are skipped. The first `n_scores` whitespace-separated
        numbers of the third field are kept, in order.
        """
        table = {}

        for line in load_file(filepath):
            if not line.strip():
                # e.g. a trailing empty line at the end of the file
                continue

            fields = line.split("|||")

            f = fields[0].strip()
            e = fields[1].strip()
            scores = fields[2].split()

            table[(f, e)] = [float(scores[k]) for k in range(n_scores)]

        return table

    def parse_tm_file(self, filepath):
        """Parse the translation-model file.

        Args:
            filepath (str): path of the phrase-table file; each line looks
                like 'f ||| e ||| p_fe l_fe p_ef l_ef wp'.
        Returns:
            dict: (f, e) -> [p_fe, l_fe, p_ef, l_ef, wp] as floats.
        """
        return self._parse_pair_table(filepath, 5)

    def parse_rm_file(self, filepath):
        """Parse the reordering-model file.

        Args:
            filepath (str): path of the reordering file; each line looks
                like 'f ||| e ||| m_rl s_rl d_rl m_lr s_lr d_lr'.
        Returns:
            dict: (f, e) -> [m_rl, s_rl, d_rl, m_lr, s_lr, d_lr] as floats.
        """
        return self._parse_pair_table(filepath, 6)

    def parse_trace(self, trace_line):
        """Parse one decoder trace line.

        A trace line is a '|||'-separated list of 'start-end:translation'
        segments. Only the first ':' separates positions from text, so the
        translation itself may contain ':'.

        Args:
            trace_line (str): one line of the trace file.
        Returns:
            list: [start, end, translation] per segment, in line order.
            Empty/whitespace-only segments (blank line, trailing '|||') are
            skipped instead of raising.
        """
        trace = []

        for segment in trace_line.split("|||"):
            if not segment.strip():
                continue

            src_positions, translation = segment.split(':', 1)
            src_positions = [int(pos) for pos in src_positions.split("-")]

            trace.append([src_positions[0], src_positions[1], translation.strip()])

        return trace

    def get_translation_cost(self, src_phrase, trace):
        """Unfinished method kept for interface compatibility.

        The body used to end with `return cost` although `cost` was never
        assigned, so it raised NameError on a fresh kernel or returned an
        unrelated module-level `cost` left over from another cell. Fail
        explicitly instead; scoring is done by the module-level
        get_translation_cost(src_phrase, trace, decoder) function.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError(
            "Decoder.get_translation_cost was never implemented; use the "
            "module-level get_translation_cost(src_phrase, trace, decoder)")
    
    
   
    

In [13]:
# Test data, one entry per line of each file: source-side sentences,
# target-side sentences and the decoder traces.
src_file = load_file('data/file.test.de')
trg_file = load_file('data/file.test.en')
traces = load_file('data/testresults.trans.txt.trace')

# Model tables: language model, phrase table and reordering model.
decoder = Decoder(
    trg_lm_file='data/file.en.lm',
    tm_file='data/phrase-table',
    rm_file='data/dm_fe_0.75',
)

In [43]:
def get_lm_cost(phrase, hist, LM):
    """Language-model cost of `phrase` given its history (placeholder).

    Args:
        phrase (str): target phrase being scored.
        hist (str): space-joined preceding target words (may start with '<s>').
        LM (dict): language-model table (e.g. Decoder.trg_LM).
    Returns:
        int: always 0 -- the language-model scoring is not implemented yet.
    """
    lm_cost = 0
    return lm_cost


def _tm_cost(pair, TM, miss_cost):
    """Translation-model cost of one (source, target) phrase pair.

    Sums log10 of the two phrase probabilities, log10 of the two lexical
    weights and the raw word-penalty entry (all feature weights are 1.0).
    Pairs missing from `TM` cost `miss_cost`.
    """
    if pair not in TM:
        return miss_cost

    p_fe, lex_fe, p_ef, lex_ef, wp = TM[pair]
    phrase_part = np.log10(p_fe) + np.log10(p_ef)
    lexical_part = np.log10(lex_fe) + np.log10(lex_ef)
    return phrase_part + lexical_part + wp


def _target_history(segments, i, hist_size):
    """Space-joined target words preceding segment `i`, at most `hist_size`.

    Walks the earlier segments right to left. When fewer than `hist_size`
    words exist, the sentence-start marker '<s>' is prepended.
    """
    hist = []
    for j in range(i - 1, -1, -1):
        for word in reversed(segments[j][2].split()):
            hist.insert(0, word)
            if len(hist) == hist_size:
                return " ".join(hist)

    hist.insert(0, '<s>')
    return " ".join(hist)


def _reordering_cost(segments, i, pair, RM):
    """Reordering-table entry for segment `i` relative to segment `i-1`.

    Picks index 3 when segment `i` starts right after the previous one ends,
    index 4 when it ends right before the previous one starts, index 5
    otherwise. Returns 0 for the first segment and for pairs missing from
    `RM` (the unguarded lookup used to raise KeyError).
    """
    if i == 0 or pair not in RM:
        return 0

    scores = RM[pair]
    prev_start, prev_end = segments[i - 1][0], segments[i - 1][1]
    cur_start, cur_end = segments[i][0], segments[i][1]

    if cur_start == prev_end + 1:
        return scores[3]
    if cur_end == prev_start - 1:
        return scores[4]
    return scores[5]


def get_translation_cost(src_phrase, trace, decoder,
                         miss_cost=None, phrase_penalty=-1, hist_size=4):
    """Score one translation from its decoder trace.

    For every trace segment the cost is
    translation-model cost + language-model cost + phrase penalty;
    the per-segment costs are summed.

    Args:
        src_phrase (str): the source sentence (whitespace-tokenised).
        trace (str): the raw trace line, parsed with decoder.parse_trace.
        decoder: object exposing parse_trace, TM, RM and trg_LM.
        miss_cost (float): cost of a phrase pair absent from decoder.TM;
            defaults to log10(0.001).
        phrase_penalty (float): constant added per segment (default -1).
        hist_size (int): maximum number of history words handed to the
            language model (default 4).
    Returns:
        The summed cost (0 when the trace has no segments).
    """
    if miss_cost is None:
        miss_cost = np.log10(0.001)

    segments = decoder.parse_trace(trace)
    src_words = src_phrase.split()

    total_cost = 0

    for i, (start, end, target) in enumerate(segments):
        # Source span covered by this segment (end index is inclusive).
        pair = (" ".join(src_words[start:end + 1]), target)

        tm_cost = _tm_cost(pair, decoder.TM, miss_cost)

        # get_lm_cost takes the LM table as third argument; it used to be
        # called with two arguments, which raised TypeError.
        trg_hist = _target_history(segments, i, hist_size)
        lm_cost = get_lm_cost(target, trg_hist, decoder.trg_LM)

        # Reordering penalty.
        # NOTE(review): rm_cost is computed but, as before, NOT added to the
        # total -- confirm whether the table values need log10 and which
        # weight to use before folding it in.
        rm_cost = _reordering_cost(segments, i, pair, decoder.RM)

        total_cost += tm_cost + lm_cost + phrase_penalty

    return total_cost


# Score the first (up to) 10 test sentences. The range is bounded by the data
# actually loaded so a short file does not raise IndexError.
for i in range(min(10, len(src_file), len(traces))):
    cost = get_translation_cost(src_file[i], traces[i], decoder)
    print(i, cost)
    # With print_function in effect a bare `print` is a no-op expression;
    # call it to actually emit the two separator lines.
    print()
    print()

['the', 'working', 'conditions', 'for', 'junior', 'doctors', 'same', 'a', 'horrorgeschichte', 'from', 'a', 'dickens', '-', 'a', 'novel', 'of', 'neunzehnten', 'century', '.']
0 -8.56695973065
['it', 'is', 'right', 'curious', 'that', 'we', 'entering', 'the', '21st', 'century', 'still', 'for', 'reasonable', 'arrangements', 'relating', 'to', 'the', 'health', 'and', 'arbeitsschutz', 'struggles', '.']
1 -34.8659799912
['i', 'urge', 'all', 'governments', 'including', 'irish', ',', 'the', 'zusatzbestimmungen', 'the', 'renewal', 'of', 'neunjahreszeitraums', 'to', 'implement', 'the', 'directive', 'in', 'claim', 'and', 'these', 'provisions', 'as', 'derogation', 'intended', 'are', 'only', 'then', 'be', 'applied', 'if', 'everything', 'for', 'implementing', 'the', 'directive', 'was', 'done', '.']
2 -60.3458553548
['mr', 'president', ',', 'the', 'draft', 'directive', 'on', 'the', 'working', 'time', 'for', 'certain', 'besch\xc3\xa4ftigtenkategorien', 'as', 'junior', 'doctors', ',', 'offshore', 'worker

In [15]:
# For every trace, report the first position whose successor does not start
# right after it ends (i.e. the first non-monotone step), then move on.
for trace_line in traces:
    segments = decoder.parse_trace(trace_line)

    for idx, (current, following) in enumerate(zip(segments, segments[1:])):
        if following[0] != current[1] + 1:
            print(idx, segments)
            break
        
     

6 [[0, 1, 'it'], [2, 3, 'to a'], [4, 4, 'rapid'], [5, 5, 'decision'], [6, 7, 'and a'], [8, 8, 'clear'], [9, 9, 'arbeitszeitreduzierung'], [11, 11, 'must'], [10, 10, 'lead'], [12, 12, '.']]
4 [[0, 2, 'we are not'], [3, 3, 'very'], [4, 5, 'proud to'], [6, 7, 'the outcome'], [8, 11, ', but i would'], [13, 13, 'nevertheless'], [14, 14, 'welcome'], [12, 12, 'it'], [15, 17, 'because we'], [18, 19, 'simply'], [20, 21, 'the end'], [22, 22, 'come'], [23, 24, '.']]
8 [[0, 4, 'i feel i must'], [5, 5, '-'], [6, 6, 'if'], [7, 10, 'i am not mistaken'], [11, 11, '-'], [12, 12, 'a'], [13, 13, 'comment'], [14, 15, 'correct ,'], [16, 18, 'mrs smet'], [20, 21, 'has made'], [19, 19, 'earlier'], [22, 22, ':'], [23, 26, 'it is not the'], [27, 28, 'first successful'], [29, 29, 'conciliation'], [30, 33, 'between the european parliament'], [34, 37, 'and the commission in'], [38, 39, 'social area ,'], [40, 40, 'but'], [41, 42, 'the second'], [43, 45, ', because'], [46, 47, 'the less'], [48, 48, 'important'], [4

In [8]:
# log10(0) evaluates to -inf and emits a divide-by-zero RuntimeWarning.
# The notebook only imports numpy under the alias `np`.
np.log10(0)


divide by zero encountered in log10



-inf

In [38]:
# log10 of the 0.001 probability used for phrase pairs missing from the
# translation table.
np.log10(1e-3)

-3.0

In [53]:
# Prepending to a list: slice assignment at position 0 is equivalent to
# insert(0, value).
a = [1, 2]
a[:0] = [123]
a

[123, 1, 2]