In [55]:
import numpy
import time
import pickle
import operator
from __future__ import print_function
from IPython.display import display
import matplotlib.pyplot as plt
import plotly.plotly as py
%matplotlib inline  

In [1]:
def load_file(filepath):
    """Load a text file as a list of lines, one phrase per line.

    The line terminator of each line is kept in the returned strings.

    Args:
        filepath (str): path of the file to read
    Returns:
        list(str): the lines of the file, in file order
    """
    with open(filepath, 'r') as handle:
        # Iterating a text file yields exactly the lines readlines() would return.
        return list(handle)

def is_number(s):
    """Tell whether `s` can be converted with float().

    Returns:
        bool: True when float(s) succeeds, False when it raises ValueError.
    """
    try:
        float(s)
    except ValueError:
        return False
    else:
        return True


In [44]:
class Decoder:
    """Decoder scaffold: loads three model files and parses decoder traces.

    Attributes:
        trg_LM (dict): n-gram string -> [p1] or [p1, p2], built by parse_lm_file.
        TM (dict): (f, e) -> [p_fe, l_fe, p_ef, l_ef, wp], built by parse_tm_file.
        RM (dict): (f, e) -> [m_rl, s_rl, d_rl, m_lr, s_lr, d_lr], built by parse_rm_file.
        distortion_limit (int): set to 3; not read by any method yet.
    """

    def __init__(self, trg_lm_file, tm_file, rm_file):
        """Parse the three model files.

        Args:
            trg_lm_file (str): path of the target language model file
            tm_file (str): path of the translation model file
            rm_file (str): path of the reordering model file
        """
        self.trg_LM = self.parse_lm_file(trg_lm_file)
        self.TM = self.parse_tm_file(tm_file)
        self.RM = self.parse_rm_file(rm_file)

        self.distortion_limit = 3

    def parse_lm_file(self, filepath):
        """Read a language model file into a dict keyed by the n-gram string.

        Only lines whose first whitespace-separated token is a number are used.
        The first token becomes p1; a trailing numeric token, when present,
        becomes p2 and is removed from the n-gram.

        Args:
            filepath (str): path of the language model file
        Returns:
            dict: " ".join(words) -> [p1] or [p1, p2]
        """
        LM = {}

        for line in load_file(filepath):
            tokens = line.split()
            # Blank lines and lines that do not start with a number are skipped.
            if not tokens or not is_number(tokens[0]):
                continue

            words = tokens[1:]
            if not words:
                # A lone number has no n-gram attached; indexing words[-1] would fail.
                continue

            p1 = float(tokens[0])

            # Require at least one remaining word, so a one-token numeric n-gram
            # (e.g. "-1.5 21") is kept as the n-gram "21" instead of being stored
            # under the empty key with 21 as p2.
            # NOTE(review): a longer n-gram whose last word is numeric is still
            # read as having a trailing p2 -- confirm against the file format.
            if len(words) > 1 and is_number(words[-1]):
                LM[" ".join(words[:-1])] = [p1, float(words[-1])]
            else:
                LM[" ".join(words)] = [p1]

        return LM

    def _parse_scored_pairs(self, filepath, num_scores):
        """Read '|||'-separated lines into {(f, e): first `num_scores` floats of field 3}."""
        table = {}

        for line in load_file(filepath):
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing empty line in the file).
                continue

            fields = line.split("|||")
            f = fields[0].strip()
            e = fields[1].strip()
            values = fields[2].split()

            # Index explicitly so a row with too few values still raises IndexError.
            table[(f, e)] = [float(values[i]) for i in range(num_scores)]

        return table

    def parse_tm_file(self, filepath):
        """Read the translation model file.

        Each line is 'f ||| e ||| v0 v1 v2 v3 v4 ...'; values after the fifth are ignored.

        Args:
            filepath (str): path of the translation model file
        Returns:
            dict: (f, e) -> [p_fe, l_fe, p_ef, l_ef, wp]
        """
        return self._parse_scored_pairs(filepath, 5)

    def parse_rm_file(self, filepath):
        """Read the reordering model file.

        Each line is 'f ||| e ||| v0 ... v5 ...'; values after the sixth are ignored.

        Args:
            filepath (str): path of the reordering model file
        Returns:
            dict: (f, e) -> [m_rl, s_rl, d_rl, m_lr, s_lr, d_lr]
        """
        return self._parse_scored_pairs(filepath, 6)

    def parse_trace(self, trace_line):
        """Split one trace line into [start, end, translation] entries.

        The line holds segments separated by '|||', each of the form
        'start-end:translation'. Only the first ':' separates the positions from
        the translation, so the translation itself may contain ':'.

        Args:
            trace_line (str): one line of the trace file
        Returns:
            list: one [int start, int end, str translation] per segment
        """
        trace = []
        for token in trace_line.split("|||"):
            if not token.strip():
                # Skip empty segments (blank line or trailing separator).
                continue

            positions, translation = token.split(':', 1)
            bounds = [int(pos) for pos in positions.split("-")]

            trace.append([bounds[0], bounds[1], translation.strip()])

        return trace

    def translation_cost(self, src_phrase, trace):
        """Pair every source span named in `trace` with its translation.

        Scoring is not implemented yet: `translation` is always [] and `cost` is 0.

        Args:
            src_phrase (str): the source sentence, words separated by whitespace
            trace (str): the trace line for that sentence (see parse_trace)
        Returns:
            tuple: (translation, cost, pairs) where pairs is a list of
            (source phrase, target phrase) tuples in trace order
        """
        segments = self.parse_trace(trace)
        src_words = src_phrase.split()

        # Slice ends are exclusive, hence the + 1 on the end position.
        src_phrases = [" ".join(src_words[seg[0]:seg[1] + 1]) for seg in segments]
        trg_phrases = [seg[2] for seg in segments]

        # TODO: seed the search with an initial State once scoring is implemented.
        cost = 0
        translation = []

        # Materialise the pairs so callers get a list on Python 2 and Python 3 alike.
        return translation, cost, list(zip(src_phrases, trg_phrases))

    def explore(self, state, source_phrase, trace):
        """Expand `state` into its successor states (not implemented yet).

        Currently only reads state.coverage_vector and always returns an empty
        list; `source_phrase` and `trace` are unused.

        Returns:
            list: the successor states (always empty for now)
        """
        next_states = []

        coverage_vector = state.coverage_vector

        # Positions whose coverage entry is 0; computed but not used yet.
        untranslated_words = [pos for pos, covered in enumerate(coverage_vector) if covered == 0]

        return next_states
        
    
    

In [45]:

    
class State:
    """One search state; every instance receives a unique, increasing id.

    Attributes:
        id (int): 1-based creation index, taken from the class-wide counter.
        previous_state: the state this one was created from, as passed in.
        source_phrase: the constructor argument, stored unchanged.
        translated_phrase: the constructor argument, stored unchanged.
        recombined_states, history, prob, coverage_vector, next_states (list):
            created empty, one fresh list per instance.
    """

    # Class-wide counter shared by all instances; incremented on every construction.
    id_counter = 0

    def __init__(self, previous_state, source_phrase, translated_phrase):
        State.id_counter += 1
        self.id = State.id_counter

        self.previous_state = previous_state

        # Keep the constructor arguments; they were previously accepted but discarded.
        self.source_phrase = source_phrase
        self.translated_phrase = translated_phrase

        self.recombined_states = []
        self.history = []
        self.prob = []
        self.coverage_vector = []
        self.next_states = []
        
        
        

In [46]:
# Test inputs: one entry per line of each file (line terminators are kept by load_file).
src_file = load_file(filepath='data/file.test.de')
trg_file = load_file(filepath='data/file.test.en')
traces = load_file(filepath='data/testresults.trans.txt.trace')

# Build the decoder from the language model, translation model and reordering model files.
decoder = Decoder(
    trg_lm_file='data/file.en.lm',
    tm_file='data/phrase-table',
    rm_file='data/dm_fe_0.75',
)

In [53]:
def translation_cost(src_phrase, trace):
    """Pair every source span named in `trace` with its translation.

    Uses the module-level `decoder` to parse the trace line. Scoring is not
    implemented yet: `translation` is always [] and `cost` is always 0.

    Args:
        src_phrase (str): the source sentence, words separated by whitespace
        trace (str): the trace line for that sentence
    Returns:
        tuple: (translation, cost, pairs) where pairs is a list of
        (source phrase, target phrase) tuples in trace order
    """
    segments = decoder.parse_trace(trace)
    src_words = src_phrase.split()

    # Each segment is [start, end, translation]; slice ends are exclusive, hence + 1.
    # A separate name is used so the `src_phrase` argument is no longer rebound in the loop.
    src_phrases = [" ".join(src_words[seg[0]:seg[1] + 1]) for seg in segments]
    trg_phrases = [seg[2] for seg in segments]

    cost = 0
    translation = []

    # Materialise the pairs so printing shows them on Python 3 as well as Python 2.
    return translation, cost, list(zip(src_phrases, trg_phrases))


# Show the phrase pairs of the first (at most) ten sentences.
# The bound follows the shorter input so a short file cannot raise IndexError.
for i in range(min(10, len(src_file), len(traces))):
    translation, cost, s = translation_cost(src_file[i], traces[i])
    # list() keeps the output readable if the pairs arrive as a lazy zip object.
    print(list(s))
    # With print_function imported, a bare `print` is a no-op expression;
    # print('') emits the intended blank line.
    print('')
    print('')

[('die', 'the'), ('arbeitsbedingungen', 'working conditions'), ('f\xc3\xbcr \xc3\xa4rzte in der ausbildung', 'for junior doctors'), ('gleichen', 'same'), ('einer', 'a'), ('horrorgeschichte', 'horrorgeschichte'), ('aus einem', 'from a'), ('dickens', 'dickens'), ('-', '-'), ('roman', 'a novel'), ('des', 'of'), ('neunzehnten', 'neunzehnten'), ('jahrhunderts .', 'century .')]


[('es ist schon', 'it is'), ('recht', 'right'), ('merkw\xc3\xbcrdig , da\xc3\x9f wir', 'curious that we'), ('an der schwelle', 'entering the'), ('des 21', '21st'), ('. jahrhunderts', 'century'), ('noch immer', 'still'), ('f\xc3\xbcr', 'for'), ('vern\xc3\xbcnftige', 'reasonable'), ('regelungen in bezug auf', 'arrangements relating to'), ('den', 'the'), ('gesundheits - und', 'health and'), ('arbeitsschutz', 'arbeitsschutz'), ('k\xc3\xa4mpfen', 'struggles'), ('m\xc3\xbcssen .', '.')]


[('ich appelliere an', 'i urge'), ('alle regierungen', 'all governments'), ('einschlie\xc3\x9flich der', 'including'), ('irischen', 'ir

In [36]:
# Scratch check: a slice stops before its end index, so a[0:1] holds only the first element.
a = [1,2,3,4]

a[0:1]

[1]

In [42]:
# NOTE(review): this stores the attribute looked up on the class directly on the
# instance, shadowing the normal method lookup. An attribute stored on an instance
# is not bound on access, so `decoder` is presumably no longer passed implicitly
# as `self` when calling decoder.translation_cost(...) -- confirm this is intended.
decoder.translation_cost = Decoder.translation_cost

In [73]:
def test(l):
    """Return the indices (along the first axis) where `l` equals 0, as a numpy array.

    `l` is converted to a numpy array on every call, so timing this function
    includes the conversion.
    """
    x = numpy.array(l)
    return numpy.where(x == 0)[0]
    
    
def test2(l):
    """Return the positions of the entries of `l` that equal 0, as a plain Python list."""
    # Index-based alternative, kept for reference:
    #     return [i for i in range(len(l)) if l[i] == 0]
    return [i for i,v in enumerate(l) if v == 0]



def test3(x):
    """Return the indices (along the first axis) where `x` equals 0.

    Unlike `test`, no conversion is done here: `x` is compared elementwise as
    given (the timing cell passes an already-built numpy array).
    """
    return numpy.where(x == 0)[0]
    
# Sample input, once as a Python list and once as a pre-built numpy array.
l = [1,0,2,0,3,0,4,5,0,0,8]
l_ar = numpy.array(l)

# Time the three variants. test3 receives the pre-built array, so unlike test
# its timing does not include the list-to-array conversion.
%timeit test(l)
%timeit test2(l)
%timeit test3(l_ar)

# Print the result of each variant so they can be compared by eye.
print(test(l))
print(test2(l))
print(test3(l_ar))



The slowest run took 14.63 times longer than the fastest. This could mean that an intermediate result is being cached.
100000 loops, best of 3: 2.7 µs per loop
1000000 loops, best of 3: 983 ns per loop
The slowest run took 11.00 times longer than the fastest. This could mean that an intermediate result is being cached.
1000000 loops, best of 3: 1.36 µs per loop
[1 3 5 8 9]
[1, 3, 5, 8, 9]
[1 3 5 8 9]
