In [2]:
import numpy
import time
import pickle
import operator
from __future__ import print_function
from IPython.display import display
import matplotlib.pyplot as plt
import plotly.plotly as py
%matplotlib inline  

In [11]:
def load_file(filepath):
    """Read a text file and return its lines as a list.

    The file is expected to hold one phrase per line. Lines are returned
    exactly as read, so each entry keeps its trailing newline (if it had one).

    Args:
        filepath (str): path of the file to read
    Returns:
        list(str): one entry per line, in file order
    """
    with open(filepath, 'r') as handle:
        # Iterating a text file yields the same items as readlines().
        return list(handle)

def is_number(s):
    """Tell whether `s` can be converted with float().

    Args:
        s: the value to probe (the callers in this file pass string tokens)
    Returns:
        bool: True if float(s) succeeds, False if it raises ValueError
    """
    try:
        float(s)
    except ValueError:
        return False
    else:
        return True


def parse_lm_file(filepath):
    """Parse a language-model file into a dictionary keyed by n-gram.

    Only lines whose first whitespace-separated token is a number are treated
    as entries, laid out as ``<p1> <word> ... <word> [<p2>]``. Every other
    line (blank lines, headers) is skipped.

    If the file contains ARPA-style section markers (``\\N-grams:``), the
    announced order N is used to decide whether the last token is the optional
    trailing weight or an ordinary word. This keeps n-grams that end in a
    numeric word (e.g. "in 1999") intact instead of swallowing the word as a
    weight. Without a marker, a numeric last token is taken to be the weight,
    provided at least one word remains for the n-gram.

    Args:
        filepath (str): path of the language-model file
    Returns:
        dict: maps the space-joined n-gram to ``[p1, p2]`` when a trailing
        weight is present, otherwise to ``[p1]``
    """
    LM = {}
    order = None  # n-gram order of the current section, None while unknown

    for line in load_file(filepath):
        tokens = line.split()
        if not tokens:
            continue

        head = tokens[0]

        if head.startswith("\\"):
            # Section marker: "\3-grams:" announces the order of the entries
            # that follow; any other marker leaves the order unknown.
            digits = head[1:-len("-grams:")] if head.endswith("-grams:") else ""
            order = int(digits) if digits.isdigit() else None
            continue

        if not is_number(head):
            continue

        words = tokens[1:]
        if not words:
            # A lone number carries no n-gram; indexing words[-1] would raise.
            continue

        p1 = float(head)

        # The last token is a weight only if it is numeric, at least one word
        # is left for the n-gram, and the section order does not say that all
        # tokens are words.
        fills_order = order is not None and len(words) == order
        has_weight = (len(words) > 1
                      and not fills_order
                      and is_number(words[-1]))

        if has_weight:
            p2 = float(words[-1])
            LM[" ".join(words[:-1])] = [p1, p2]
        else:
            LM[" ".join(words)] = [p1]

    return LM
                     
        
        

def parse_tm_file(filepath):
    """Parse a phrase-table file into a dictionary keyed by phrase pair.

    Every non-blank line is expected to look like
    ``f ||| e ||| s0 s1 s2 s3 s4 ...``: two phrases followed by at least five
    whitespace-separated scores. Any further ``|||`` fields and any scores
    beyond the fifth are ignored. Blank or whitespace-only lines are skipped
    instead of raising.

    Args:
        filepath (str): path of the phrase-table file
    Returns:
        dict: maps the tuple ``(f, e)`` to ``[p_fe, l_fe, p_ef, l_ef, wp]``
        (the first five scores, as floats)
    Raises:
        IndexError: if a non-blank line has fewer than three ``|||`` fields
            or fewer than five scores
        ValueError: if one of the first five scores is not a valid float
    """
    TM = {}

    for line in load_file(filepath):
        if not line.strip():
            # Nothing to parse; indexing the split fields would raise.
            continue

        fields = line.split("|||")

        f = fields[0].strip()
        e = fields[1].strip()
        scores = fields[2].split()

        # Explicit indexing keeps the IndexError on lines with too few scores.
        # Order: p_fe, l_fe, p_ef, l_ef, wp
        TM[(f, e)] = [float(scores[i]) for i in range(5)]

    return TM



def parse_reordering_file(filepath):
    """Parse a reordering-model file into a dictionary keyed by phrase pair.

    Every non-blank line is expected to look like
    ``f ||| e ||| v0 v1 v2 v3 v4 v5 ...``: two phrases followed by at least
    six whitespace-separated values. Any further ``|||`` fields and any values
    beyond the sixth are ignored. Blank or whitespace-only lines are skipped
    instead of raising.

    Args:
        filepath (str): path of the reordering-model file
    Returns:
        dict: maps the tuple ``(f, e)`` to
        ``[m_rl, s_rl, d_rl, m_lr, s_lr, d_lr]`` (the first six values, as
        floats)
    Raises:
        IndexError: if a non-blank line has fewer than three ``|||`` fields
            or fewer than six values
        ValueError: if one of the first six values is not a valid float
    """
    # NOTE(review): m/s/d presumably stand for monotone/swap/discontinuous and
    # rl/lr for the two orientations -- confirm against the model's producer.
    reorderings = {}

    for line in load_file(filepath):
        if not line.strip():
            # Nothing to parse; indexing the split fields would raise.
            continue

        fields = line.split("|||")

        f = fields[0].strip()
        e = fields[1].strip()
        values = fields[2].split()

        # Explicit indexing keeps the IndexError on lines with too few values.
        # Order: m_rl, s_rl, d_rl, m_lr, s_lr, d_lr
        reorderings[(f, e)] = [float(values[i]) for i in range(6)]

    return reorderings







# Test corpora as lists of raw lines (load_file keeps each trailing newline).
# NOTE(review): the .de/.en suffixes suggest German source and English
# target text -- confirm.
src_file = load_file('data/file.test.de')
trg_file = load_file('data/file.test.en') 

# Model files parsed into dictionaries:
#   trg_lm: n-gram string -> [p1] or [p1, p2]
#   tm:     (f, e) tuple  -> five floats
#   rm:     (f, e) tuple  -> six floats
trg_lm = parse_lm_file('data/file.en.lm')
tm = parse_tm_file('data/phrase-table')
rm = parse_reordering_file('data/dm_fe_0.75')

# Raw lines of the trace file; left unparsed here.
test_results_trace_file = load_file('data/testresults.trans.txt.trace')


In [15]:
# Preview the first ten reordering entries.
# list(rm) can be sliced whether keys() returns a list or a view, and print is
# called as a function because the first cell imports print_function.
for key in list(rm)[:10]:
    print(key, rm[key])

('unternehmen , die wagniskapital f\xc3\xbcr die', 'from the risks of putting capital into') [0.30874, 0.00234, 0.68893, 0.90472, 0.00483, 0.09045]
(', verlassen', ',') [0.79583, 0.01035, 0.19382, 0.90472, 0.00483, 0.09045]
('hat vielleicht schon ein', 'confidence already') [0.0683, 0.00315, 0.92855, 0.0683, 0.00315, 0.92855]
('sein , d.h. er mu\xc3\x9f', ', regulated in such a way as') [0.93786, 0.00315, 0.05899, 0.05818, 0.00268, 0.93914]
('wir normalerweise um viertel', ', and we therefore') [0.03653, 0.00168, 0.96178, 0.2244, 0.01035, 0.76525]
('da\xc3\x9f die weitere beseitigung der " hindernisse', 'that fewer obstacles') [0.9539, 0.00234, 0.04377, 0.21541, 0.00115, 0.78344]
('der die europ\xc3\xa4ische union', 'which the european union') [0.92478, 0.00381, 0.07141, 0.87008, 0.00658, 0.12334]
('davor , den mut zu verlieren', 'since matters are improving') [0.10472, 0.80483, 0.09045, 0.10472, 0.00483, 0.89045]
('auf bosnier', 'on bosnians') [0.9539, 0.00234, 0.04377, 0.87008, 0.006