In [1]:
from __future__ import print_function

import itertools
import pickle
import time

import numpy

In [2]:

def load_file(filepath):
    """Read the text file at `filepath` and return its lines as a list.

    Lines are returned exactly as `readlines()` yields them, i.e. with their
    trailing newline characters still attached.
    """
    with open(filepath, 'r') as handle:
        lines = handle.readlines()
    return lines

# Parallel corpus, one sentence per line: the source side (.de), the target
# side (.en) and the word alignments of each sentence pair (.aligned).
src_file, trg_file, aligned_file = map(
    load_file,
    ('data/file.de', 'data/file.en', 'data/file.aligned'),
)

In [42]:
def extract(trg_start, trg_end, src_start, src_end, alignments, trg_aligned, trg_len,
            src_words=None, trg_words=None):
    """Extract every phrase pair that is consistent with one source span.

    The target span ``[trg_start, trg_end]`` is the minimal span covering the
    target words aligned to the source span ``[src_start, src_end]``.  The pair
    is rejected if any target word inside the target span is aligned to a
    source word outside the source span.  Otherwise the minimal phrase pair is
    emitted together with every variant obtained by extending the target side
    to the left and/or right over adjacent unaligned target words.

    Args:
        trg_start, trg_end: inclusive target span; ``trg_end < 0`` means that
            no alignment point touches the source span.
        src_start, src_end: inclusive source span.
        alignments: iterable of ``(trg_pos, src_pos)`` alignment points.
        trg_aligned: set of target positions with at least one alignment point.
        trg_len: number of words in the target sentence.
        src_words, trg_words: token lists of the sentence pair.  When omitted,
            the module-level variables of the same names are used; this keeps
            old seven-argument calls working but should be avoided in new code.

    Returns:
        ``(phrases, phrase_alignments)``: two parallel lists.  ``phrases[k]``
        is a ``(trg_phrase, src_phrase)`` tuple of space-joined strings and
        ``phrase_alignments[k]`` is the list of alignment points of that pair,
        with positions relative to the start of each phrase.  Both lists are
        empty when the span is unaligned or inconsistent.
    """
    if trg_end < 0:
        return [], []

    # Alignment points relative to the phrase start; bail out on the first
    # point that leaves the source span (inconsistent phrase pair).
    rel_alignment = []
    for trg_pos, src_pos in alignments:
        if trg_start <= trg_pos <= trg_end:
            if src_pos < src_start or src_pos > src_end:
                return [], []
            rel_alignment.append((trg_pos - trg_start, src_pos - src_start))

    # Legacy fallback: earlier versions read the sentence from module globals.
    if src_words is None:
        src_words = globals()['src_words']
    if trg_words is None:
        trg_words = globals()['trg_words']

    src_phrase = " ".join(src_words[src_start:src_end + 1])

    phrases = []
    phrase_alignments = []
    trg_s = trg_start
    while True:
        trg_e = trg_end
        while True:
            phrases.append((" ".join(trg_words[trg_s:trg_e + 1]), src_phrase))
            phrase_alignments.append(rel_alignment)

            # Grow to the right over unaligned words.  ">=" (rather than "==")
            # guarantees termination even for an out-of-range alignment index.
            trg_e += 1
            if trg_e in trg_aligned or trg_e >= trg_len:
                break

        # Grow to the left over unaligned words.
        trg_s -= 1
        if trg_s in trg_aligned or trg_s < 0:
            break

        # The phrase now starts one word earlier, so every target offset
        # shifts by one.  A new list is built; lists already stored are kept.
        rel_alignment = [(t + 1, s) for (t, s) in rel_alignment]

    return phrases, phrase_alignments


def extract_phrases(src_words, trg_words, alignments, cutoff):
    """Extract all consistent phrase pairs from one aligned sentence pair.

    Args:
        src_words: list of source tokens.
        trg_words: list of target tokens.
        alignments: list of ``(trg_pos, src_pos)`` alignment points.
        cutoff: maximum source phrase length in words; the minimal target span
            is limited to the same width.  ``-1`` means "length of the source
            sentence".

    Returns:
        ``(extracted_phrases, extracted_alignments)``: parallel lists of
        ``(trg_phrase, src_phrase)`` string tuples and of the corresponding
        phrase-internal alignment point lists.
    """
    if cutoff == -1:
        cutoff = len(src_words)

    trg_aligned = set(trg for (trg, _) in alignments)
    src_len = len(src_words)
    trg_len = len(trg_words)

    extracted_phrases = []
    extracted_alignments = []
    for src_start in range(src_len):
        for src_end in range(src_start, min(src_start + cutoff, src_len)):

            # Minimal target span covering everything aligned to the source
            # span; trg_end stays at -1 when nothing is aligned.
            trg_start = trg_len - 1
            trg_end = -1
            for (trg, src) in alignments:
                if src_start <= src <= src_end:
                    trg_start = min(trg, trg_start)
                    trg_end = max(trg, trg_end)

            if trg_end - trg_start > cutoff - 1:
                continue

            # Pass the sentence explicitly so extract() does not depend on
            # whatever module-level src_words/trg_words happen to exist.
            phrase_pairs, pair_alignments = extract(
                trg_start, trg_end, src_start, src_end,
                alignments, trg_aligned, trg_len,
                src_words=src_words, trg_words=trg_words)

            extracted_phrases.extend(phrase_pairs)
            extracted_alignments.extend(pair_alignments)

    return extracted_phrases, extracted_alignments

In [43]:
def add_freq_dict(dictionary, item):
    """Increase the count stored for `item` in `dictionary` by one, in place.

    An item that is not yet present is inserted with a count of 1.
    """
    dictionary[item] = dictionary.get(item, 0) + 1

        

# Phrase statistics accumulated over the whole corpus:
#   freq_pairs[(trg_phrase, src_phrase)] -> [count, set of alignments seen for the pair]
#   freq_trg[trg_phrase] / freq_src[src_phrase] -> occurrence count
freq_pairs = dict()
freq_src = dict()
freq_trg = dict()

# Maximum phrase length (in words) handed to extract_phrases as its cutoff.
MAX_PHRASE_LEN = 5

start_time = time.time()

# Print progress about once per percent of the corpus.  Integer division with
# a floor of 1 keeps the modulo below valid for corpora shorter than 100
# lines, where `len(src_file) / 100` is 0 under Python 2 (ZeroDivisionError).
progress_step = max(1, len(src_file) // 100)

for i in range(len(src_file)):
    src_words = src_file[i].split()
    trg_words = trg_file[i].split()
    # Every alignment token is "<first>-<second>"; the two indices are swapped
    # here so that each point is stored as (trg, src), the order in which
    # extract_phrases unpacks them.
    alignments = [[int(idx) for idx in token.split('-')] for token in aligned_file[i].split()]
    alignments = [(al[1], al[0]) for al in alignments]

    phrase_pairs, extracted_alignments = extract_phrases(src_words, trg_words, alignments, MAX_PHRASE_LEN)

    for pair, a in zip(phrase_pairs, extracted_alignments):
        # Alignments are stored as tuples so they can be members of a set.
        if pair in freq_pairs:
            freq_pairs[pair][0] += 1
            freq_pairs[pair][1].add(tuple(a))
        else:
            freq_pairs[pair] = [1, set([tuple(a)])]

        add_freq_dict(freq_trg, pair[0])
        add_freq_dict(freq_src, pair[1])

    if i % progress_step == 0:
        print(' \r%d / %d (%ds)'%(i+1,len(src_file), time.time() - start_time), end = '')

print()
print('Done!!!')
print('Total duration: %ds'%(time.time() - start_time))

print("#unique en phrases: %d"%(len(freq_trg)))
print("#unique de phrases: %d"%(len(freq_src)))
print("#unique (en,de) phrases: %d"%(len(freq_pairs)))

49501 / 50000 (56s)                                                                                               
Done!!!
Total duration: 56s
#unique en phrases: 1363321
#unique de phrases: 1264713
#unique (en,de) phrases: 2691953


In [45]:
# Look at the first 100 entries (dict iteration order) and print those phrase
# pairs that were observed with more than one distinct internal alignment.
# islice avoids building the full key list and, unlike keys()[0:100], also
# works on Python 3.
for key in itertools.islice(freq_pairs, 100):
    if len(freq_pairs[key][1]) > 1:
        print(key, freq_pairs[key])

('mr president', 'herr pr\xc3\xa4sident') [1389, set([((0, 0), (1, 0), (1, 1)), ((0, 0), (1, 1))])]


### Load/save dictionaries

In [None]:
#Save files
# with open("data/freq_src_small", 'wb') as file:
#     pickle.dump(freq_src, file)

# with open("data/freq_trg_small", 'wb') as file:
#     pickle.dump(freq_trg, file)

# with open("data/freq_pairs_small", 'wb') as file:
#     pickle.dump(freq_pairs, file)

# Replace the in-memory tables with previously pickled ones.
# NOTE: pickle.load can execute arbitrary code; only load files you created.
with open("data/joined_freq_src", 'rb') as src_fh:
    freq_src = pickle.load(src_fh)

with open("data/joined_freq_trg", 'rb') as trg_fh:
    freq_trg = pickle.load(trg_fh)

with open("data/joined_freq_pairs", 'rb') as pairs_fh:
    freq_pairs = pickle.load(pairs_fh)

# English is the target side (freq_trg) and German the source side (freq_src),
# matching the "#unique en/de phrases" summary printed after extraction.
print("#en phrases: %d"%(len(freq_trg)))
print("#de phrases: %d"%(len(freq_src)))
print("#(en,de) phrases: %d"%(len(freq_pairs)))

In [None]:
# import operator
# sorted_freq_src = sorted(freq_src.items(), key=operator.itemgetter(1), reverse = True)
# sorted_freq_trg = sorted(freq_trg.items(), key=operator.itemgetter(1), reverse = True)
# sorted_freq_pairs = sorted(freq_pairs.items(), key=operator.itemgetter(1), reverse = True)

## Calculate p(e|f) and p(f|e)

In [91]:
# Relative-frequency phrase translation probabilities.  Every freq_pairs value
# is rebuilt as [count, alignments, p_src_trg, p_trg_src] with
#   p_src_trg = count(trg, src) / count(trg)
#   p_trg_src = count(trg, src) / count(src)
# The dict is iterated by key instead of via the Python-2-only iteritems();
# only values of existing keys are replaced, so its size never changes while
# it is being iterated.
for trg_src_pair in freq_pairs:
    value = freq_pairs[trg_src_pair]
    freq = value[0]

    p_src_trg = 1.0 * freq / freq_trg[trg_src_pair[0]]
    p_trg_src = 1.0 * freq / freq_src[trg_src_pair[1]]

    freq_pairs[trg_src_pair] = [freq, value[1], p_src_trg, p_trg_src]
    

In [92]:
# Show the first 10 entries (dict iteration order).  islice avoids building
# the full key list and, unlike keys()[0:10], also works on Python 3.
for key in itertools.islice(freq_pairs, 10):
    print(key, freq_pairs[key])

('do we believe', 'wir') [1, set([((1, 0),)]), 0.2, 6.532105297537396e-05]
(', mr langen ,', ', langen ,') [1, set([((0, 0), (2, 1), (3, 2))]), 0.2, 1.0]
('parliament and in the council', 'parlament als auch im rat') [1, set([((0, 0), (1, 1), (1, 2), (2, 3), (3, 3), (4, 4))]), 1.0, 0.5]
('to raise', 'zu einem thema zu machen') [1, set([((0, 3), (1, 2))]), 0.019230769230769232, 1.0]
('the various stages of close', 'auf die verschiedenen etappen verlangen') [1, set([((0, 1), (1, 2), (2, 3))]), 0.2, 0.3333333333333333]
('mr kouchner', 'da\xc3\x9f herrn kouchner') [1, set([((0, 1), (1, 0), (1, 2))]), 0.041666666666666664, 1.0]
('you know very well', 'bekanntlich ergibt sich') [1, set([((0, 0), (1, 0), (2, 1), (3, 0))]), 0.25, 1.0]
('article 87 ( 2 )', 'artikel 87 absatz 2') [2, set([((0, 0), (1, 1), (2, 2), (3, 3), (4, 2))]), 1.0, 0.6666666666666666]
('900', 'mittelzuweisungen ( 900') [1, set([((0, 0), (0, 1), (0, 2))]), 0.3333333333333333, 1.0]
('a newspaper article which appeared', 'in')

### KMO

In [84]:
# c = 1
# start_time = time.time()

# for trg_src_pair, value in freq_pairs.iteritems():
#     pair_freq = value[0]
#     trg_words = trg_src_pair[0].split()
#     src_words = trg_src_pair[1].split()
    
#     possible_alignments = value[1]
    
#     lex = -1;
    
#     for a in possible_alignments:
    
#         mat = len(trg_words) * [None]

#         for (trg_pos, src_pos) in a:

#             src_word = src_words[src_pos]
#             trg_word = trg_words[trg_pos]

#             if mat[trg_pos] is None:
#                 mat[trg_pos] = []

#             if (trg_word, src_word) in freq_pairs:
#                 w = freq_pairs[(trg_word, src_word)][3] # w(e|f)

#                 mat[trg_pos].append(w)

#             else:
#                 mat[trg_pos].append(0)

#         for i in range(len(mat)):
#             if mat[i] is None:
#                 mat[i] = 1
#             else:
#                 mat[i] = numpy.mean(mat[i])

#     #     lex = numpy.sum(numpy.log(mat))
#         lex = max(lex, numpy.product(mat))
    
#     freq_pairs[trg_src_pair].append(lex)
        
#     if  c % (len(freq_pairs) / 100) == 0: 
#         print(' \r%d / %d (%d%%) (%ds)'%(c,len(freq_pairs), 100 * c / len(freq_pairs),time.time() - start_time), end = '')
#     c +=1
        
# print('\nDone!!')
    

2691900 / 2691953 (99%) (109s)\Done!!                                                                                       


In [85]:
# Show the first 10 entries (dict iteration order).  islice avoids building
# the full key list and, unlike keys()[0:10], also works on Python 3.
for key in itertools.islice(freq_pairs, 10):
    print(key, freq_pairs[key])

('do we believe', 'wir') [1, set([((1, 0),)]), 0.2, 6.532105297537396e-05, 0.67777124567248026]
(', mr langen ,', ', langen ,') [1, set([((0, 0), (2, 1), (3, 2))]), 0.2, 1.0, 0.055294363579737778]
('parliament and in the council', 'parlament als auch im rat') [1, set([((0, 0), (1, 1), (1, 2), (2, 3), (3, 3), (4, 4))]), 1.0, 0.5, 0.00032486683238314493]
('to raise', 'zu einem thema zu machen') [1, set([((0, 3), (1, 2))]), 0.019230769230769232, 1.0, 0.0019659398530738762]
('the various stages of close', 'auf die verschiedenen etappen verlangen') [1, set([((0, 1), (1, 2), (2, 3))]), 0.2, 0.3333333333333333, 0.066741572848568728]
('mr kouchner', 'da\xc3\x9f herrn kouchner') [1, set([((0, 1), (1, 0), (1, 2))]), 0.041666666666666664, 1.0, 0.19401330376940132]
('you know very well', 'bekanntlich ergibt sich') [1, set([((0, 0), (1, 0), (2, 1), (3, 0))]), 0.25, 1.0, 0.0]
('article 87 ( 2 )', 'artikel 87 absatz 2') [2, set([((0, 0), (1, 1), (2, 2), (3, 3), (4, 2))]), 1.0, 0.6666666666666666, 0.0

In [93]:
def KMO(src_words, trg_words, possible_alignments, w_trg_src):
    """Lexical weight of a target phrase given a source phrase.

    For one alignment, every target word contributes the mean of
    ``w_trg_src[(trg_word, src_word)]`` over the source words it is aligned
    to (a word pair missing from the table counts as 0) and an unaligned
    target word contributes 1.  The weight of the alignment is the product of
    these factors; the result is the maximum over all candidate alignments.

    NOTE(review): the name presumably refers to the lexical weighting of
    Koehn, Och & Marcu -- confirm.

    Args:
        src_words: list of source tokens.
        trg_words: list of target tokens.
        possible_alignments: iterable of alignments, each an iterable of
            ``(trg_pos, src_pos)`` index pairs into the two token lists.
        w_trg_src: dict mapping ``(trg_word, src_word)`` to a word weight.

    Returns:
        The best lexical weight, or ``-1`` if `possible_alignments` is empty.
    """
    lex = -1

    for a in possible_alignments:
        # weights[t] collects the word weights of all source words aligned to
        # target position t; it stays empty for unaligned target words.
        weights = [[] for _ in trg_words]
        for (trg_pos, src_pos) in a:
            word_pair = (trg_words[trg_pos], src_words[src_pos])
            weights[trg_pos].append(w_trg_src.get(word_pair, 0))

        factors = [numpy.mean(w) if w else 1 for w in weights]

        # numpy.prod replaces numpy.product, which was removed in NumPy 2.0.
        lex = max(lex, numpy.prod(factors))

    return lex


# Lexical weights for every phrase pair, in both directions.  Afterwards each
# freq_pairs value is
#   [count, alignments, p_src_trg, p_trg_src, lex_src_trg, lex_trg_src].
start_time = time.time()

# Print progress about once per percent; the floor of 1 keeps the modulo
# valid for tables with fewer than 100 entries.
progress_step = max(1, len(freq_pairs) // 100)

# Iterate by key (works on Python 2 and 3, unlike iteritems()).  Only the
# value lists are modified in place; the key set never changes.
for c, trg_src_pair in enumerate(freq_pairs, 1):
    value = freq_pairs[trg_src_pair]

    trg_words = trg_src_pair[0].split()
    src_words = trg_src_pair[1].split()
    possible_alignments = value[1]

    # Word weights for the aligned word pairs of this phrase pair, looked up
    # in the single-word entries of freq_pairs:
    #   w_trg_src[(trg_word, src_word)] = p_trg_src of the word pair (index 3)
    #   w_src_trg[(src_word, trg_word)] = p_src_trg of the word pair (index 2)
    w_trg_src = {}
    w_src_trg = {}
    for a in possible_alignments:
        for (trg_pos, src_pos) in a:
            pair = (trg_words[trg_pos], src_words[src_pos])
            if pair in freq_pairs:
                w_trg_src[pair] = freq_pairs[pair][3]
                w_src_trg[(pair[1], pair[0])] = freq_pairs[pair][2]

    # calculate p(e|f)
    lex_trg_src = KMO(src_words, trg_words, possible_alignments, w_trg_src)

    # calculate p(f|e): the same computation with the roles of source and
    # target exchanged, so the alignments are reversed from (trg, src) to
    # (src, trg) and the word lists are passed in swapped order.
    possible_alignments2 = set(
        tuple((al[1], al[0]) for al in a) for a in possible_alignments
    )
    lex_src_trg = KMO(trg_words, src_words, possible_alignments2, w_src_trg)

    # Slice assignment instead of two append() calls: re-running this cell
    # overwrites the previous weights rather than growing the list.
    value[4:] = [lex_src_trg, lex_trg_src]

    if c % progress_step == 0:
        print(' \r%d / %d (%d%%) (%ds)'%(c,len(freq_pairs), 100 * c // len(freq_pairs),time.time() - start_time), end = '')



print('\nDone!!')
    

2691900 / 2691953 (99%) (216s)                                                                                              
Done!!


In [94]:
# Show the first 10 entries (dict iteration order).  islice avoids building
# the full key list and, unlike keys()[0:10], also works on Python 3.
for key in itertools.islice(freq_pairs, 10):
    print(key, freq_pairs[key])

('do we believe', 'wir') [1, set([((1, 0),)]), 0.2, 6.532105297537396e-05, 0.60809939635468557, 0.67777124567248026]
(', mr langen ,', ', langen ,') [1, set([((0, 0), (2, 1), (3, 2))]), 0.2, 1.0, 0.27708848798290681, 0.055294363579737778]
('parliament and in the council', 'parlament als auch im rat') [1, set([((0, 0), (1, 1), (1, 2), (2, 3), (3, 3), (4, 4))]), 1.0, 0.5, 9.0464324715546004e-08, 0.00032486683238314493]
('to raise', 'zu einem thema zu machen') [1, set([((0, 3), (1, 2))]), 0.019230769230769232, 1.0, 0.0066933871793610801, 0.0019659398530738762]
('the various stages of close', 'auf die verschiedenen etappen verlangen') [1, set([((0, 1), (1, 2), (2, 3))]), 0.2, 0.3333333333333333, 0.011257902576663648, 0.066741572848568728]
('mr kouchner', 'da\xc3\x9f herrn kouchner') [1, set([((0, 1), (1, 0), (1, 2))]), 0.041666666666666664, 1.0, 0.0, 0.19401330376940132]
('you know very well', 'bekanntlich ergibt sich') [1, set([((0, 0), (1, 0), (2, 1), (3, 0))]), 0.25, 1.0, 6.036764154177

In [97]:
# Inspect the single-word pair ('we', 'wir').  At this point a value is
# [count, alignments, p_src_trg, p_trg_src, lex_src_trg, lex_trg_src].
freq_pairs[('we', 'wir')]

[10376,
 {((0, 0),)},
 0.6080993963546856,
 0.6777712456724803,
 0.60809939635468557,
 0.67777124567248026]

In [2]:

# Saving is disabled; uncomment to write the tables back to disk.
# NOTE(review): the save path below is "freq_pair" while the load further
# down reads "freq_pairs" -- confirm which file name is intended.
# with open("data/complete/freq_pair", 'wb') as file:
#     pickle.dump(freq_pairs,file)

# with open("data/complete/freq_src", 'wb') as file:
#     pickle.dump(freq_src,file)

# with open("data/complete/freq_trg", 'wb') as file:
#     pickle.dump(freq_trg,file)


# Load the three tables from data/complete.
with open("data/complete/freq_pairs", 'rb') as pairs_fh:
    freq_pairs = pickle.load(pairs_fh)

with open("data/complete/freq_src", 'rb') as src_fh:
    freq_src = pickle.load(src_fh)

with open("data/complete/freq_trg", 'rb') as trg_fh:
    freq_trg = pickle.load(trg_fh)
    

In [3]:
# Look up the word pair ('you', 'bekanntlich') in the reloaded table.
# NOTE(review): presumably part of a hand-check of the lexical weight of
# ('you know very well', 'bekanntlich ergibt sich') -- confirm.
freq_pairs[('you','bekanntlich')]

[2,
 {((0, 0),)},
 0.0006038647342995169,
 0.009523809523809525,
 0.00060386473429951688,
 0.0095238095238095247]

In [4]:
# Look up the word pair ('know', 'bekanntlich') in the reloaded table.
freq_pairs[('know','bekanntlich')]

[2,
 {((0, 0),)},
 0.0024875621890547263,
 0.009523809523809525,
 0.0024875621890547263,
 0.0095238095238095247]

In [5]:
# This word pair is not in the table, so the lookup raises KeyError.  KMO
# treats a missing word pair as weight 0.
freq_pairs[('well','bekanntlich')]

KeyError: ('well', 'bekanntlich')

In [6]:
# Look up the word pair ('very', 'ergibt') in the reloaded table.
freq_pairs[('very','ergibt')]

[1,
 {((0, 0),)},
 0.0005858230814294083,
 0.03571428571428571,
 0.00058582308142940832,
 0.035714285714285712]

In [106]:
 # Manual recomputation from the index-2 values displayed above:
 # ('very', 'ergibt') times the mean over three words aligned to 'bekanntlich'
 # -- ('you', ...), ('know', ...) and the missing ('well', ...) counted as 0.
 0.0005858230814294083 * ( 0.0006038647342995169 +  0.0024875621890547263) / 3

6.036764154177393e-07

In [105]:
# Float literal echoed to compare its repr with the manual result.
# NOTE(review): presumably copied from the stored lexical weight of
# ('you know very well', 'bekanntlich ergibt sich') -- confirm.
6.0367641541773932e-07

6.036764154177393e-07