In [1]:
from tqdm import tqdm

In [2]:
# Read english sentences
with open('corpus.en', 'r') as f:
    english_corpus = f.readlines()
    english_corpus = [i[:-1] for i in english_corpus]
    english_corpus = [i.split(' ') for i in english_corpus]
    
# Read Spanish sentences
with open('corpus.es', 'r') as f:
    spanish_corpus = f.readlines()
    spanish_corpus = [i[:-1] for i in spanish_corpus]
    spanish_corpus = [i.split(' ') for i in spanish_corpus]

In [3]:
# Initialize parameters

# english_vacab
english_vacab = {}
for sent in tqdm(english_corpus):
    for word in sent:
        english_vacab[word] = 0
english_vacab['NULL'] = 0

        
# get the counts n(e)
spanish_vocab = []
for word in tqdm(english_vacab):
    c = []
    for idx in range(len(english_corpus)):
        if word in english_corpus[idx]:
            c += spanish_corpus[idx]
    spanish_vocab += c
    english_vacab[word] = len(set(c))
english_vacab['NULL'] = len(set(spanish_vocab))
spanish_vocab = set(spanish_vocab)

# compute t and cef
cef = {}
for e_word in tqdm(english_vacab):
    temp1 = {}
    for f_word in spanish_vocab:
        temp1[f_word] = 0
    cef[e_word] = temp1
    

# compute ce
ce = {}
for e_word in tqdm(english_vacab):
    ce[e_word] = 0

100%|██████████| 5401/5401 [00:00<00:00, 227794.06it/s]
100%|██████████| 8797/8797 [00:18<00:00, 468.13it/s]
100%|██████████| 8797/8797 [00:16<00:00, 548.45it/s]
100%|██████████| 8797/8797 [00:00<00:00, 2289624.09it/s]


In [4]:
import json
with open('t_dump', 'r') as day:
    t = json.load(day)

In [5]:
for i in range(len(english_corpus)):
    english_corpus[i] = ['NULL'] + english_corpus[i]

In [6]:
# compute l-(j)-english and m(i)
# q[l][m][(i, j)]

q = {}
c_jilm = {}
c_ilm = {}

for idx in range(len(english_corpus)):

    l = len(english_corpus[idx])
    m = len(spanish_corpus[idx])
    
    
    if l not in q:
        q[l] = dict()
        if m not in q[l]:
            q[l][m] = dict()
    else:
        if m not in q[l]:
            q[l][m] = dict()
        
    if l not in c_jilm:
        c_jilm[l] = dict()
        if m not in c_jilm[l]:
            c_jilm[l][m] = dict()
    else:
        if m not in c_jilm[l]:
            c_jilm[l][m] = dict()
    
    if l not in c_ilm:
        c_ilm[l] = dict()
        if m not in c_ilm[l]:
            c_ilm[l][m] = dict()
    else:
        if m not in c_ilm[l]:
            c_ilm[l][m] = dict()
        
    
    for i in range(m):
        for j in range(l):
            if (i, j) not in q[l][m]:
                q[l][m][(i, j)] = 1/(l+1)
            if (i, j) not in c_jilm[l][m]:
                c_jilm[l][m][(i, j)] = 0 
        if i not in c_ilm[l][m]:
            c_ilm[l][m][i] = 0


In [7]:
# Initialization for IBM model2

for iteration in tqdm(range(5)):
    
    # set counts to zero
    for q1 in cef:
        for q2 in cef[q1]:
            cef[q1][q2] = 0
            
    for q1 in ce:
        ce[q1] = 0
        
    for ll in c_jilm:
        for mm in c_jilm[ll]:
            for ij in c_jilm[ll][mm]:
                c_jilm[ll][mm][ij] = 0
                
    for ll in c_ilm:
        for mm in c_ilm[ll]:
            for ij in c_ilm[ll][mm]:
                c_ilm[ll][mm][ij] = 0
                
    
    # iterate over corpus
    for k in range(len(english_corpus)):
        m = len(spanish_corpus[k])
        l = len(english_corpus[k])
        for i in range(len(spanish_corpus[k])):
            
            denom = sum([t[english_corpus[k][j]][spanish_corpus[k][i]]*q[l][m][(i, j)] for j in range(len(english_corpus[k]))])
            
            for j in range(len(english_corpus[k])):
                
                # Compute delta
                num = t[english_corpus[k][j]][spanish_corpus[k][i]] * q[l][m][(i, j)]
                delta = num/denom
                
                # Update
                cef[english_corpus[k][j]][spanish_corpus[k][i]] += delta
                ce[english_corpus[k][j]] += delta
                c_jilm[l][m][(i, j)] += delta
                c_ilm[l][m][i] += delta
            
                        
                
    # evelauate t
    for e_word in english_vacab:
        for f_word in spanish_vocab:
            t[e_word][f_word] = cef[e_word][f_word]/ce[e_word]

    for l in q:
        for m in q[l]:
            for ij in q[l][m]:
                q[l][m][ij] = c_jilm[l][m][ij]/c_ilm[l][m][ij[0]]

100%|██████████| 5/5 [23:24<00:00, 277.93s/it]


In [8]:
import json
with open('t_model2_dump', 'w') as f:
    json.dump(t, f)

In [1]:
import json
with open('t_model2_dump', 'r') as day:
    t = json.load(day)

In [2]:
t['i']

{'': 1.6036001753460843e-155,
 'someteré': 1.784037285255605e-21,
 'hatzidakis': 2.169073198850488e-34,
 'detalle': 1.4529985049858624e-27,
 'reclamar': 1.2306523079944066e-208,
 'delegar': 0.0,
 'naciones': 4.2427401381115735e-29,
 'nielson': 9.519107074840285e-68,
 'vascas': 5.51187698629715e-43,
 'salvados': 0.0,
 'excusarse': 3.1550358939130784e-190,
 'terminaré': 2.1456293772507464e-20,
 'cortapisas': 0.0,
 'aquellas': 5.492471778994327e-22,
 'carteles': 9.084328946186162e-30,
 'liberalizar': 0.0,
 'sustituidas': 0.0,
 'surgir': 0.0,
 'apuntala': 1.8758344995494817e-37,
 'titulares': 9.285639087528742e-30,
 'pecio': 0.0,
 'sancionados': 0.0,
 'dedicarse': 0.0,
 'conmocionado': 0.0,
 'koch': 1.2745822188370715e-31,
 'balance': 1.6602146770288067e-150,
 'nutrición': 0.0,
 'durísimas': 0.0,
 'finales': 1.4454186153805824e-25,
 'somete': 3.889461425992208e-24,
 'situaremos': 0.0,
 'metros': 0.0,
 'cordialmente': 7.958755202692645e-26,
 'parlamentarias': 0.0,
 'debe': 1.539449392527278

In [23]:
# Read Spanish dev sentences
with open('dev.es', 'r') as f:
    spanish_dev_corpus = f.readlines()
    spanish_dev_corpus = [i[:-1] for i in spanish_dev_corpus]
    spanish_dev_corpus = [i.split(' ') for i in spanish_dev_corpus]
    
# Read Spanish dev sentences
with open('dev.en', 'r') as f:
    english_dev_corpus = f.readlines()
    english_dev_corpus = [i[:-1] for i in english_dev_corpus]
    english_dev_corpus = [i.split(' ') for i in english_dev_corpus]

In [24]:
for i in range(len(english_dev_corpus)):
    english_dev_corpus[i] = ['NULL'] + english_dev_corpus[i]

In [25]:
# manipulate 'NULL' in t
for s_idx in tqdm(range(len(english_corpus))):
    for f_word in spanish_vocab:
        t['NULL'][f_word] = 0
        
for e_sent, f_sent in zip(english_dev_corpus, spanish_dev_corpus):
    m = len(f_sent)
    l = len(e_sent)
    for j, ew in enumerate(e_sent):
        for i, fw in enumerate(f_sent):
            if ew == 'NULL':
                q[l][m][(i,j)] = 0


In [26]:
result = []
for s_idx, sent in enumerate(spanish_dev_corpus):
    m = len(spanish_dev_corpus[s_idx])
    l = len(english_dev_corpus[s_idx])
    for fw_idx, f_word in enumerate(spanish_dev_corpus[s_idx]):
        best = 0
        index = 0
        for ew_idx, e_word in enumerate(english_dev_corpus[s_idx]):
            if q[l][m][(fw_idx, ew_idx)]*t[e_word][f_word] > best:
                best = q[l][m][(fw_idx, ew_idx)]*t[e_word][f_word]
                index = ew_idx
                
        result.append(str(s_idx+1)+" "+str(index+1)+" "+str(fw_idx+1)) 

In [27]:
with open('dev_result2.key', 'w') as f:
    for item in result:
        f.write("%s\n" % item)

In [28]:
!python3 eval_alignment.py dev.key dev_result2.key

      Type       Total   Precision      Recall     F1-Score
     total        5920     0.025        0.026        0.026
