# Project 1

### Initialise
---

#### import needed module

In [1]:
import numpy as np
from collections import Counter
from collections import defaultdict
from ipywidgets import *
from tqdm import tqdm_notebook, tqdm
from aer import *
import pickle
from copy import deepcopy
import os
from enum import Enum
from scipy.special import digamma
from random import random

import mmap

#### create supporting functions
---

In [2]:
# from: https://blog.nelsonliu.me/2016/07/29/progress-bars-for-python-file-reading-with-tqdm/
def get_num_lines(file_path):
    fp = open(file_path, "r+")
    buf = mmap.mmap(fp.fileno(), 0)
    lines = 0
    while buf.readline():
        lines += 1
    return lines

file_enc='utf-8'

In [3]:
def create_alignments_modelI(t, file_f, file_e, target, file_enc='utf-8'):
    # open file to write to
    with open(target,'w',encoding=file_enc) as tar:
        # for each sentence in list
        with open(file_f, encoding=file_enc) as ffil, open(file_e, encoding=file_enc) as efil:
            for line_num, (line_f,line_e) in enumerate(zip(ffil,efil)):
                f_sentence = line_f.split()
                e_sentence = line_e.split()
                # for each word in sentence, find the best alignment
                for ind_f,f in enumerate(f_sentence):
                    ind_f += 1 #0 is reserved for null
                    max_ind_e = 0 #when no alignment is found, align to zero
                    max_p = 0
                    for ind_e,e in enumerate(e_sentence):
                        ind_e += 1 #0 is reserved for null
                        if (f,e) in t:
                            if t[(f,e)] > max_p:
                                max_p = t[(f,e)]
                                max_ind_e = ind_e
                    # write to file. Output: sentence_line english_pos french_pos probability
                    tar.write('%d %d %d P %f\n'%(line_num,max_ind_e,ind_f,max_p)) 

In [4]:
def create_alignments_modelII(t, q, file_f, file_e, target, file_enc='utf-8'):
    # open file to write to
    with open(target,'w',encoding=file_enc) as tar:
        # for each sentence in list
        with open(file_f, encoding=file_enc) as ffil, open(file_e, encoding=file_enc) as efil:
            for line_num, (line_f,line_e) in enumerate(zip(ffil,efil)):
                f_sentence = line_f.split()
                e_sentence = line_e.split()
                
                # Get lengths
                l = len(e_sentence)
                m = len(f_sentence)
            
                # for each word position in sentence, find the best alignment
                for i in range(0, m): # french
                    max_p = 0
                    max_ind = 0 #when no alignment is found, align to zero
                    for j in range(0, l+1): # english
                        j-= 1
                        if j == -1:
                            e = '<NULL>'
                        else:    
                            e = e_sentence[j]
                        f = f_sentence[i]
                        
                        if (f,e) in t and (j,i,l,m) in q:
                            p = q[(j,i,l,m)]*t[(f,e)]

                            if p > max_p:
                                max_p = p
                                max_ind = j+1
                    
                    # write to file. Output: sentence_line english_pos french_pos probability
                    tar.write('%d %d %d P %f\n'%(line_num,max_ind,i+1,max_p)) 
                        

In [5]:
def create_pairs(file_f,file_e,null='<NULL>',file_enc='utf-8'):
    fe_pairs = dict()
    with open(file_f,encoding=file_enc) as f, open(file_e,encoding=file_enc) as e:
        for line_f, line_e in tqdm(zip(f,e),total=get_num_lines(file_f)):
            for word_f in line_f.split():
                fe_pairs[(word_f, null)] = 1
                for word_e in line_e.split():
                    fe_pairs[(word_f, word_e)] = 1
    return fe_pairs

In [6]:
def calculate_perplexity(t,file_f,file_e,null='<NULL>',file_enc='utf-8'):
    perplexity = 0.0
    with open(file_f,encoding=file_enc) as f, open(file_e,encoding=file_enc) as e:
        for line_f, line_e in tqdm(zip(f,e),total=get_num_lines(file_f)):
            sentence_f = line_f.split()
            sentence_e = line_e.split()
            sentence_e = [null] + sentence_e
            l = len(sentence_e)
            for f in sentence_f:
                tmp = 0.0
                for e in sentence_e:
                    t_fe = 0
                    if (f,e) in t:
                        t_fe = t[(f,e)]
                    tmp += t_fe/l
                perplexity += np.log(tmp)
    return perplexity

In [7]:
class IBM_model(Enum):
    I = 1
    II = 2
    
class Initialization_type(Enum):
    uniform_one = 1
    uniform_other = 2
    random = 3
    modelI = 4

# IBM 1
---

In [8]:
# Train
'''
E-step:
    for each word j in french sentence:
        the probability of fj|ei divided by (for t=0>m: fj|et)
        
M-step:
    E[fe]/E[e]
'''
def em_step_modelI(t, file_f, file_e, use_VB, alpha, file_enc='utf-8'):
    num_lines = get_num_lines(file_f)
    
    # Set to zero
    cooccurrences = defaultdict(float) # number of times words e and f happen together
    total_f = defaultdict(float) # number of times word f happens
    total_e = defaultdict(float) # number of times word e happens
    
    with open(file_f,encoding=file_enc) as ffil, open(file_e,encoding=file_enc) as efil:
        for line_f, line_e in tqdm(zip(ffil,efil),total=num_lines):
            f_sentence = line_f.split()
            e_sentence = line_e.split()
            for e in e_sentence:
                total_e[e] = 0
                for f in f_sentence:
                    total_e[e] += t[(f,e)]

            for e in e_sentence:
                for f in f_sentence:
                    temp = t[(f,e)] / total_e[e]
                    cooccurrences[(f,e)] += temp
                    total_f[f] += temp

    for f,e in tqdm(cooccurrences.keys()):
        if use_VB:
            t[(f,e)] = digamma(cooccurrences[(f,e)] + alpha) / digamma(total_f[f] + alpha)
        else:
            t[(f,e)] = cooccurrences[(f,e)] / total_f[f]
        
    return t

In [9]:
def em_step_modelII(t, q, file_f, file_e, file_enc='utf-8'):
    # Set to zero
    counts_e_f = defaultdict(float) # number of times words e and f happen together
    counts_e = defaultdict(float) # number of times word e happens
    counts_j_i = defaultdict(float) # number of times j (English) and i (French) align
    counts_i = defaultdict(float) # number of i alignments
    
    num_lines = get_num_lines(file_f)
    with open(file_f,encoding=file_enc) as ffil, open(file_e,encoding=file_enc) as efil:
        for line_f, line_e in tqdm(zip(ffil,efil),total=num_lines):
            f_sentence = line_f.split()
            e_sentence = line_e.split()
            
            # Get lengths
            l = len(e_sentence)
            m = len(f_sentence)
            
            for i in range(0, m): # french
                norm = -1
                for j in range(0, l+1): # english
                    j-= 1
                    if j == -1:
                        e = '<NULL>'
                    else:    
                        e = e_sentence[j]
                    f = f_sentence[i]
                    
                    # Compute only once per i
                    if norm == -1:
                        norm = sum(q[(x-1,i,l,m)]*t[(f,e)] for x in range(0, l+1))
                    
                    assert norm != 0, 'norm is zero. i: {}, l:{}, m:{}'.format(i,l,m)
                    delta = q[(j,i,l,m)]*t[(f,e)]/norm
                    
                    counts_e_f[(e,f)] += delta
                    counts_e[e] += delta
                    counts_j_i[(j,i,l,m)] += delta
                    counts_i[(i,l,m)] += delta
        
        for e,f in tqdm(counts_e_f.keys()):
            assert counts_e[e] != 0, 'counts_e[{}] is zero'.format(e)
            t[(f,e)] = counts_e_f[(e,f)] / counts_e[e]
        
        for j,i,l,m in tqdm(counts_j_i.keys()):
            assert counts_i[(i,l,m)] != 0, 'counts_i[({},{},{})] is zero'.format(i,l,m)
            q[(j,i,l,m)] = counts_j_i[(j,i,l,m)] / counts_i[(i,l,m)]
    
    return t, q

In [10]:
def init_params(model, initial_method, pairs, train_file_f=None, train_file_e=None, t=None):
    # Returns:
    # t[(f,e)] (model I and II)
    # q[(j,i,l,m)] (model II only)
    
    if model == IBM_model.I:
        if initial_method == Initialization_type.uniform_one:
            # All are equally likely at the beginning, prob at one
            t = dict(zip(pairs,[1]*len(pairs)))
        elif initial_method == Initialization_type.uniform_other:
            e_vocab_size = sum(1 for k,v in pairs if v != '<NULL>')
            t = dict(zip(pairs,[1.0/e_vocab_size]*len(pairs)))
        else:
            assert True, 'Unsupported initalization method {} for IBM model I'.format(initial_method)
        
        return t
    else:  
        if initial_method == Initialization_type.uniform_other:
            e_vocab_size = sum(1 for k,v in pairs if v != '<NULL>')
            t = dict(zip(pairs,[1.0/e_vocab_size]*len(pairs)))
        elif initial_method == Initialization_type.random:
            t = dict(zip(pairs,[random() for x in range(len(pairs))]))
        elif initial_method == Initialization_type.modelI and t == None:
            # Initialize t from model I output 10 iterations
            t,_,_,_ = em_algorithm(model=IBM_model.I,max_epoch=10,initial_method=initial_method,save_pickles=False)
        else:
            assert True, 'Unsupported initalization method {} for IBM model II'.format(initial_method)
        
        # Randomly initialize q
        q = {}
        with open(train_file_f,encoding=file_enc) as f, open(train_file_e,encoding=file_enc) as e:
            for line_f, line_e in tqdm(zip(f,e),total=get_num_lines(train_file_f)):
                f_sentence = line_f.split()
                e_sentence = line_e.split()
            
                # Get lengths
                l = len(e_sentence)
                m = len(f_sentence)

                for i in range(0, m): # french
                    for j in range(0, l+1): # english
                        j-= 1
                        q[(j,i,l,m)] = random()
        
        return t,q

In [11]:
def em_algorithm(model,
                 t=None, #Only used for model II
                 max_epoch=10, 
                 threshold=0.01,
                 initial_method=Initialization_type.uniform_one, #How to initialize t
                 terminate_method='aer',
                 train_file_f='data/training/hansards.36.2.f',
                 train_file_e='data/training/hansards.36.2.e',
                 validation_file_f='data/validation/dev.f',
                 validation_file_e='data/validation/dev.e',
                 validation_truth='data/validation/dev.wa.nonullalign',
                 pickles_path='data/pickles/',
                 align_path='data/alignments/',
                 save_prefix='',
                 save_pickles=True,
                 use_VB=False,
                 alpha=0.1, #Only used if VB is used
                 file_enc='utf-8'):
    
    # test if prefix exists and correct format
    if save_prefix != '' and save_prefix[-1]!='_':
        save_prefix+='_'
    
    # get word pairs from corpus
    pairs = create_pairs(train_file_f, train_file_e,file_enc=file_enc)
    
    #initialize parameters
    if model == IBM_model.I:
        t = init_params(model, initial_method, pairs)
    else:
        t, q = init_params(model, initial_method, pairs, train_file_f, train_file_e, t)
    
    # calculate initial scores before training
    align_file = os.path.join(align_path,'{0}validation_epoch{1}.align'.format(save_prefix,0))
    if model == IBM_model.I:
        create_alignments_modelI(t,
                                  validation_file_f,
                                  validation_file_e,
                                  align_file,
                                  file_enc=file_enc)
    else:
        create_alignments_modelII(t,
                                  q,
                                  validation_file_f,
                                  validation_file_e,
                                  align_file,
                                  file_enc=file_enc)

    aer = test(validation_truth, align_file)
    train_perplexity = calculate_perplexity(t,train_file_f,train_file_e,file_enc=file_enc)
    val_perplexity = calculate_perplexity(t,train_file_f,train_file_e,file_enc=file_enc)
    
    aers = [aer]
    train_perplexities = [train_perplexity]
    val_perplexities = [val_perplexity]
    #print train result
    print('INITIAL RESULTS:\n============\n AER:\n\t validation:\t{0}\n PERPLEXITY:\n\t train:\t\t{1}\n\t validation:\t{2}'.format(aer, train_perplexity, val_perplexity))
        
    # loop for max_epochs or till convergence is reached
    for epoch in range(1,max_epoch+1):
        print("start epoch: "+str(epoch))
        
        # do an EM step
        if model == IBM_model.I:
            t = em_step_modelI(t, train_file_f, train_file_e, use_VB, alpha, file_enc=file_enc)
        else:
            t, q = em_step_modelII(t, q, train_file_f, train_file_e, file_enc=file_enc)
        
        # create AER results
        align_file = os.path.join(align_path,'{0}validation_epoch{1}.align'.format(save_prefix,epoch))
        if model == IBM_model.I:
            create_alignments_modelI(t,
                                      validation_file_f,
                                      validation_file_e,
                                      align_file,
                                      file_enc=file_enc)
        else:
            create_alignments_modelII(t,
                                      q,
                                      validation_file_f,
                                      validation_file_e,
                                      align_file,
                                      file_enc=file_enc)
        
        aer = test(validation_truth, align_file)
        
        # calculate the loglikelihoods
        train_perplexity = calculate_perplexity(t,train_file_f,train_file_e,file_enc=file_enc)
        val_perplexity = calculate_perplexity(t,train_file_f,train_file_e,file_enc=file_enc)
        train_perplexities.append(train_perplexity)
        val_perplexities.append(val_perplexity)
        
        #print train result
        print('EPOCH {0}:\n============\n AER:\n\t validation:\t{1}\n PERPLEXITY:\n\t train:\t\t{2}\n\t validation:\t{3}'.format(epoch, aer, train_perplexity, val_perplexity))
        
        #store train progress
        aers.append(aer)
        if save_pickles:
            pickle.dump(t, open( os.path.join(pickles_path,'{0}t_epoch{1}.p'.format(save_prefix,epoch)), "wb" ))
            if model == IBM_model.II:
                pickle.dump(q, open( os.path.join(pickles_path,'{0}q_epoch{1}.p'.format(save_prefix,epoch)), "wb" ))
        
        #test for convergence
        if terminate_method == 'aer':
            if (len(aers) > 2) and (abs(aers[-2]-aer) < threshold):
                print('Reached Convergence!')
                break
    
    if model == IBM_model.I:
        return t,aers,train_perplexities,val_perplexities
    else:
        return t,q,aers,train_perplexities,val_perplexities

---
# RUNNING THE SCRIPT

### RUNS BY DIANA
---

In [15]:
# Run model I
t, aers, train_perplexities, val_perplexities = em_algorithm(model=IBM_model.I, max_epoch=1, save_prefix='test_model1')

In [12]:
t = pickle.load(open( "data/pickles/translation_probs_10_epochs.p", "rb" ) )
print(t['36','<NULL>'])
print(t['le', 'the'])

e_vocab_size = sum(1 for k,v in t.keys() if v != '<NULL>')
# print(e_vocab_size)

for k,v in t.items():
    if v == 1:
        t[k] = 1/e_vocab_size
    
print(t['36','<NULL>'])
print(t['le', 'the'])

1
0.5296435925241862
8.376959140376557e-08
0.5296435925241862


In [None]:
# Run model II
t, q, aers, train_perplexities, val_perplexities = em_algorithm(model=IBM_model.II, t=t, save_prefix='test_model2')

100%|█████████████████████████████████| 231164/231164 [04:01<00:00, 957.63it/s]
100%|████████████████████████████████| 231164/231164 [03:40<00:00, 1050.19it/s]
100%|█████████████████████████████████| 231164/231164 [07:30<00:00, 512.70it/s]
100%|█████████████████████████████████| 231164/231164 [11:41<00:00, 329.59it/s]


INITIAL RESULTS:
 AER:
	 validation:	0.3975448536355052
 PERPLEXITY:
	 train:		-18375501.415776275
	 validation:	-18375501.415776275
start epoch: 1


100%|████████████████████████████████| 231164/231164 [9:46:02<00:00,  6.57it/s]
100%|███████████████████████████| 11983927/11983927 [02:00<00:00, 99569.30it/s]
100%|███████████████████████████| 13484424/13484424 [04:03<00:00, 55314.28it/s]
100%|█████████████████████████████████| 231164/231164 [06:51<00:00, 562.42it/s]
100%|█████████████████████████████████| 231164/231164 [06:19<00:00, 609.40it/s]


EPOCH 1:
 AER:
	 validation:	0.2898961284230406
 PERPLEXITY:
	 train:		-21572445.43210116
	 validation:	-21572445.43210116
start epoch: 2


100%|█████████████████████████████████| 231164/231164 [36:02<00:00, 106.90it/s]
100%|██████████████████████████| 11983927/11983927 [01:32<00:00, 129387.98it/s]
100%|███████████████████████████| 13484424/13484424 [02:32<00:00, 88189.67it/s]
100%|█████████████████████████████████| 231164/231164 [06:27<00:00, 596.58it/s]
100%|█████████████████████████████████| 231164/231164 [06:32<00:00, 589.53it/s]


EPOCH 2:
 AER:
	 validation:	0.30594900849858353
 PERPLEXITY:
	 train:		-23493670.826701827
	 validation:	-23493670.826701827
start epoch: 3


100%|██████████████████████████████████| 231164/231164 [50:15<00:00, 76.67it/s]
100%|███████████████████████████| 13484424/13484424 [04:28<00:00, 50145.29it/s]
100%|█████████████████████████████████| 231164/231164 [08:39<00:00, 445.04it/s]
 27%|█████████▎                        | 63331/231164 [02:27<06:31, 428.58it/s]

### RUNS BY VICTOR
---

In [14]:
# Run model II
t, q, aers, train_perplexities, val_perplexities = em_algorithm(model=IBM_model.II, t=None, initial_method=Initialization_type.random, save_prefix='test_modelII')

100%|██████████| 231164/231164 [01:12<00:00, 3181.90it/s]
100%|██████████| 231164/231164 [01:15<00:00, 3051.54it/s]
100%|██████████| 231164/231164 [02:00<00:00, 1916.93it/s]
100%|██████████| 231164/231164 [02:00<00:00, 1922.11it/s]
  0%|          | 79/231164 [00:00<04:57, 777.82it/s]

INITIAL RESULTS:
 AER:
	 validation:	0.8762983947119924
 PERPLEXITY:
	 train:		-3266628.15805304
	 validation:	-3266628.15805304
start epoch: 1


100%|██████████| 231164/231164 [08:51<00:00, 435.21it/s]
  0%|          | 0/11983927 [00:00<?, ?it/s]

RuntimeError: dictionary changed size during iteration

In [None]:
t, q, aers, train_perplexities, val_perplexities = em_algorithm(model=IBM_model.II, t=t, save_prefix='test_modelII') # You have to input t from the output of model I